@heylemon/lemonade 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/dist/build-info.json +3 -3
  2. package/dist/canvas-host/a2ui/.bundle.hash +1 -1
  3. package/dist/gateway/skills-http.js +74 -19
  4. package/package.json +1 -1
  5. package/skills/docx/SKILL.md +25 -30
  6. package/skills/docx/scripts/accept_changes.py +0 -17
  7. package/skills/docx/scripts/comment.py +10 -39
  8. package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
  9. package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
  10. package/skills/docx/scripts/office/pack.py +0 -30
  11. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  12. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  13. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  14. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  15. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  16. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  17. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  18. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  19. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  20. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  21. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  22. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  23. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  24. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  25. package/skills/docx/scripts/office/soffice.py +0 -55
  26. package/skills/docx/scripts/office/unpack.py +5 -27
  27. package/skills/docx/scripts/office/validate.py +19 -14
  28. package/skills/docx/scripts/office/validators/base.py +48 -224
  29. package/skills/docx/scripts/office/validators/docx.py +44 -117
  30. package/skills/docx/scripts/office/validators/pptx.py +2 -42
  31. package/skills/docx/scripts/office/validators/redlining.py +3 -40
  32. package/skills/pdf/SKILL.md +22 -15
  33. package/skills/pdf/{FORMS.md → forms.md} +0 -14
  34. package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
  35. package/skills/pdf/scripts/check_fillable_fields.py +0 -1
  36. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
  37. package/skills/pdf/scripts/create_validation_image.py +0 -4
  38. package/skills/pdf/scripts/extract_form_field_info.py +1 -31
  39. package/skills/pdf/scripts/extract_form_structure.py +0 -9
  40. package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
  41. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
  42. package/skills/pptx/SKILL.md +2 -29
  43. package/skills/pptx/editing.md +2 -2
  44. package/skills/pptx/pptxgenjs.md +53 -8
  45. package/skills/pptx/scripts/add_slide.py +0 -30
  46. package/skills/pptx/scripts/clean.py +0 -23
  47. package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
  48. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
  49. package/skills/pptx/scripts/office/pack.py +0 -30
  50. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  51. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  52. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  53. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  54. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  55. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  56. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  57. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  58. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  59. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  60. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  61. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  62. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  63. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  64. package/skills/pptx/scripts/office/soffice.py +0 -55
  65. package/skills/pptx/scripts/office/unpack.py +5 -27
  66. package/skills/pptx/scripts/office/validate.py +19 -14
  67. package/skills/pptx/scripts/office/validators/base.py +48 -224
  68. package/skills/pptx/scripts/office/validators/docx.py +44 -117
  69. package/skills/pptx/scripts/office/validators/pptx.py +2 -42
  70. package/skills/pptx/scripts/office/validators/redlining.py +3 -40
  71. package/skills/pptx/scripts/thumbnail.py +0 -31
  72. package/skills/xlsx/SKILL.md +3 -26
  73. package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
  74. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
  75. package/skills/xlsx/scripts/office/pack.py +0 -30
  76. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  77. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  78. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  79. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  80. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  81. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  82. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  83. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  84. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  85. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  86. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  87. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  88. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  89. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  90. package/skills/xlsx/scripts/office/soffice.py +0 -55
  91. package/skills/xlsx/scripts/office/unpack.py +5 -27
  92. package/skills/xlsx/scripts/office/validate.py +19 -14
  93. package/skills/xlsx/scripts/office/validators/base.py +48 -224
  94. package/skills/xlsx/scripts/office/validators/docx.py +44 -117
  95. package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
  96. package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
  97. package/skills/xlsx/scripts/recalc.py +2 -26
  98. package/skills/docx/scripts/__init__.py +0 -1
  99. package/skills/docx/scripts/office/helpers/__init__.py +0 -0
  100. package/skills/docx/scripts/office/validators/__init__.py +0 -15
  101. package/skills/pptx/scripts/__init__.py +0 -0
  102. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  103. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  104. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  105. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  106. package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
@@ -14,100 +14,76 @@ from .base import BaseSchemaValidator
14
14
 
15
15
 
16
16
  class DOCXSchemaValidator(BaseSchemaValidator):
17
- """Validator for Word document XML files against XSD schemas."""
18
17
 
19
- # Word-specific namespaces
20
18
  WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
21
19
  W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
22
20
  W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"
23
21
 
24
- # Word-specific element to relationship type mappings
25
- # Start with empty mapping - add specific cases as we discover them
26
22
  ELEMENT_RELATIONSHIP_TYPES = {}
27
23
 
28
24
  def validate(self):
29
- """Run all validation checks and return True if all pass."""
30
- # Test 0: XML well-formedness
31
25
  if not self.validate_xml():
32
26
  return False
33
27
 
34
- # Test 1: Namespace declarations
35
28
  all_valid = True
36
29
  if not self.validate_namespaces():
37
30
  all_valid = False
38
31
 
39
- # Test 2: Unique IDs
40
32
  if not self.validate_unique_ids():
41
33
  all_valid = False
42
34
 
43
- # Test 3: Relationship and file reference validation
44
35
  if not self.validate_file_references():
45
36
  all_valid = False
46
37
 
47
- # Test 4: Content type declarations
48
38
  if not self.validate_content_types():
49
39
  all_valid = False
50
40
 
51
- # Test 5: XSD schema validation
52
41
  if not self.validate_against_xsd():
53
42
  all_valid = False
54
43
 
55
- # Test 6: Whitespace preservation
56
44
  if not self.validate_whitespace_preservation():
57
45
  all_valid = False
58
46
 
59
- # Test 7: Deletion validation
60
47
  if not self.validate_deletions():
61
48
  all_valid = False
62
49
 
63
- # Test 8: Insertion validation
64
50
  if not self.validate_insertions():
65
51
  all_valid = False
66
52
 
67
- # Test 9: Relationship ID reference validation
68
53
  if not self.validate_all_relationship_ids():
69
54
  all_valid = False
70
55
 
71
- # Test 10: ID constraints (paraId, durableId)
72
56
  if not self.validate_id_constraints():
73
57
  all_valid = False
74
58
 
75
- # Test 11: Comment marker validation
76
59
  if not self.validate_comment_markers():
77
60
  all_valid = False
78
61
 
79
- # Count and compare paragraphs
80
62
  self.compare_paragraph_counts()
81
63
 
82
64
  return all_valid
83
65
 
84
66
  def validate_whitespace_preservation(self):
85
- """
86
- Validate that w:t elements with whitespace have xml:space='preserve'.
87
- """
88
67
  errors = []
89
68
 
90
69
  for xml_file in self.xml_files:
91
- # Only check document.xml files
92
70
  if xml_file.name != "document.xml":
93
71
  continue
94
72
 
95
73
  try:
96
74
  root = lxml.etree.parse(str(xml_file)).getroot()
97
75
 
98
- # Find all w:t elements
99
76
  for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
100
77
  if elem.text:
101
78
  text = elem.text
102
- # Check if text starts or ends with whitespace
103
- if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
104
- # Check if xml:space="preserve" attribute exists
79
+ if re.search(r"^[ \t\n\r]", text) or re.search(
80
+ r"[ \t\n\r]$", text
81
+ ):
105
82
  xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
106
83
  if (
107
84
  xml_space_attr not in elem.attrib
108
85
  or elem.attrib[xml_space_attr] != "preserve"
109
86
  ):
110
- # Show a preview of the text
111
87
  text_preview = (
112
88
  repr(text)[:50] + "..."
113
89
  if len(repr(text)) > 50
@@ -134,15 +110,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
134
110
  return True
135
111
 
136
112
  def validate_deletions(self):
137
- """
138
- Validate that w:t and w:instrText elements are not within w:del elements.
139
- Inside w:del, use w:delText and w:delInstrText instead.
140
- XSD validation does not catch this, so we do it manually.
141
- """
142
113
  errors = []
143
114
 
144
115
  for xml_file in self.xml_files:
145
- # Only check document.xml files
146
116
  if xml_file.name != "document.xml":
147
117
  continue
148
118
 
@@ -150,10 +120,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
150
120
  root = lxml.etree.parse(str(xml_file)).getroot()
151
121
  namespaces = {"w": self.WORD_2006_NAMESPACE}
152
122
 
153
- # Find all w:t elements that are descendants of w:del elements
154
123
  for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
155
124
  if t_elem.text:
156
- # Show a preview of the text
157
125
  text_preview = (
158
126
  repr(t_elem.text)[:50] + "..."
159
127
  if len(repr(t_elem.text)) > 50
@@ -164,9 +132,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
164
132
  f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
165
133
  )
166
134
 
167
- # Find all w:instrText elements that are descendants of w:del elements
168
- # These should be w:delInstrText instead
169
- for instr_elem in root.xpath(".//w:del//w:instrText", namespaces=namespaces):
135
+ for instr_elem in root.xpath(
136
+ ".//w:del//w:instrText", namespaces=namespaces
137
+ ):
170
138
  text_preview = (
171
139
  repr(instr_elem.text or "")[:50] + "..."
172
140
  if len(repr(instr_elem.text or "")) > 50
@@ -193,17 +161,14 @@ class DOCXSchemaValidator(BaseSchemaValidator):
193
161
  return True
194
162
 
195
163
  def count_paragraphs_in_unpacked(self):
196
- """Count the number of paragraphs in the unpacked document."""
197
164
  count = 0
198
165
 
199
166
  for xml_file in self.xml_files:
200
- # Only check document.xml files
201
167
  if xml_file.name != "document.xml":
202
168
  continue
203
169
 
204
170
  try:
205
171
  root = lxml.etree.parse(str(xml_file)).getroot()
206
- # Count all w:p elements
207
172
  paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
208
173
  count = len(paragraphs)
209
174
  except Exception as e:
@@ -212,21 +177,20 @@ class DOCXSchemaValidator(BaseSchemaValidator):
212
177
  return count
213
178
 
214
179
  def count_paragraphs_in_original(self):
215
- """Count the number of paragraphs in the original docx file."""
180
+ original = self.original_file
181
+ if original is None:
182
+ return 0
183
+
216
184
  count = 0
217
185
 
218
186
  try:
219
- # Create temporary directory to unpack original
220
187
  with tempfile.TemporaryDirectory() as temp_dir:
221
- # Unpack original docx
222
- with zipfile.ZipFile(self.original_file, "r") as zip_ref:
188
+ with zipfile.ZipFile(original, "r") as zip_ref:
223
189
  zip_ref.extractall(temp_dir)
224
190
 
225
- # Parse document.xml
226
191
  doc_xml_path = temp_dir + "/word/document.xml"
227
192
  root = lxml.etree.parse(doc_xml_path).getroot()
228
193
 
229
- # Count all w:p elements
230
194
  paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
231
195
  count = len(paragraphs)
232
196
 
@@ -236,10 +200,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
236
200
  return count
237
201
 
238
202
  def validate_insertions(self):
239
- """
240
- Validate that w:delText elements are not within w:ins elements.
241
- w:delText is only allowed in w:ins if nested within a w:del.
242
- """
243
203
  errors = []
244
204
 
245
205
  for xml_file in self.xml_files:
@@ -250,7 +210,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
250
210
  root = lxml.etree.parse(str(xml_file)).getroot()
251
211
  namespaces = {"w": self.WORD_2006_NAMESPACE}
252
212
 
253
- # Find w:delText in w:ins that are NOT within w:del
254
213
  invalid_elements = root.xpath(
255
214
  ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
256
215
  )
@@ -282,7 +241,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
282
241
  return True
283
242
 
284
243
  def compare_paragraph_counts(self):
285
- """Compare paragraph counts between original and new document."""
286
244
  original_count = self.count_paragraphs_in_original()
287
245
  new_count = self.count_paragraphs_in_unpacked()
288
246
 
@@ -291,24 +249,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
291
249
  print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
292
250
 
293
251
  def _parse_id_value(self, val: str, base: int = 16) -> int:
294
- """Parse an ID value as hex (base=16) or decimal (base=10).
295
-
296
- Args:
297
- val: The string value to parse
298
- base: The numeric base (16 for hex, 10 for decimal)
299
-
300
- Returns:
301
- The parsed integer value
302
- """
303
252
  return int(val, base)
304
253
 
305
254
  def validate_id_constraints(self):
306
- """Validate paraId and durableId values per OOXML spec.
307
-
308
- Checks:
309
- - paraId < 0x80000000 (always hex)
310
- - durableId < 0x7FFFFFFF (decimal in numbering.xml, hex elsewhere)
311
- """
312
255
  errors = []
313
256
  para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId"
314
257
  durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId"
@@ -316,7 +259,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
316
259
  for xml_file in self.xml_files:
317
260
  try:
318
261
  for elem in lxml.etree.parse(str(xml_file)).iter():
319
- # paraId is always hex format
320
262
  if val := elem.get(para_id_attr):
321
263
  if self._parse_id_value(val, base=16) >= 0x80000000:
322
264
  errors.append(
@@ -324,8 +266,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
324
266
  )
325
267
 
326
268
  if val := elem.get(durable_id_attr):
327
- # durableId in numbering.xml must be decimal.
328
- # Word rejects hex-formatted durableIds in numbering.xml.
329
269
  if xml_file.name == "numbering.xml":
330
270
  try:
331
271
  if self._parse_id_value(val, base=10) >= 0x7FFFFFFF:
@@ -334,12 +274,10 @@ class DOCXSchemaValidator(BaseSchemaValidator):
334
274
  f"durableId={val} >= 0x7FFFFFFF"
335
275
  )
336
276
  except ValueError:
337
- # Contains non-decimal characters (e.g., hex letters A-F)
338
277
  errors.append(
339
278
  f" {xml_file.name}:{elem.sourceline}: "
340
279
  f"durableId={val} must be decimal in numbering.xml"
341
280
  )
342
- # durableId in other files (e.g. commentsIds.xml) uses hex format
343
281
  else:
344
282
  if self._parse_id_value(val, base=16) >= 0x7FFFFFFF:
345
283
  errors.append(
@@ -358,16 +296,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
358
296
  return not errors
359
297
 
360
298
  def validate_comment_markers(self):
361
- """Validate comment markers are properly paired and reference existing comments.
362
-
363
- Checks:
364
- - Every commentRangeStart has a matching commentRangeEnd
365
- - Every commentRangeEnd has a matching commentRangeStart
366
- - Every marker in document.xml references an existing comment
367
- """
368
299
  errors = []
369
300
 
370
- # Find document.xml and comments.xml
371
301
  document_xml = None
372
302
  comments_xml = None
373
303
  for xml_file in self.xml_files:
@@ -385,50 +315,59 @@ class DOCXSchemaValidator(BaseSchemaValidator):
385
315
  doc_root = lxml.etree.parse(str(document_xml)).getroot()
386
316
  namespaces = {"w": self.WORD_2006_NAMESPACE}
387
317
 
388
- # Collect all comment marker IDs from document.xml
389
318
  range_starts = {
390
319
  elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
391
- for elem in doc_root.xpath(".//w:commentRangeStart", namespaces=namespaces)
320
+ for elem in doc_root.xpath(
321
+ ".//w:commentRangeStart", namespaces=namespaces
322
+ )
392
323
  }
393
324
  range_ends = {
394
325
  elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
395
- for elem in doc_root.xpath(".//w:commentRangeEnd", namespaces=namespaces)
326
+ for elem in doc_root.xpath(
327
+ ".//w:commentRangeEnd", namespaces=namespaces
328
+ )
396
329
  }
397
330
  references = {
398
331
  elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
399
- for elem in doc_root.xpath(".//w:commentReference", namespaces=namespaces)
332
+ for elem in doc_root.xpath(
333
+ ".//w:commentReference", namespaces=namespaces
334
+ )
400
335
  }
401
336
 
402
- # Check for orphaned commentRangeEnd (missing commentRangeStart)
403
337
  orphaned_ends = range_ends - range_starts
404
- for comment_id in sorted(orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0):
338
+ for comment_id in sorted(
339
+ orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0
340
+ ):
405
341
  errors.append(
406
- f" document.xml: commentRangeEnd id=\"{comment_id}\" has no matching commentRangeStart"
342
+ f' document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart'
407
343
  )
408
344
 
409
- # Check for orphaned commentRangeStart (missing commentRangeEnd)
410
345
  orphaned_starts = range_starts - range_ends
411
- for comment_id in sorted(orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0):
346
+ for comment_id in sorted(
347
+ orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0
348
+ ):
412
349
  errors.append(
413
- f" document.xml: commentRangeStart id=\"{comment_id}\" has no matching commentRangeEnd"
350
+ f' document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd'
414
351
  )
415
352
 
416
- # Get comment IDs from comments.xml if it exists
417
353
  comment_ids = set()
418
354
  if comments_xml and comments_xml.exists():
419
355
  comments_root = lxml.etree.parse(str(comments_xml)).getroot()
420
356
  comment_ids = {
421
357
  elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
422
- for elem in comments_root.xpath(".//w:comment", namespaces=namespaces)
358
+ for elem in comments_root.xpath(
359
+ ".//w:comment", namespaces=namespaces
360
+ )
423
361
  }
424
362
 
425
- # Check for markers referencing non-existent comments
426
363
  marker_ids = range_starts | range_ends | references
427
364
  invalid_refs = marker_ids - comment_ids
428
- for comment_id in sorted(invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0):
429
- if comment_id: # Skip None values
365
+ for comment_id in sorted(
366
+ invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0
367
+ ):
368
+ if comment_id:
430
369
  errors.append(
431
- f" document.xml: marker id=\"{comment_id}\" references non-existent comment"
370
+ f' document.xml: marker id="{comment_id}" references non-existent comment'
432
371
  )
433
372
 
434
373
  except (lxml.etree.XMLSyntaxError, Exception) as e:
@@ -445,22 +384,11 @@ class DOCXSchemaValidator(BaseSchemaValidator):
445
384
  return True
446
385
 
447
386
  def repair(self) -> int:
448
- """Run DOCX-specific auto-repairs."""
449
387
  repairs = super().repair()
450
388
  repairs += self.repair_durableId()
451
389
  return repairs
452
390
 
453
391
  def repair_durableId(self) -> int:
454
- """Fix invalid durableId values.
455
-
456
- Repairs:
457
- - durableId >= 0x7FFFFFFF (value out of range)
458
- - durableId with hex letters in numbering.xml (wrong format)
459
-
460
- Note: paraId is not auto-repaired because it may be referenced by
461
- commentsExtended.xml, commentsIds.xml, and comment threading (paraIdParent).
462
- Changing paraId without updating all references would break comment associations.
463
- """
464
392
  repairs = 0
465
393
 
466
394
  for xml_file in self.xml_files:
@@ -476,28 +404,27 @@ class DOCXSchemaValidator(BaseSchemaValidator):
476
404
  durable_id = elem.getAttribute("w16cid:durableId")
477
405
  needs_repair = False
478
406
 
479
- # Check if durableId needs repair based on file type
480
407
  if xml_file.name == "numbering.xml":
481
- # numbering.xml requires decimal format
482
408
  try:
483
- needs_repair = self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
409
+ needs_repair = (
410
+ self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
411
+ )
484
412
  except ValueError:
485
- # Contains non-decimal characters (e.g., hex letters A-F)
486
413
  needs_repair = True
487
414
  else:
488
- # Other files (e.g. commentsIds.xml) use hex format
489
415
  try:
490
- needs_repair = self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
416
+ needs_repair = (
417
+ self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
418
+ )
491
419
  except ValueError:
492
420
  needs_repair = True
493
421
 
494
422
  if needs_repair:
495
- # Generate new ID in the correct format for this file type
496
423
  value = random.randint(1, 0x7FFFFFFE)
497
424
  if xml_file.name == "numbering.xml":
498
- new_id = str(value) # decimal for numbering.xml
425
+ new_id = str(value)
499
426
  else:
500
- new_id = f"{value:08X}" # hex for other files
427
+ new_id = f"{value:08X}"
501
428
 
502
429
  elem.setAttribute("w16cid:durableId", new_id)
503
430
  print(
@@ -8,14 +8,11 @@ from .base import BaseSchemaValidator
8
8
 
9
9
 
10
10
  class PPTXSchemaValidator(BaseSchemaValidator):
11
- """Validator for PowerPoint presentation XML files against XSD schemas."""
12
11
 
13
- # PowerPoint presentation namespace
14
12
  PRESENTATIONML_NAMESPACE = (
15
13
  "http://schemas.openxmlformats.org/presentationml/2006/main"
16
14
  )
17
15
 
18
- # PowerPoint-specific element to relationship type mappings
19
16
  ELEMENT_RELATIONSHIP_TYPES = {
20
17
  "sldid": "slide",
21
18
  "sldmasterid": "slidemaster",
@@ -26,60 +23,46 @@ class PPTXSchemaValidator(BaseSchemaValidator):
26
23
  }
27
24
 
28
25
  def validate(self):
29
- """Run all validation checks and return True if all pass."""
30
- # Test 0: XML well-formedness
31
26
  if not self.validate_xml():
32
27
  return False
33
28
 
34
- # Test 1: Namespace declarations
35
29
  all_valid = True
36
30
  if not self.validate_namespaces():
37
31
  all_valid = False
38
32
 
39
- # Test 2: Unique IDs
40
33
  if not self.validate_unique_ids():
41
34
  all_valid = False
42
35
 
43
- # Test 3: UUID ID validation
44
36
  if not self.validate_uuid_ids():
45
37
  all_valid = False
46
38
 
47
- # Test 4: Relationship and file reference validation
48
39
  if not self.validate_file_references():
49
40
  all_valid = False
50
41
 
51
- # Test 5: Slide layout ID validation
52
42
  if not self.validate_slide_layout_ids():
53
43
  all_valid = False
54
44
 
55
- # Test 6: Content type declarations
56
45
  if not self.validate_content_types():
57
46
  all_valid = False
58
47
 
59
- # Test 7: XSD schema validation
60
48
  if not self.validate_against_xsd():
61
49
  all_valid = False
62
50
 
63
- # Test 8: Notes slide reference validation
64
51
  if not self.validate_notes_slide_references():
65
52
  all_valid = False
66
53
 
67
- # Test 9: Relationship ID reference validation
68
54
  if not self.validate_all_relationship_ids():
69
55
  all_valid = False
70
56
 
71
- # Test 10: Duplicate slide layout references validation
72
57
  if not self.validate_no_duplicate_slide_layouts():
73
58
  all_valid = False
74
59
 
75
60
  return all_valid
76
61
 
77
62
  def validate_uuid_ids(self):
78
- """Validate that ID attributes that look like UUIDs contain only hex values."""
79
63
  import lxml.etree
80
64
 
81
65
  errors = []
82
- # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
83
66
  uuid_pattern = re.compile(
84
67
  r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
85
68
  )
@@ -88,15 +71,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
88
71
  try:
89
72
  root = lxml.etree.parse(str(xml_file)).getroot()
90
73
 
91
- # Check all elements for ID attributes
92
74
  for elem in root.iter():
93
75
  for attr, value in elem.attrib.items():
94
- # Check if this is an ID attribute
95
76
  attr_name = attr.split("}")[-1].lower()
96
77
  if attr_name == "id" or attr_name.endswith("id"):
97
- # Check if value looks like a UUID (has the right length and pattern structure)
98
78
  if self._looks_like_uuid(value):
99
- # Validate that it contains only hex characters in the right positions
100
79
  if not uuid_pattern.match(value):
101
80
  errors.append(
102
81
  f" {xml_file.relative_to(self.unpacked_dir)}: "
@@ -119,19 +98,14 @@ class PPTXSchemaValidator(BaseSchemaValidator):
119
98
  return True
120
99
 
121
100
  def _looks_like_uuid(self, value):
122
- """Check if a value has the general structure of a UUID."""
123
- # Remove common UUID delimiters
124
101
  clean_value = value.strip("{}()").replace("-", "")
125
- # Check if it's 32 hex-like characters (could include invalid hex chars)
126
102
  return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
127
103
 
128
104
  def validate_slide_layout_ids(self):
129
- """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
130
105
  import lxml.etree
131
106
 
132
107
  errors = []
133
108
 
134
- # Find all slide master files
135
109
  slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
136
110
 
137
111
  if not slide_masters:
@@ -141,10 +115,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
141
115
 
142
116
  for slide_master in slide_masters:
143
117
  try:
144
- # Parse the slide master file
145
118
  root = lxml.etree.parse(str(slide_master)).getroot()
146
119
 
147
- # Find the corresponding _rels file for this slide master
148
120
  rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
149
121
 
150
122
  if not rels_file.exists():
@@ -154,10 +126,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
154
126
  )
155
127
  continue
156
128
 
157
- # Parse the relationships file
158
129
  rels_root = lxml.etree.parse(str(rels_file)).getroot()
159
130
 
160
- # Build a set of valid relationship IDs that point to slide layouts
161
131
  valid_layout_rids = set()
162
132
  for rel in rels_root.findall(
163
133
  f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
@@ -166,7 +136,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
166
136
  if "slideLayout" in rel_type:
167
137
  valid_layout_rids.add(rel.get("Id"))
168
138
 
169
- # Find all sldLayoutId elements in the slide master
170
139
  for sld_layout_id in root.findall(
171
140
  f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
172
141
  ):
@@ -201,7 +170,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
201
170
  return True
202
171
 
203
172
  def validate_no_duplicate_slide_layouts(self):
204
- """Validate that each slide has exactly one slideLayout reference."""
205
173
  import lxml.etree
206
174
 
207
175
  errors = []
@@ -211,7 +179,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
211
179
  try:
212
180
  root = lxml.etree.parse(str(rels_file)).getroot()
213
181
 
214
- # Find all slideLayout relationships
215
182
  layout_rels = [
216
183
  rel
217
184
  for rel in root.findall(
@@ -241,13 +208,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
241
208
  return True
242
209
 
243
210
  def validate_notes_slide_references(self):
244
- """Validate that each notesSlide file is referenced by only one slide."""
245
211
  import lxml.etree
246
212
 
247
213
  errors = []
248
- notes_slide_references = {} # Track which slides reference each notesSlide
214
+ notes_slide_references = {}
249
215
 
250
- # Find all slide relationship files
251
216
  slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
252
217
 
253
218
  if not slide_rels_files:
@@ -257,10 +222,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
257
222
 
258
223
  for rels_file in slide_rels_files:
259
224
  try:
260
- # Parse the relationships file
261
225
  root = lxml.etree.parse(str(rels_file)).getroot()
262
226
 
263
- # Find all notesSlide relationships
264
227
  for rel in root.findall(
265
228
  f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
266
229
  ):
@@ -268,13 +231,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
268
231
  if "notesSlide" in rel_type:
269
232
  target = rel.get("Target", "")
270
233
  if target:
271
- # Normalize the target path to handle relative paths
272
234
  normalized_target = target.replace("../", "")
273
235
 
274
- # Track which slide references this notesSlide
275
236
  slide_name = rels_file.stem.replace(
276
237
  ".xml", ""
277
- ) # e.g., "slide1"
238
+ )
278
239
 
279
240
  if normalized_target not in notes_slide_references:
280
241
  notes_slide_references[normalized_target] = []
@@ -287,7 +248,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
287
248
  f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
288
249
  )
289
250
 
290
- # Check for duplicate references
291
251
  for target, references in notes_slide_references.items():
292
252
  if len(references) > 1:
293
253
  slide_names = [ref[0] for ref in references]