@heylemon/lemonade 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. package/dist/build-info.json +3 -3
  2. package/dist/canvas-host/a2ui/.bundle.hash +1 -1
  3. package/dist/gateway/skills-http.js +74 -19
  4. package/package.json +1 -1
  5. package/skills/docx/SKILL.md +25 -30
  6. package/skills/docx/scripts/accept_changes.py +0 -17
  7. package/skills/docx/scripts/comment.py +10 -39
  8. package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
  9. package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
  10. package/skills/docx/scripts/office/pack.py +0 -30
  11. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  12. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  13. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  14. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  15. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  16. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  17. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  18. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  19. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  20. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  21. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  22. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  23. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  24. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  25. package/skills/docx/scripts/office/soffice.py +0 -55
  26. package/skills/docx/scripts/office/unpack.py +5 -27
  27. package/skills/docx/scripts/office/validate.py +19 -14
  28. package/skills/docx/scripts/office/validators/base.py +48 -224
  29. package/skills/docx/scripts/office/validators/docx.py +44 -117
  30. package/skills/docx/scripts/office/validators/pptx.py +2 -42
  31. package/skills/docx/scripts/office/validators/redlining.py +3 -40
  32. package/skills/pdf/SKILL.md +22 -15
  33. package/skills/pdf/{FORMS.md → forms.md} +0 -14
  34. package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
  35. package/skills/pdf/scripts/check_fillable_fields.py +0 -1
  36. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
  37. package/skills/pdf/scripts/create_validation_image.py +0 -4
  38. package/skills/pdf/scripts/extract_form_field_info.py +1 -31
  39. package/skills/pdf/scripts/extract_form_structure.py +0 -9
  40. package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
  41. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
  42. package/skills/pptx/SKILL.md +2 -29
  43. package/skills/pptx/editing.md +2 -2
  44. package/skills/pptx/pptxgenjs.md +53 -8
  45. package/skills/pptx/scripts/add_slide.py +0 -30
  46. package/skills/pptx/scripts/clean.py +0 -23
  47. package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
  48. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
  49. package/skills/pptx/scripts/office/pack.py +0 -30
  50. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  51. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  52. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  53. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  54. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  55. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  56. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  57. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  58. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  59. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  60. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  61. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  62. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  63. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  64. package/skills/pptx/scripts/office/soffice.py +0 -55
  65. package/skills/pptx/scripts/office/unpack.py +5 -27
  66. package/skills/pptx/scripts/office/validate.py +19 -14
  67. package/skills/pptx/scripts/office/validators/base.py +48 -224
  68. package/skills/pptx/scripts/office/validators/docx.py +44 -117
  69. package/skills/pptx/scripts/office/validators/pptx.py +2 -42
  70. package/skills/pptx/scripts/office/validators/redlining.py +3 -40
  71. package/skills/pptx/scripts/thumbnail.py +0 -31
  72. package/skills/xlsx/SKILL.md +3 -26
  73. package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
  74. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
  75. package/skills/xlsx/scripts/office/pack.py +0 -30
  76. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  77. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  78. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  79. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  80. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  81. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  82. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  83. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  84. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  85. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  86. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  87. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  88. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  89. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  90. package/skills/xlsx/scripts/office/soffice.py +0 -55
  91. package/skills/xlsx/scripts/office/unpack.py +5 -27
  92. package/skills/xlsx/scripts/office/validate.py +19 -14
  93. package/skills/xlsx/scripts/office/validators/base.py +48 -224
  94. package/skills/xlsx/scripts/office/validators/docx.py +44 -117
  95. package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
  96. package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
  97. package/skills/xlsx/scripts/recalc.py +2 -26
  98. package/skills/docx/scripts/__init__.py +0 -1
  99. package/skills/docx/scripts/office/helpers/__init__.py +0 -0
  100. package/skills/docx/scripts/office/validators/__init__.py +0 -15
  101. package/skills/pptx/scripts/__init__.py +0 -0
  102. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  103. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  104. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  105. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  106. /package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
@@ -10,85 +10,57 @@ import lxml.etree
10
10
 
11
11
 
12
12
  class BaseSchemaValidator:
13
- """Base validator with common validation logic for document files."""
14
13
 
15
- # Validation errors to ignore (patterns that appear in error messages)
16
- # These are XSD schema errors that don't affect document functionality,
17
- # typically caused by specific editors like LibreOffice.
18
14
  IGNORED_VALIDATION_ERRORS = [
19
- # LibreOffice writes hyphenationZone in wrong order in word/settings.xml.
20
- # The XSD requires strict element ordering, but LibreOffice puts doNotHyphenateCaps
21
- # before hyphenationZone. This doesn't affect document rendering.
22
15
  "hyphenationZone",
16
+ "purl.org/dc/terms",
23
17
  ]
24
18
 
25
- # Elements whose 'id' attributes must be unique within their file
26
- # Format: element_name -> (attribute_name, scope)
27
- # scope can be 'file' (unique within file) or 'global' (unique across all files)
28
19
  UNIQUE_ID_REQUIREMENTS = {
29
- # Word elements
30
- "comment": ("id", "file"), # Comment IDs in comments.xml
31
- "commentrangestart": ("id", "file"), # Must match comment IDs
32
- "commentrangeend": ("id", "file"), # Must match comment IDs
33
- "bookmarkstart": ("id", "file"), # Bookmark start IDs
34
- "bookmarkend": ("id", "file"), # Bookmark end IDs
35
- # Note: ins and del (track changes) can share IDs when part of same revision
36
- # PowerPoint elements
37
- "sldid": ("id", "file"), # Slide IDs in presentation.xml
38
- "sldmasterid": ("id", "global"), # Slide master IDs must be globally unique
39
- "sldlayoutid": ("id", "global"), # Slide layout IDs must be globally unique
40
- "cm": ("authorid", "file"), # Comment author IDs
41
- # Excel elements
42
- "sheet": ("sheetid", "file"), # Sheet IDs in workbook.xml
43
- "definedname": ("id", "file"), # Named range IDs
44
- # Drawing/Shape elements (all formats)
45
- "cxnsp": ("id", "file"), # Connection shape IDs
46
- "sp": ("id", "file"), # Shape IDs
47
- "pic": ("id", "file"), # Picture IDs
48
- "grpsp": ("id", "file"), # Group shape IDs
20
+ "comment": ("id", "file"),
21
+ "commentrangestart": ("id", "file"),
22
+ "commentrangeend": ("id", "file"),
23
+ "bookmarkstart": ("id", "file"),
24
+ "bookmarkend": ("id", "file"),
25
+ "sldid": ("id", "file"),
26
+ "sldmasterid": ("id", "global"),
27
+ "sldlayoutid": ("id", "global"),
28
+ "cm": ("authorid", "file"),
29
+ "sheet": ("sheetid", "file"),
30
+ "definedname": ("id", "file"),
31
+ "cxnsp": ("id", "file"),
32
+ "sp": ("id", "file"),
33
+ "pic": ("id", "file"),
34
+ "grpsp": ("id", "file"),
49
35
  }
50
36
 
51
- # Container elements where ID uniqueness checks should be skipped
52
- # These hold references that intentionally duplicate IDs of elements they reference
53
- # Example: <p14:sldId id="301"> in sectionLst references <p:sldId id="301"> in sldIdLst
54
37
  EXCLUDED_ID_CONTAINERS = {
55
- "sectionlst", # PowerPoint sections - sldId elements reference slides by ID
38
+ "sectionlst",
56
39
  }
57
40
 
58
- # Mapping of element names to expected relationship types
59
- # Subclasses should override this with format-specific mappings
60
41
  ELEMENT_RELATIONSHIP_TYPES = {}
61
42
 
62
- # Unified schema mappings for all Office document types
63
43
  SCHEMA_MAPPINGS = {
64
- # Document type specific schemas
65
- "word": "ISO-IEC29500-4_2016/wml.xsd", # Word documents
66
- "ppt": "ISO-IEC29500-4_2016/pml.xsd", # PowerPoint presentations
67
- "xl": "ISO-IEC29500-4_2016/sml.xsd", # Excel spreadsheets
68
- # Common file types
44
+ "word": "ISO-IEC29500-4_2016/wml.xsd",
45
+ "ppt": "ISO-IEC29500-4_2016/pml.xsd",
46
+ "xl": "ISO-IEC29500-4_2016/sml.xsd",
69
47
  "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
70
48
  "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
71
49
  "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
72
50
  "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
73
51
  ".rels": "ecma/fouth-edition/opc-relationships.xsd",
74
- # Word-specific files
75
52
  "people.xml": "microsoft/wml-2012.xsd",
76
53
  "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
77
54
  "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
78
55
  "commentsExtended.xml": "microsoft/wml-2012.xsd",
79
- # Chart files (common across document types)
80
56
  "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
81
- # Theme files (common across document types)
82
57
  "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
83
- # Drawing and media files
84
58
  "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
85
59
  }
86
60
 
87
- # Unified namespace constants
88
61
  MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
89
62
  XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
90
63
 
91
- # Common OOXML namespaces used across validators
92
64
  PACKAGE_RELATIONSHIPS_NAMESPACE = (
93
65
  "http://schemas.openxmlformats.org/package/2006/relationships"
94
66
  )
@@ -99,10 +71,8 @@ class BaseSchemaValidator:
99
71
  "http://schemas.openxmlformats.org/package/2006/content-types"
100
72
  )
101
73
 
102
- # Folders where we should clean ignorable namespaces
103
74
  MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
104
75
 
105
- # All allowed OOXML namespaces (superset of all document types)
106
76
  OOXML_NAMESPACES = {
107
77
  "http://schemas.openxmlformats.org/officeDocument/2006/math",
108
78
  "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
@@ -121,15 +91,13 @@ class BaseSchemaValidator:
121
91
  "http://www.w3.org/XML/1998/namespace",
122
92
  }
123
93
 
124
- def __init__(self, unpacked_dir, original_file, verbose=False):
94
+ def __init__(self, unpacked_dir, original_file=None, verbose=False):
125
95
  self.unpacked_dir = Path(unpacked_dir).resolve()
126
- self.original_file = Path(original_file)
96
+ self.original_file = Path(original_file) if original_file else None
127
97
  self.verbose = verbose
128
98
 
129
- # Set schemas directory
130
99
  self.schemas_dir = Path(__file__).parent.parent / "schemas"
131
100
 
132
- # Get all XML and .rels files
133
101
  patterns = ["*.xml", "*.rels"]
134
102
  self.xml_files = [
135
103
  f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
@@ -139,15 +107,12 @@ class BaseSchemaValidator:
139
107
  print(f"Warning: No XML files found in {self.unpacked_dir}")
140
108
 
141
109
  def validate(self):
142
- """Run all validation checks and return True if all pass."""
143
110
  raise NotImplementedError("Subclasses must implement the validate method")
144
111
 
145
112
  def repair(self) -> int:
146
- """Run auto-repairs. Returns count of repairs made. Subclasses should override and call super()."""
147
113
  return self.repair_whitespace_preservation()
148
114
 
149
115
  def repair_whitespace_preservation(self) -> int:
150
- """Add xml:space='preserve' to w:t/a:t elements with leading/trailing whitespace."""
151
116
  repairs = 0
152
117
 
153
118
  for xml_file in self.xml_files:
@@ -176,12 +141,10 @@ class BaseSchemaValidator:
176
141
  return repairs
177
142
 
178
143
  def validate_xml(self):
179
- """Validate that all XML files are well-formed."""
180
144
  errors = []
181
145
 
182
146
  for xml_file in self.xml_files:
183
147
  try:
184
- # Try to parse the XML file
185
148
  lxml.etree.parse(str(xml_file))
186
149
  except lxml.etree.XMLSyntaxError as e:
187
150
  errors.append(
@@ -205,13 +168,12 @@ class BaseSchemaValidator:
205
168
  return True
206
169
 
207
170
  def validate_namespaces(self):
208
- """Validate that namespace prefixes in Ignorable attributes are declared."""
209
171
  errors = []
210
172
 
211
173
  for xml_file in self.xml_files:
212
174
  try:
213
175
  root = lxml.etree.parse(str(xml_file)).getroot()
214
- declared = set(root.nsmap.keys()) - {None} # Exclude default namespace
176
+ declared = set(root.nsmap.keys()) - {None}
215
177
 
216
178
  for attr_val in [
217
179
  v for k, v in root.attrib.items() if k.endswith("Ignorable")
@@ -235,35 +197,28 @@ class BaseSchemaValidator:
235
197
  return True
236
198
 
237
199
  def validate_unique_ids(self):
238
- """Validate that specific IDs are unique according to OOXML requirements."""
239
200
  errors = []
240
- global_ids = {} # Track globally unique IDs across all files
201
+ global_ids = {}
241
202
 
242
203
  for xml_file in self.xml_files:
243
204
  try:
244
205
  root = lxml.etree.parse(str(xml_file)).getroot()
245
- file_ids = {} # Track IDs that must be unique within this file
206
+ file_ids = {}
246
207
 
247
- # Remove all mc:AlternateContent elements from the tree
248
208
  mc_elements = root.xpath(
249
209
  ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
250
210
  )
251
211
  for elem in mc_elements:
252
212
  elem.getparent().remove(elem)
253
213
 
254
- # Now check IDs in the cleaned tree
255
214
  for elem in root.iter():
256
- # Get the element name without namespace
257
215
  tag = (
258
216
  elem.tag.split("}")[-1].lower()
259
217
  if "}" in elem.tag
260
218
  else elem.tag.lower()
261
219
  )
262
220
 
263
- # Check if this element type has ID uniqueness requirements
264
221
  if tag in self.UNIQUE_ID_REQUIREMENTS:
265
- # Skip if element is inside an excluded container
266
- # (e.g., <p14:sldId> inside <p14:sectionLst> is a reference, not a definition)
267
222
  in_excluded_container = any(
268
223
  ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
269
224
  for ancestor in elem.iterancestors()
@@ -273,7 +228,6 @@ class BaseSchemaValidator:
273
228
 
274
229
  attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
275
230
 
276
- # Look for the specified attribute
277
231
  id_value = None
278
232
  for attr, value in elem.attrib.items():
279
233
  attr_local = (
@@ -287,7 +241,6 @@ class BaseSchemaValidator:
287
241
 
288
242
  if id_value is not None:
289
243
  if scope == "global":
290
- # Check global uniqueness
291
244
  if id_value in global_ids:
292
245
  prev_file, prev_line, prev_tag = global_ids[
293
246
  id_value
@@ -304,7 +257,6 @@ class BaseSchemaValidator:
304
257
  tag,
305
258
  )
306
259
  elif scope == "file":
307
- # Check file-level uniqueness
308
260
  key = (tag, attr_name)
309
261
  if key not in file_ids:
310
262
  file_ids[key] = {}
@@ -335,12 +287,8 @@ class BaseSchemaValidator:
335
287
  return True
336
288
 
337
289
  def validate_file_references(self):
338
- """
339
- Validate that all .rels files properly reference files and that all files are referenced.
340
- """
341
290
  errors = []
342
291
 
343
- # Find all .rels files
344
292
  rels_files = list(self.unpacked_dir.rglob("*.rels"))
345
293
 
346
294
  if not rels_files:
@@ -348,17 +296,15 @@ class BaseSchemaValidator:
348
296
  print("PASSED - No .rels files found")
349
297
  return True
350
298
 
351
- # Get all files in the unpacked directory (excluding reference files)
352
299
  all_files = []
353
300
  for file_path in self.unpacked_dir.rglob("*"):
354
301
  if (
355
302
  file_path.is_file()
356
303
  and file_path.name != "[Content_Types].xml"
357
304
  and not file_path.name.endswith(".rels")
358
- ): # This file is not referenced by .rels
305
+ ):
359
306
  all_files.append(file_path.resolve())
360
307
 
361
- # Track all files that are referenced by any .rels file
362
308
  all_referenced_files = set()
363
309
 
364
310
  if self.verbose:
@@ -366,16 +312,12 @@ class BaseSchemaValidator:
366
312
  f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
367
313
  )
368
314
 
369
- # Check each .rels file
370
315
  for rels_file in rels_files:
371
316
  try:
372
- # Parse relationships file
373
317
  rels_root = lxml.etree.parse(str(rels_file)).getroot()
374
318
 
375
- # Get the directory where this .rels file is located
376
319
  rels_dir = rels_file.parent
377
320
 
378
- # Find all relationships and their targets
379
321
  referenced_files = set()
380
322
  broken_refs = []
381
323
 
@@ -386,24 +328,15 @@ class BaseSchemaValidator:
386
328
  target = rel.get("Target")
387
329
  if target and not target.startswith(
388
330
  ("http", "mailto:")
389
- ): # Skip external URLs
390
- # Resolve the target path
391
- # Absolute paths (starting with /) are relative to package root
392
- # Relative paths are relative to the .rels file's parent directory
331
+ ):
393
332
  if target.startswith("/"):
394
- # Absolute path - resolve from unpacked_dir root
395
- # Strip leading / to avoid pathlib replacing the base
396
333
  target_path = self.unpacked_dir / target.lstrip("/")
397
334
  elif rels_file.name == ".rels":
398
- # Root .rels file - relative targets are relative to unpacked_dir
399
335
  target_path = self.unpacked_dir / target
400
336
  else:
401
- # Other .rels files - relative targets are relative to their parent's parent
402
- # e.g., word/_rels/document.xml.rels -> targets relative to word/
403
337
  base_dir = rels_dir.parent
404
338
  target_path = base_dir / target
405
339
 
406
- # Normalize the path and check if it exists
407
340
  try:
408
341
  target_path = target_path.resolve()
409
342
  if target_path.exists() and target_path.is_file():
@@ -414,7 +347,6 @@ class BaseSchemaValidator:
414
347
  except (OSError, ValueError):
415
348
  broken_refs.append((target, rel.sourceline))
416
349
 
417
- # Report broken references
418
350
  if broken_refs:
419
351
  rel_path = rels_file.relative_to(self.unpacked_dir)
420
352
  for broken_ref, line_num in broken_refs:
@@ -426,7 +358,6 @@ class BaseSchemaValidator:
426
358
  rel_path = rels_file.relative_to(self.unpacked_dir)
427
359
  errors.append(f" Error parsing {rel_path}: {e}")
428
360
 
429
- # Check for unreferenced files (files that exist but are not referenced anywhere)
430
361
  unreferenced_files = set(all_files) - all_referenced_files
431
362
 
432
363
  if unreferenced_files:
@@ -452,31 +383,21 @@ class BaseSchemaValidator:
452
383
  return True
453
384
 
454
385
  def validate_all_relationship_ids(self):
455
- """
456
- Validate that all r:id attributes in XML files reference existing IDs
457
- in their corresponding .rels files, and optionally validate relationship types.
458
- """
459
386
  import lxml.etree
460
387
 
461
388
  errors = []
462
389
 
463
- # Process each XML file that might contain r:id references
464
390
  for xml_file in self.xml_files:
465
- # Skip .rels files themselves
466
391
  if xml_file.suffix == ".rels":
467
392
  continue
468
393
 
469
- # Determine the corresponding .rels file
470
- # For dir/file.xml, it's dir/_rels/file.xml.rels
471
394
  rels_dir = xml_file.parent / "_rels"
472
395
  rels_file = rels_dir / f"{xml_file.name}.rels"
473
396
 
474
- # Skip if there's no corresponding .rels file (that's okay)
475
397
  if not rels_file.exists():
476
398
  continue
477
399
 
478
400
  try:
479
- # Parse the .rels file to get valid relationship IDs and their types
480
401
  rels_root = lxml.etree.parse(str(rels_file)).getroot()
481
402
  rid_to_type = {}
482
403
 
@@ -486,47 +407,43 @@ class BaseSchemaValidator:
486
407
  rid = rel.get("Id")
487
408
  rel_type = rel.get("Type", "")
488
409
  if rid:
489
- # Check for duplicate rIds
490
410
  if rid in rid_to_type:
491
411
  rels_rel_path = rels_file.relative_to(self.unpacked_dir)
492
412
  errors.append(
493
413
  f" {rels_rel_path}: Line {rel.sourceline}: "
494
414
  f"Duplicate relationship ID '{rid}' (IDs must be unique)"
495
415
  )
496
- # Extract just the type name from the full URL
497
416
  type_name = (
498
417
  rel_type.split("/")[-1] if "/" in rel_type else rel_type
499
418
  )
500
419
  rid_to_type[rid] = type_name
501
420
 
502
- # Parse the XML file to find all r:id references
503
421
  xml_root = lxml.etree.parse(str(xml_file)).getroot()
504
422
 
505
- # Find all elements with r:id attributes
423
+ r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
424
+ rid_attrs_to_check = ["id", "embed", "link"]
506
425
  for elem in xml_root.iter():
507
- # Check for r:id attribute (relationship ID)
508
- rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
509
- if rid_attr:
426
+ for attr_name in rid_attrs_to_check:
427
+ rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
428
+ if not rid_attr:
429
+ continue
510
430
  xml_rel_path = xml_file.relative_to(self.unpacked_dir)
511
431
  elem_name = (
512
432
  elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
513
433
  )
514
434
 
515
- # Check if the ID exists
516
435
  if rid_attr not in rid_to_type:
517
436
  errors.append(
518
437
  f" {xml_rel_path}: Line {elem.sourceline}: "
519
- f"<{elem_name}> references non-existent relationship '{rid_attr}' "
438
+ f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
520
439
  f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
521
440
  )
522
- # Check if we have type expectations for this element
523
- elif self.ELEMENT_RELATIONSHIP_TYPES:
441
+ elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
524
442
  expected_type = self._get_expected_relationship_type(
525
443
  elem_name
526
444
  )
527
445
  if expected_type:
528
446
  actual_type = rid_to_type[rid_attr]
529
- # Check if the actual type matches or contains the expected type
530
447
  if expected_type not in actual_type.lower():
531
448
  errors.append(
532
449
  f" {xml_rel_path}: Line {elem.sourceline}: "
@@ -550,58 +467,41 @@ class BaseSchemaValidator:
550
467
  return True
551
468
 
552
469
  def _get_expected_relationship_type(self, element_name):
553
- """
554
- Get the expected relationship type for an element.
555
- First checks the explicit mapping, then tries pattern detection.
556
- """
557
- # Normalize element name to lowercase
558
470
  elem_lower = element_name.lower()
559
471
 
560
- # Check explicit mapping first
561
472
  if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
562
473
  return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
563
474
 
564
- # Try pattern detection for common patterns
565
- # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
566
475
  if elem_lower.endswith("id") and len(elem_lower) > 2:
567
- # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
568
- prefix = elem_lower[:-2] # Remove "id"
569
- # Check if this might be a compound like "sldMasterId"
476
+ prefix = elem_lower[:-2]
570
477
  if prefix.endswith("master"):
571
478
  return prefix.lower()
572
479
  elif prefix.endswith("layout"):
573
480
  return prefix.lower()
574
481
  else:
575
- # Simple case like "sldId" -> "slide"
576
- # Common transformations
577
482
  if prefix == "sld":
578
483
  return "slide"
579
484
  return prefix.lower()
580
485
 
581
- # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
582
486
  if elem_lower.endswith("reference") and len(elem_lower) > 9:
583
- prefix = elem_lower[:-9] # Remove "reference"
487
+ prefix = elem_lower[:-9]
584
488
  return prefix.lower()
585
489
 
586
490
  return None
587
491
 
588
492
  def validate_content_types(self):
589
- """Validate that all content files are properly declared in [Content_Types].xml."""
590
493
  errors = []
591
494
 
592
- # Find [Content_Types].xml file
593
495
  content_types_file = self.unpacked_dir / "[Content_Types].xml"
594
496
  if not content_types_file.exists():
595
497
  print("FAILED - [Content_Types].xml file not found")
596
498
  return False
597
499
 
598
500
  try:
599
- # Parse and get all declared parts and extensions
600
501
  root = lxml.etree.parse(str(content_types_file)).getroot()
601
502
  declared_parts = set()
602
503
  declared_extensions = set()
603
504
 
604
- # Get Override declarations (specific files)
605
505
  for override in root.findall(
606
506
  f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
607
507
  ):
@@ -609,7 +509,6 @@ class BaseSchemaValidator:
609
509
  if part_name is not None:
610
510
  declared_parts.add(part_name.lstrip("/"))
611
511
 
612
- # Get Default declarations (by extension)
613
512
  for default in root.findall(
614
513
  f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
615
514
  ):
@@ -617,19 +516,17 @@ class BaseSchemaValidator:
617
516
  if extension is not None:
618
517
  declared_extensions.add(extension.lower())
619
518
 
620
- # Root elements that require content type declaration
621
519
  declarable_roots = {
622
520
  "sld",
623
521
  "sldLayout",
624
522
  "sldMaster",
625
- "presentation", # PowerPoint
626
- "document", # Word
523
+ "presentation",
524
+ "document",
627
525
  "workbook",
628
- "worksheet", # Excel
629
- "theme", # Common
526
+ "worksheet",
527
+ "theme",
630
528
  }
631
529
 
632
- # Common media file extensions that should be declared
633
530
  media_extensions = {
634
531
  "png": "image/png",
635
532
  "jpg": "image/jpeg",
@@ -641,17 +538,14 @@ class BaseSchemaValidator:
641
538
  "emf": "image/x-emf",
642
539
  }
643
540
 
644
- # Get all files in the unpacked directory
645
541
  all_files = list(self.unpacked_dir.rglob("*"))
646
542
  all_files = [f for f in all_files if f.is_file()]
647
543
 
648
- # Check all XML files for Override declarations
649
544
  for xml_file in self.xml_files:
650
545
  path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
651
546
  "\\", "/"
652
547
  )
653
548
 
654
- # Skip non-content files
655
549
  if any(
656
550
  skip in path_str
657
551
  for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
@@ -668,11 +562,9 @@ class BaseSchemaValidator:
668
562
  )
669
563
 
670
564
  except Exception:
671
- continue # Skip unparseable files
565
+ continue
672
566
 
673
- # Check all non-XML files for Default extension declarations
674
567
  for file_path in all_files:
675
- # Skip XML files and metadata files (already checked above)
676
568
  if file_path.suffix.lower() in {".xml", ".rels"}:
677
569
  continue
678
570
  if file_path.name == "[Content_Types].xml":
@@ -682,7 +574,6 @@ class BaseSchemaValidator:
682
574
 
683
575
  extension = file_path.suffix.lstrip(".").lower()
684
576
  if extension and extension not in declared_extensions:
685
- # Check if it's a known media extension that should be declared
686
577
  if extension in media_extensions:
687
578
  relative_path = file_path.relative_to(self.unpacked_dir)
688
579
  errors.append(
@@ -705,37 +596,23 @@ class BaseSchemaValidator:
705
596
  return True
706
597
 
707
598
  def validate_file_against_xsd(self, xml_file, verbose=False):
708
- """Validate a single XML file against XSD schema, comparing with original.
709
-
710
- Args:
711
- xml_file: Path to XML file to validate
712
- verbose: Enable verbose output
713
-
714
- Returns:
715
- tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
716
- """
717
- # Resolve both paths to handle symlinks
718
599
  xml_file = Path(xml_file).resolve()
719
600
  unpacked_dir = self.unpacked_dir.resolve()
720
601
 
721
- # Validate current file
722
602
  is_valid, current_errors = self._validate_single_file_xsd(
723
603
  xml_file, unpacked_dir
724
604
  )
725
605
 
726
606
  if is_valid is None:
727
- return None, set() # Skipped
607
+ return None, set()
728
608
  elif is_valid:
729
- return True, set() # Valid, no errors
609
+ return True, set()
730
610
 
731
- # Get errors from original file for this specific file
732
611
  original_errors = self._get_original_file_errors(xml_file)
733
612
 
734
- # Compare with original (both are guaranteed to be sets here)
735
613
  assert current_errors is not None
736
614
  new_errors = current_errors - original_errors
737
615
 
738
- # Filter out known harmless errors (e.g., LibreOffice element ordering issues)
739
616
  new_errors = {
740
617
  e for e in new_errors
741
618
  if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
@@ -750,7 +627,6 @@ class BaseSchemaValidator:
750
627
  print(f" - {truncated}")
751
628
  return False, new_errors
752
629
  else:
753
- # All errors existed in original
754
630
  if verbose:
755
631
  print(
756
632
  f"PASSED - No new errors (original had {len(current_errors)} errors)"
@@ -758,7 +634,6 @@ class BaseSchemaValidator:
758
634
  return True, set()
759
635
 
760
636
  def validate_against_xsd(self):
761
- """Validate XML files against XSD schemas, showing only new errors compared to original."""
762
637
  new_errors = []
763
638
  original_error_count = 0
764
639
  valid_count = 0
@@ -777,19 +652,16 @@ class BaseSchemaValidator:
777
652
  valid_count += 1
778
653
  continue
779
654
  elif is_valid:
780
- # Had errors but all existed in original
781
655
  original_error_count += 1
782
656
  valid_count += 1
783
657
  continue
784
658
 
785
- # Has new errors
786
659
  new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
787
- for error in list(new_file_errors)[:3]: # Show first 3 errors
660
+ for error in list(new_file_errors)[:3]:
788
661
  new_errors.append(
789
662
  f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
790
663
  )
791
664
 
792
- # Print summary
793
665
  if self.verbose:
794
666
  print(f"Validated {len(self.xml_files)} files:")
795
667
  print(f" - Valid: {valid_count}")
@@ -811,62 +683,47 @@ class BaseSchemaValidator:
811
683
  return True
812
684
 
813
685
  def _get_schema_path(self, xml_file):
814
- """Determine the appropriate schema path for an XML file."""
815
- # Check exact filename match
816
686
  if xml_file.name in self.SCHEMA_MAPPINGS:
817
687
  return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
818
688
 
819
- # Check .rels files
820
689
  if xml_file.suffix == ".rels":
821
690
  return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
822
691
 
823
- # Check chart files
824
692
  if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
825
693
  return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
826
694
 
827
- # Check theme files
828
695
  if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
829
696
  return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
830
697
 
831
- # Check if file is in a main content folder and use appropriate schema
832
698
  if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
833
699
  return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
834
700
 
835
701
  return None
836
702
 
837
703
  def _clean_ignorable_namespaces(self, xml_doc):
838
- """Remove attributes and elements not in allowed namespaces."""
839
- # Create a clean copy
840
704
  xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
841
705
  xml_copy = lxml.etree.fromstring(xml_string)
842
706
 
843
- # Remove attributes not in allowed namespaces
844
707
  for elem in xml_copy.iter():
845
708
  attrs_to_remove = []
846
709
 
847
710
  for attr in elem.attrib:
848
- # Check if attribute is from a namespace other than allowed ones
849
711
  if "{" in attr:
850
712
  ns = attr.split("}")[0][1:]
851
713
  if ns not in self.OOXML_NAMESPACES:
852
714
  attrs_to_remove.append(attr)
853
715
 
854
- # Remove collected attributes
855
716
  for attr in attrs_to_remove:
856
717
  del elem.attrib[attr]
857
718
 
858
- # Remove elements not in allowed namespaces
859
719
  self._remove_ignorable_elements(xml_copy)
860
720
 
861
721
  return lxml.etree.ElementTree(xml_copy)
862
722
 
863
723
  def _remove_ignorable_elements(self, root):
864
- """Recursively remove all elements not in allowed namespaces."""
865
724
  elements_to_remove = []
866
725
 
867
- # Find elements to remove
868
726
  for elem in list(root):
869
- # Skip non-element nodes (comments, processing instructions, etc.)
870
727
  if not hasattr(elem, "tag") or callable(elem.tag):
871
728
  continue
872
729
 
@@ -877,32 +734,25 @@ class BaseSchemaValidator:
877
734
  elements_to_remove.append(elem)
878
735
  continue
879
736
 
880
- # Recursively clean child elements
881
737
  self._remove_ignorable_elements(elem)
882
738
 
883
- # Remove collected elements
884
739
  for elem in elements_to_remove:
885
740
  root.remove(elem)
886
741
 
887
742
  def _preprocess_for_mc_ignorable(self, xml_doc):
888
- """Preprocess XML to handle mc:Ignorable attribute properly."""
889
- # Remove mc:Ignorable attributes before validation
890
743
  root = xml_doc.getroot()
891
744
 
892
- # Remove mc:Ignorable attribute from root
893
745
  if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
894
746
  del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
895
747
 
896
748
  return xml_doc
897
749
 
898
750
  def _validate_single_file_xsd(self, xml_file, base_path):
899
- """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
900
751
  schema_path = self._get_schema_path(xml_file)
901
752
  if not schema_path:
902
- return None, None # Skip file
753
+ return None, None
903
754
 
904
755
  try:
905
- # Load schema
906
756
  with open(schema_path, "rb") as xsd_file:
907
757
  parser = lxml.etree.XMLParser()
908
758
  xsd_doc = lxml.etree.parse(
@@ -910,14 +760,12 @@ class BaseSchemaValidator:
910
760
  )
911
761
  schema = lxml.etree.XMLSchema(xsd_doc)
912
762
 
913
- # Load and preprocess XML
914
763
  with open(xml_file, "r") as f:
915
764
  xml_doc = lxml.etree.parse(f)
916
765
 
917
766
  xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
918
767
  xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
919
768
 
920
- # Clean ignorable namespaces if needed
921
769
  relative_path = xml_file.relative_to(base_path)
922
770
  if (
923
771
  relative_path.parts
@@ -925,13 +773,11 @@ class BaseSchemaValidator:
925
773
  ):
926
774
  xml_doc = self._clean_ignorable_namespaces(xml_doc)
927
775
 
928
- # Validate
929
776
  if schema.validate(xml_doc):
930
777
  return True, set()
931
778
  else:
932
779
  errors = set()
933
780
  for error in schema.error_log:
934
- # Store normalized error message (without line numbers for comparison)
935
781
  errors.add(error.message)
936
782
  return False, errors
937
783
 
@@ -939,18 +785,12 @@ class BaseSchemaValidator:
939
785
  return False, {str(e)}
940
786
 
941
787
  def _get_original_file_errors(self, xml_file):
942
- """Get XSD validation errors from a single file in the original document.
788
+ if self.original_file is None:
789
+ return set()
943
790
 
944
- Args:
945
- xml_file: Path to the XML file in unpacked_dir to check
946
-
947
- Returns:
948
- set: Set of error messages from the original file
949
- """
950
791
  import tempfile
951
792
  import zipfile
952
793
 
953
- # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
954
794
  xml_file = Path(xml_file).resolve()
955
795
  unpacked_dir = self.unpacked_dir.resolve()
956
796
  relative_path = xml_file.relative_to(unpacked_dir)
@@ -958,37 +798,23 @@ class BaseSchemaValidator:
958
798
  with tempfile.TemporaryDirectory() as temp_dir:
959
799
  temp_path = Path(temp_dir)
960
800
 
961
- # Extract original file
962
801
  with zipfile.ZipFile(self.original_file, "r") as zip_ref:
963
802
  zip_ref.extractall(temp_path)
964
803
 
965
- # Find corresponding file in original
966
804
  original_xml_file = temp_path / relative_path
967
805
 
968
806
  if not original_xml_file.exists():
969
- # File didn't exist in original, so no original errors
970
807
  return set()
971
808
 
972
- # Validate the specific file in original
973
809
  is_valid, errors = self._validate_single_file_xsd(
974
810
  original_xml_file, temp_path
975
811
  )
976
812
  return errors if errors else set()
977
813
 
978
814
  def _remove_template_tags_from_text_nodes(self, xml_doc):
979
- """Remove template tags from XML text nodes and collect warnings.
980
-
981
- Template tags follow the pattern {{ ... }} and are used as placeholders
982
- for content replacement. They should be removed from text content before
983
- XSD validation while preserving XML structure.
984
-
985
- Returns:
986
- tuple: (cleaned_xml_doc, warnings_list)
987
- """
988
815
  warnings = []
989
816
  template_pattern = re.compile(r"\{\{[^}]*\}\}")
990
817
 
991
- # Create a copy of the document to avoid modifying the original
992
818
  xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
993
819
  xml_copy = lxml.etree.fromstring(xml_string)
994
820
 
@@ -1004,9 +830,7 @@ class BaseSchemaValidator:
1004
830
  return template_pattern.sub("", text)
1005
831
  return text
1006
832
 
1007
- # Process all text nodes in the document
1008
833
  for elem in xml_copy.iter():
1009
- # Skip processing if this is a w:t element
1010
834
  if not hasattr(elem, "tag") or callable(elem.tag):
1011
835
  continue
1012
836
  tag_str = str(elem.tag)