npm - @heylemon/lemonade - Versions diffs - 0.0.4 → 0.0.6 - Mend

@heylemon/lemonade 0.0.4 → 0.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (106) hide show

package/skills/pptx/scripts/office/validators/base.py CHANGED Viewed

@@ -10,85 +10,57 @@ import lxml.etree
 class BaseSchemaValidator:
-    """Base validator with common validation logic for document files."""
-    # Validation errors to ignore (patterns that appear in error messages)
-    # These are XSD schema errors that don't affect document functionality,
-    # typically caused by specific editors like LibreOffice.
     IGNORED_VALIDATION_ERRORS = [
-        # LibreOffice writes hyphenationZone in wrong order in word/settings.xml.
-        # The XSD requires strict element ordering, but LibreOffice puts doNotHyphenateCaps
-        # before hyphenationZone. This doesn't affect document rendering.
         "hyphenationZone",
+        "purl.org/dc/terms",
     ]
-    # Elements whose 'id' attributes must be unique within their file
-    # Format: element_name -> (attribute_name, scope)
-    # scope can be 'file' (unique within file) or 'global' (unique across all files)
     UNIQUE_ID_REQUIREMENTS = {
-        # Word elements
-        "comment": ("id", "file"),  # Comment IDs in comments.xml
-        "commentrangestart": ("id", "file"),  # Must match comment IDs
-        "commentrangeend": ("id", "file"),  # Must match comment IDs
-        "bookmarkstart": ("id", "file"),  # Bookmark start IDs
-        "bookmarkend": ("id", "file"),  # Bookmark end IDs
-        # Note: ins and del (track changes) can share IDs when part of same revision
-        # PowerPoint elements
-        "sldid": ("id", "file"),  # Slide IDs in presentation.xml
-        "sldmasterid": ("id", "global"),  # Slide master IDs must be globally unique
-        "sldlayoutid": ("id", "global"),  # Slide layout IDs must be globally unique
-        "cm": ("authorid", "file"),  # Comment author IDs
-        # Excel elements
-        "sheet": ("sheetid", "file"),  # Sheet IDs in workbook.xml
-        "definedname": ("id", "file"),  # Named range IDs
-        # Drawing/Shape elements (all formats)
-        "cxnsp": ("id", "file"),  # Connection shape IDs
-        "sp": ("id", "file"),  # Shape IDs
-        "pic": ("id", "file"),  # Picture IDs
-        "grpsp": ("id", "file"),  # Group shape IDs
+        "comment": ("id", "file"),
+        "commentrangestart": ("id", "file"),
+        "commentrangeend": ("id", "file"),
+        "bookmarkstart": ("id", "file"),
+        "bookmarkend": ("id", "file"),
+        "sldid": ("id", "file"),
+        "sldmasterid": ("id", "global"),
+        "sldlayoutid": ("id", "global"),
+        "cm": ("authorid", "file"),
+        "sheet": ("sheetid", "file"),
+        "definedname": ("id", "file"),
+        "cxnsp": ("id", "file"),
+        "sp": ("id", "file"),
+        "pic": ("id", "file"),
+        "grpsp": ("id", "file"),
     }
-    # Container elements where ID uniqueness checks should be skipped
-    # These hold references that intentionally duplicate IDs of elements they reference
-    # Example: <p14:sldId id="301"> in sectionLst references <p:sldId id="301"> in sldIdLst
     EXCLUDED_ID_CONTAINERS = {
-        "sectionlst",  # PowerPoint sections - sldId elements reference slides by ID
+        "sectionlst",
     }
-    # Mapping of element names to expected relationship types
-    # Subclasses should override this with format-specific mappings
     ELEMENT_RELATIONSHIP_TYPES = {}
-    # Unified schema mappings for all Office document types
     SCHEMA_MAPPINGS = {
-        # Document type specific schemas
-        "word": "ISO-IEC29500-4_2016/wml.xsd",  # Word documents
-        "ppt": "ISO-IEC29500-4_2016/pml.xsd",  # PowerPoint presentations
-        "xl": "ISO-IEC29500-4_2016/sml.xsd",  # Excel spreadsheets
-        # Common file types
+        "word": "ISO-IEC29500-4_2016/wml.xsd",
+        "ppt": "ISO-IEC29500-4_2016/pml.xsd",
+        "xl": "ISO-IEC29500-4_2016/sml.xsd",
         "[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
         "app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
         "core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
         "custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
         ".rels": "ecma/fouth-edition/opc-relationships.xsd",
-        # Word-specific files
         "people.xml": "microsoft/wml-2012.xsd",
         "commentsIds.xml": "microsoft/wml-cid-2016.xsd",
         "commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
         "commentsExtended.xml": "microsoft/wml-2012.xsd",
-        # Chart files (common across document types)
         "chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
-        # Theme files (common across document types)
         "theme": "ISO-IEC29500-4_2016/dml-main.xsd",
-        # Drawing and media files
         "drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
     }
-    # Unified namespace constants
     MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
     XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
-    # Common OOXML namespaces used across validators
     PACKAGE_RELATIONSHIPS_NAMESPACE = (
         "http://schemas.openxmlformats.org/package/2006/relationships"
     )
@@ -99,10 +71,8 @@ class BaseSchemaValidator:
         "http://schemas.openxmlformats.org/package/2006/content-types"
     )
-    # Folders where we should clean ignorable namespaces
     MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
-    # All allowed OOXML namespaces (superset of all document types)
     OOXML_NAMESPACES = {
         "http://schemas.openxmlformats.org/officeDocument/2006/math",
         "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
@@ -121,15 +91,13 @@ class BaseSchemaValidator:
         "http://www.w3.org/XML/1998/namespace",
     }
-    def __init__(self, unpacked_dir, original_file, verbose=False):
+    def __init__(self, unpacked_dir, original_file=None, verbose=False):
         self.unpacked_dir = Path(unpacked_dir).resolve()
-        self.original_file = Path(original_file)
+        self.original_file = Path(original_file) if original_file else None
         self.verbose = verbose
-        # Set schemas directory
         self.schemas_dir = Path(__file__).parent.parent / "schemas"
-        # Get all XML and .rels files
         patterns = ["*.xml", "*.rels"]
         self.xml_files = [
             f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
@@ -139,15 +107,12 @@ class BaseSchemaValidator:
             print(f"Warning: No XML files found in {self.unpacked_dir}")
     def validate(self):
-        """Run all validation checks and return True if all pass."""
         raise NotImplementedError("Subclasses must implement the validate method")
     def repair(self) -> int:
-        """Run auto-repairs. Returns count of repairs made. Subclasses should override and call super()."""
         return self.repair_whitespace_preservation()
     def repair_whitespace_preservation(self) -> int:
-        """Add xml:space='preserve' to w:t/a:t elements with leading/trailing whitespace."""
         repairs = 0
         for xml_file in self.xml_files:
@@ -176,12 +141,10 @@ class BaseSchemaValidator:
         return repairs
     def validate_xml(self):
-        """Validate that all XML files are well-formed."""
         errors = []
         for xml_file in self.xml_files:
             try:
-                # Try to parse the XML file
                 lxml.etree.parse(str(xml_file))
             except lxml.etree.XMLSyntaxError as e:
                 errors.append(
@@ -205,13 +168,12 @@ class BaseSchemaValidator:
             return True
     def validate_namespaces(self):
-        """Validate that namespace prefixes in Ignorable attributes are declared."""
         errors = []
         for xml_file in self.xml_files:
             try:
                 root = lxml.etree.parse(str(xml_file)).getroot()
-                declared = set(root.nsmap.keys()) - {None}  # Exclude default namespace
+                declared = set(root.nsmap.keys()) - {None}
                 for attr_val in [
                     v for k, v in root.attrib.items() if k.endswith("Ignorable")
@@ -235,35 +197,28 @@ class BaseSchemaValidator:
         return True
     def validate_unique_ids(self):
-        """Validate that specific IDs are unique according to OOXML requirements."""
         errors = []
-        global_ids = {}  # Track globally unique IDs across all files
+        global_ids = {}
         for xml_file in self.xml_files:
             try:
                 root = lxml.etree.parse(str(xml_file)).getroot()
-                file_ids = {}  # Track IDs that must be unique within this file
+                file_ids = {}
-                # Remove all mc:AlternateContent elements from the tree
                 mc_elements = root.xpath(
                     ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
                 )
                 for elem in mc_elements:
                     elem.getparent().remove(elem)
-                # Now check IDs in the cleaned tree
                 for elem in root.iter():
-                    # Get the element name without namespace
                     tag = (
                         elem.tag.split("}")[-1].lower()
                         if "}" in elem.tag
                         else elem.tag.lower()
                     )
-                    # Check if this element type has ID uniqueness requirements
                     if tag in self.UNIQUE_ID_REQUIREMENTS:
-                        # Skip if element is inside an excluded container
-                        # (e.g., <p14:sldId> inside <p14:sectionLst> is a reference, not a definition)
                         in_excluded_container = any(
                             ancestor.tag.split("}")[-1].lower() in self.EXCLUDED_ID_CONTAINERS
                             for ancestor in elem.iterancestors()
@@ -273,7 +228,6 @@ class BaseSchemaValidator:
                         attr_name, scope = self.UNIQUE_ID_REQUIREMENTS[tag]
-                        # Look for the specified attribute
                         id_value = None
                         for attr, value in elem.attrib.items():
                             attr_local = (
@@ -287,7 +241,6 @@ class BaseSchemaValidator:
                         if id_value is not None:
                             if scope == "global":
-                                # Check global uniqueness
                                 if id_value in global_ids:
                                     prev_file, prev_line, prev_tag = global_ids[
                                         id_value
@@ -304,7 +257,6 @@ class BaseSchemaValidator:
                                         tag,
                                     )
                             elif scope == "file":
-                                # Check file-level uniqueness
                                 key = (tag, attr_name)
                                 if key not in file_ids:
                                     file_ids[key] = {}
@@ -335,12 +287,8 @@ class BaseSchemaValidator:
             return True
     def validate_file_references(self):
-        """
-        Validate that all .rels files properly reference files and that all files are referenced.
-        """
         errors = []
-        # Find all .rels files
         rels_files = list(self.unpacked_dir.rglob("*.rels"))
         if not rels_files:
@@ -348,17 +296,15 @@ class BaseSchemaValidator:
                 print("PASSED - No .rels files found")
             return True
-        # Get all files in the unpacked directory (excluding reference files)
         all_files = []
         for file_path in self.unpacked_dir.rglob("*"):
             if (
                 file_path.is_file()
                 and file_path.name != "[Content_Types].xml"
                 and not file_path.name.endswith(".rels")
-            ):  # This file is not referenced by .rels
+            ):
                 all_files.append(file_path.resolve())
-        # Track all files that are referenced by any .rels file
         all_referenced_files = set()
         if self.verbose:
@@ -366,16 +312,12 @@ class BaseSchemaValidator:
                 f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
             )
-        # Check each .rels file
         for rels_file in rels_files:
             try:
-                # Parse relationships file
                 rels_root = lxml.etree.parse(str(rels_file)).getroot()
-                # Get the directory where this .rels file is located
                 rels_dir = rels_file.parent
-                # Find all relationships and their targets
                 referenced_files = set()
                 broken_refs = []
@@ -386,24 +328,15 @@ class BaseSchemaValidator:
                     target = rel.get("Target")
                     if target and not target.startswith(
                         ("http", "mailto:")
-                    ):  # Skip external URLs
-                        # Resolve the target path
-                        # Absolute paths (starting with /) are relative to package root
-                        # Relative paths are relative to the .rels file's parent directory
+                    ):
                         if target.startswith("/"):
-                            # Absolute path - resolve from unpacked_dir root
-                            # Strip leading / to avoid pathlib replacing the base
                             target_path = self.unpacked_dir / target.lstrip("/")
                         elif rels_file.name == ".rels":
-                            # Root .rels file - relative targets are relative to unpacked_dir
                             target_path = self.unpacked_dir / target
                         else:
-                            # Other .rels files - relative targets are relative to their parent's parent
-                            # e.g., word/_rels/document.xml.rels -> targets relative to word/
                             base_dir = rels_dir.parent
                             target_path = base_dir / target
-                        # Normalize the path and check if it exists
                         try:
                             target_path = target_path.resolve()
                             if target_path.exists() and target_path.is_file():
@@ -414,7 +347,6 @@ class BaseSchemaValidator:
                         except (OSError, ValueError):
                             broken_refs.append((target, rel.sourceline))
-                # Report broken references
                 if broken_refs:
                     rel_path = rels_file.relative_to(self.unpacked_dir)
                     for broken_ref, line_num in broken_refs:
@@ -426,7 +358,6 @@ class BaseSchemaValidator:
                 rel_path = rels_file.relative_to(self.unpacked_dir)
                 errors.append(f"  Error parsing {rel_path}: {e}")
-        # Check for unreferenced files (files that exist but are not referenced anywhere)
         unreferenced_files = set(all_files) - all_referenced_files
         if unreferenced_files:
@@ -452,31 +383,21 @@ class BaseSchemaValidator:
             return True
     def validate_all_relationship_ids(self):
-        """
-        Validate that all r:id attributes in XML files reference existing IDs
-        in their corresponding .rels files, and optionally validate relationship types.
-        """
         import lxml.etree
         errors = []
-        # Process each XML file that might contain r:id references
         for xml_file in self.xml_files:
-            # Skip .rels files themselves
             if xml_file.suffix == ".rels":
                 continue
-            # Determine the corresponding .rels file
-            # For dir/file.xml, it's dir/_rels/file.xml.rels
             rels_dir = xml_file.parent / "_rels"
             rels_file = rels_dir / f"{xml_file.name}.rels"
-            # Skip if there's no corresponding .rels file (that's okay)
             if not rels_file.exists():
                 continue
             try:
-                # Parse the .rels file to get valid relationship IDs and their types
                 rels_root = lxml.etree.parse(str(rels_file)).getroot()
                 rid_to_type = {}
@@ -486,47 +407,43 @@ class BaseSchemaValidator:
                     rid = rel.get("Id")
                     rel_type = rel.get("Type", "")
                     if rid:
-                        # Check for duplicate rIds
                         if rid in rid_to_type:
                             rels_rel_path = rels_file.relative_to(self.unpacked_dir)
                             errors.append(
                                 f"  {rels_rel_path}: Line {rel.sourceline}: "
                                 f"Duplicate relationship ID '{rid}' (IDs must be unique)"
                             )
-                        # Extract just the type name from the full URL
                         type_name = (
                             rel_type.split("/")[-1] if "/" in rel_type else rel_type
                         )
                         rid_to_type[rid] = type_name
-                # Parse the XML file to find all r:id references
                 xml_root = lxml.etree.parse(str(xml_file)).getroot()
-                # Find all elements with r:id attributes
+                r_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE
+                rid_attrs_to_check = ["id", "embed", "link"]
                 for elem in xml_root.iter():
-                    # Check for r:id attribute (relationship ID)
-                    rid_attr = elem.get(f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id")
-                    if rid_attr:
+                    for attr_name in rid_attrs_to_check:
+                        rid_attr = elem.get(f"{{{r_ns}}}{attr_name}")
+                        if not rid_attr:
+                            continue
                         xml_rel_path = xml_file.relative_to(self.unpacked_dir)
                         elem_name = (
                             elem.tag.split("}")[-1] if "}" in elem.tag else elem.tag
                         )
-                        # Check if the ID exists
                         if rid_attr not in rid_to_type:
                             errors.append(
                                 f"  {xml_rel_path}: Line {elem.sourceline}: "
-                                f"<{elem_name}> references non-existent relationship '{rid_attr}' "
+                                f"<{elem_name}> r:{attr_name} references non-existent relationship '{rid_attr}' "
                                 f"(valid IDs: {', '.join(sorted(rid_to_type.keys())[:5])}{'...' if len(rid_to_type) > 5 else ''})"
                             )
-                        # Check if we have type expectations for this element
-                        elif self.ELEMENT_RELATIONSHIP_TYPES:
+                        elif attr_name == "id" and self.ELEMENT_RELATIONSHIP_TYPES:
                             expected_type = self._get_expected_relationship_type(
                                 elem_name
                             )
                             if expected_type:
                                 actual_type = rid_to_type[rid_attr]
-                                # Check if the actual type matches or contains the expected type
                                 if expected_type not in actual_type.lower():
                                     errors.append(
                                         f"  {xml_rel_path}: Line {elem.sourceline}: "
@@ -550,58 +467,41 @@ class BaseSchemaValidator:
             return True
     def _get_expected_relationship_type(self, element_name):
-        """
-        Get the expected relationship type for an element.
-        First checks the explicit mapping, then tries pattern detection.
-        """
-        # Normalize element name to lowercase
         elem_lower = element_name.lower()
-        # Check explicit mapping first
         if elem_lower in self.ELEMENT_RELATIONSHIP_TYPES:
             return self.ELEMENT_RELATIONSHIP_TYPES[elem_lower]
-        # Try pattern detection for common patterns
-        # Pattern 1: Elements ending in "Id" often expect a relationship of the prefix type
         if elem_lower.endswith("id") and len(elem_lower) > 2:
-            # e.g., "sldId" -> "sld", "sldMasterId" -> "sldMaster"
-            prefix = elem_lower[:-2]  # Remove "id"
-            # Check if this might be a compound like "sldMasterId"
+            prefix = elem_lower[:-2]
             if prefix.endswith("master"):
                 return prefix.lower()
             elif prefix.endswith("layout"):
                 return prefix.lower()
             else:
-                # Simple case like "sldId" -> "slide"
-                # Common transformations
                 if prefix == "sld":
                     return "slide"
                 return prefix.lower()
-        # Pattern 2: Elements ending in "Reference" expect a relationship of the prefix type
         if elem_lower.endswith("reference") and len(elem_lower) > 9:
-            prefix = elem_lower[:-9]  # Remove "reference"
+            prefix = elem_lower[:-9]
             return prefix.lower()
         return None
     def validate_content_types(self):
-        """Validate that all content files are properly declared in [Content_Types].xml."""
         errors = []
-        # Find [Content_Types].xml file
         content_types_file = self.unpacked_dir / "[Content_Types].xml"
         if not content_types_file.exists():
             print("FAILED - [Content_Types].xml file not found")
             return False
         try:
-            # Parse and get all declared parts and extensions
             root = lxml.etree.parse(str(content_types_file)).getroot()
             declared_parts = set()
             declared_extensions = set()
-            # Get Override declarations (specific files)
             for override in root.findall(
                 f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
             ):
@@ -609,7 +509,6 @@ class BaseSchemaValidator:
                 if part_name is not None:
                     declared_parts.add(part_name.lstrip("/"))
-            # Get Default declarations (by extension)
             for default in root.findall(
                 f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
             ):
@@ -617,19 +516,17 @@ class BaseSchemaValidator:
                 if extension is not None:
                     declared_extensions.add(extension.lower())
-            # Root elements that require content type declaration
             declarable_roots = {
                 "sld",
                 "sldLayout",
                 "sldMaster",
-                "presentation",  # PowerPoint
-                "document",  # Word
+                "presentation",
+                "document",
                 "workbook",
-                "worksheet",  # Excel
-                "theme",  # Common
+                "worksheet",
+                "theme",
             }
-            # Common media file extensions that should be declared
             media_extensions = {
                 "png": "image/png",
                 "jpg": "image/jpeg",
@@ -641,17 +538,14 @@ class BaseSchemaValidator:
                 "emf": "image/x-emf",
             }
-            # Get all files in the unpacked directory
             all_files = list(self.unpacked_dir.rglob("*"))
             all_files = [f for f in all_files if f.is_file()]
-            # Check all XML files for Override declarations
             for xml_file in self.xml_files:
                 path_str = str(xml_file.relative_to(self.unpacked_dir)).replace(
                     "\\", "/"
                 )
-                # Skip non-content files
                 if any(
                     skip in path_str
                     for skip in [".rels", "[Content_Types]", "docProps/", "_rels/"]
@@ -668,11 +562,9 @@ class BaseSchemaValidator:
                         )
                 except Exception:
-                    continue  # Skip unparseable files
+                    continue
-            # Check all non-XML files for Default extension declarations
             for file_path in all_files:
-                # Skip XML files and metadata files (already checked above)
                 if file_path.suffix.lower() in {".xml", ".rels"}:
                     continue
                 if file_path.name == "[Content_Types].xml":
@@ -682,7 +574,6 @@ class BaseSchemaValidator:
                 extension = file_path.suffix.lstrip(".").lower()
                 if extension and extension not in declared_extensions:
-                    # Check if it's a known media extension that should be declared
                     if extension in media_extensions:
                         relative_path = file_path.relative_to(self.unpacked_dir)
                         errors.append(
@@ -705,37 +596,23 @@ class BaseSchemaValidator:
             return True
     def validate_file_against_xsd(self, xml_file, verbose=False):
-        """Validate a single XML file against XSD schema, comparing with original.
-        Args:
-            xml_file: Path to XML file to validate
-            verbose: Enable verbose output
-        Returns:
-            tuple: (is_valid, new_errors_set) where is_valid is True/False/None (skipped)
-        """
-        # Resolve both paths to handle symlinks
         xml_file = Path(xml_file).resolve()
         unpacked_dir = self.unpacked_dir.resolve()
-        # Validate current file
         is_valid, current_errors = self._validate_single_file_xsd(
             xml_file, unpacked_dir
         )
         if is_valid is None:
-            return None, set()  # Skipped
+            return None, set()
         elif is_valid:
-            return True, set()  # Valid, no errors
+            return True, set()
-        # Get errors from original file for this specific file
         original_errors = self._get_original_file_errors(xml_file)
-        # Compare with original (both are guaranteed to be sets here)
         assert current_errors is not None
         new_errors = current_errors - original_errors
-        # Filter out known harmless errors (e.g., LibreOffice element ordering issues)
         new_errors = {
             e for e in new_errors
             if not any(pattern in e for pattern in self.IGNORED_VALIDATION_ERRORS)
@@ -750,7 +627,6 @@ class BaseSchemaValidator:
                     print(f"  - {truncated}")
             return False, new_errors
         else:
-            # All errors existed in original
             if verbose:
                 print(
                     f"PASSED - No new errors (original had {len(current_errors)} errors)"
@@ -758,7 +634,6 @@ class BaseSchemaValidator:
             return True, set()
     def validate_against_xsd(self):
-        """Validate XML files against XSD schemas, showing only new errors compared to original."""
         new_errors = []
         original_error_count = 0
         valid_count = 0
@@ -777,19 +652,16 @@ class BaseSchemaValidator:
                 valid_count += 1
                 continue
             elif is_valid:
-                # Had errors but all existed in original
                 original_error_count += 1
                 valid_count += 1
                 continue
-            # Has new errors
             new_errors.append(f"  {relative_path}: {len(new_file_errors)} new error(s)")
-            for error in list(new_file_errors)[:3]:  # Show first 3 errors
+            for error in list(new_file_errors)[:3]:
                 new_errors.append(
                     f"    - {error[:250]}..." if len(error) > 250 else f"    - {error}"
                 )
-        # Print summary
         if self.verbose:
             print(f"Validated {len(self.xml_files)} files:")
             print(f"  - Valid: {valid_count}")
@@ -811,62 +683,47 @@ class BaseSchemaValidator:
             return True
     def _get_schema_path(self, xml_file):
-        """Determine the appropriate schema path for an XML file."""
-        # Check exact filename match
         if xml_file.name in self.SCHEMA_MAPPINGS:
             return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.name]
-        # Check .rels files
         if xml_file.suffix == ".rels":
             return self.schemas_dir / self.SCHEMA_MAPPINGS[".rels"]
-        # Check chart files
         if "charts/" in str(xml_file) and xml_file.name.startswith("chart"):
             return self.schemas_dir / self.SCHEMA_MAPPINGS["chart"]
-        # Check theme files
         if "theme/" in str(xml_file) and xml_file.name.startswith("theme"):
             return self.schemas_dir / self.SCHEMA_MAPPINGS["theme"]
-        # Check if file is in a main content folder and use appropriate schema
         if xml_file.parent.name in self.MAIN_CONTENT_FOLDERS:
             return self.schemas_dir / self.SCHEMA_MAPPINGS[xml_file.parent.name]
         return None
     def _clean_ignorable_namespaces(self, xml_doc):
-        """Remove attributes and elements not in allowed namespaces."""
-        # Create a clean copy
         xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
         xml_copy = lxml.etree.fromstring(xml_string)
-        # Remove attributes not in allowed namespaces
         for elem in xml_copy.iter():
             attrs_to_remove = []
             for attr in elem.attrib:
-                # Check if attribute is from a namespace other than allowed ones
                 if "{" in attr:
                     ns = attr.split("}")[0][1:]
                     if ns not in self.OOXML_NAMESPACES:
                         attrs_to_remove.append(attr)
-            # Remove collected attributes
             for attr in attrs_to_remove:
                 del elem.attrib[attr]
-        # Remove elements not in allowed namespaces
         self._remove_ignorable_elements(xml_copy)
         return lxml.etree.ElementTree(xml_copy)
     def _remove_ignorable_elements(self, root):
-        """Recursively remove all elements not in allowed namespaces."""
         elements_to_remove = []
-        # Find elements to remove
         for elem in list(root):
-            # Skip non-element nodes (comments, processing instructions, etc.)
             if not hasattr(elem, "tag") or callable(elem.tag):
                 continue
@@ -877,32 +734,25 @@ class BaseSchemaValidator:
                     elements_to_remove.append(elem)
                     continue
-            # Recursively clean child elements
             self._remove_ignorable_elements(elem)
-        # Remove collected elements
         for elem in elements_to_remove:
             root.remove(elem)
     def _preprocess_for_mc_ignorable(self, xml_doc):
-        """Preprocess XML to handle mc:Ignorable attribute properly."""
-        # Remove mc:Ignorable attributes before validation
         root = xml_doc.getroot()
-        # Remove mc:Ignorable attribute from root
         if f"{{{self.MC_NAMESPACE}}}Ignorable" in root.attrib:
             del root.attrib[f"{{{self.MC_NAMESPACE}}}Ignorable"]
         return xml_doc
     def _validate_single_file_xsd(self, xml_file, base_path):
-        """Validate a single XML file against XSD schema. Returns (is_valid, errors_set)."""
         schema_path = self._get_schema_path(xml_file)
         if not schema_path:
-            return None, None  # Skip file
+            return None, None
         try:
-            # Load schema
             with open(schema_path, "rb") as xsd_file:
                 parser = lxml.etree.XMLParser()
                 xsd_doc = lxml.etree.parse(
@@ -910,14 +760,12 @@ class BaseSchemaValidator:
                 )
                 schema = lxml.etree.XMLSchema(xsd_doc)
-            # Load and preprocess XML
             with open(xml_file, "r") as f:
                 xml_doc = lxml.etree.parse(f)
             xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
             xml_doc = self._preprocess_for_mc_ignorable(xml_doc)
-            # Clean ignorable namespaces if needed
             relative_path = xml_file.relative_to(base_path)
             if (
                 relative_path.parts
@@ -925,13 +773,11 @@ class BaseSchemaValidator:
             ):
                 xml_doc = self._clean_ignorable_namespaces(xml_doc)
-            # Validate
             if schema.validate(xml_doc):
                 return True, set()
             else:
                 errors = set()
                 for error in schema.error_log:
-                    # Store normalized error message (without line numbers for comparison)
                     errors.add(error.message)
                 return False, errors
@@ -939,18 +785,12 @@ class BaseSchemaValidator:
             return False, {str(e)}
     def _get_original_file_errors(self, xml_file):
-        """Get XSD validation errors from a single file in the original document.
+        if self.original_file is None:
+            return set()
-        Args:
-            xml_file: Path to the XML file in unpacked_dir to check
-        Returns:
-            set: Set of error messages from the original file
-        """
         import tempfile
         import zipfile
-        # Resolve both paths to handle symlinks (e.g., /var vs /private/var on macOS)
         xml_file = Path(xml_file).resolve()
         unpacked_dir = self.unpacked_dir.resolve()
         relative_path = xml_file.relative_to(unpacked_dir)
@@ -958,37 +798,23 @@ class BaseSchemaValidator:
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_path = Path(temp_dir)
-            # Extract original file
             with zipfile.ZipFile(self.original_file, "r") as zip_ref:
                 zip_ref.extractall(temp_path)
-            # Find corresponding file in original
             original_xml_file = temp_path / relative_path
             if not original_xml_file.exists():
-                # File didn't exist in original, so no original errors
                 return set()
-            # Validate the specific file in original
             is_valid, errors = self._validate_single_file_xsd(
                 original_xml_file, temp_path
             )
             return errors if errors else set()
     def _remove_template_tags_from_text_nodes(self, xml_doc):
-        """Remove template tags from XML text nodes and collect warnings.
-        Template tags follow the pattern {{ ... }} and are used as placeholders
-        for content replacement. They should be removed from text content before
-        XSD validation while preserving XML structure.
-        Returns:
-            tuple: (cleaned_xml_doc, warnings_list)
-        """
         warnings = []
         template_pattern = re.compile(r"\{\{[^}]*\}\}")
-        # Create a copy of the document to avoid modifying the original
         xml_string = lxml.etree.tostring(xml_doc, encoding="unicode")
         xml_copy = lxml.etree.fromstring(xml_string)
@@ -1004,9 +830,7 @@ class BaseSchemaValidator:
                 return template_pattern.sub("", text)
             return text
-        # Process all text nodes in the document
         for elem in xml_copy.iter():
-            # Skip processing if this is a w:t element
             if not hasattr(elem, "tag") or callable(elem.tag):
                 continue
             tag_str = str(elem.tag)