npm - @heylemon/lemonade - Versions diffs - 0.0.4 → 0.0.6 - Mend

@heylemon/lemonade 0.0.4 → 0.0.6

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/skills/xlsx/scripts/office/validators/docx.py CHANGED Viewed

@@ -14,100 +14,76 @@ from .base import BaseSchemaValidator
 class DOCXSchemaValidator(BaseSchemaValidator):
-    """Validator for Word document XML files against XSD schemas."""
-    # Word-specific namespaces
     WORD_2006_NAMESPACE = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
     W14_NAMESPACE = "http://schemas.microsoft.com/office/word/2010/wordml"
     W16CID_NAMESPACE = "http://schemas.microsoft.com/office/word/2016/wordml/cid"
-    # Word-specific element to relationship type mappings
-    # Start with empty mapping - add specific cases as we discover them
     ELEMENT_RELATIONSHIP_TYPES = {}
     def validate(self):
-        """Run all validation checks and return True if all pass."""
-        # Test 0: XML well-formedness
         if not self.validate_xml():
             return False
-        # Test 1: Namespace declarations
         all_valid = True
         if not self.validate_namespaces():
             all_valid = False
-        # Test 2: Unique IDs
         if not self.validate_unique_ids():
             all_valid = False
-        # Test 3: Relationship and file reference validation
         if not self.validate_file_references():
             all_valid = False
-        # Test 4: Content type declarations
         if not self.validate_content_types():
             all_valid = False
-        # Test 5: XSD schema validation
         if not self.validate_against_xsd():
             all_valid = False
-        # Test 6: Whitespace preservation
         if not self.validate_whitespace_preservation():
             all_valid = False
-        # Test 7: Deletion validation
         if not self.validate_deletions():
             all_valid = False
-        # Test 8: Insertion validation
         if not self.validate_insertions():
             all_valid = False
-        # Test 9: Relationship ID reference validation
         if not self.validate_all_relationship_ids():
             all_valid = False
-        # Test 10: ID constraints (paraId, durableId)
         if not self.validate_id_constraints():
             all_valid = False
-        # Test 11: Comment marker validation
         if not self.validate_comment_markers():
             all_valid = False
-        # Count and compare paragraphs
         self.compare_paragraph_counts()
         return all_valid
     def validate_whitespace_preservation(self):
-        """
-        Validate that w:t elements with whitespace have xml:space='preserve'.
-        """
         errors = []
         for xml_file in self.xml_files:
-            # Only check document.xml files
             if xml_file.name != "document.xml":
                 continue
             try:
                 root = lxml.etree.parse(str(xml_file)).getroot()
-                # Find all w:t elements
                 for elem in root.iter(f"{{{self.WORD_2006_NAMESPACE}}}t"):
                     if elem.text:
                         text = elem.text
-                        # Check if text starts or ends with whitespace
-                        if re.match(r"^\s.*", text) or re.match(r".*\s$", text):
-                            # Check if xml:space="preserve" attribute exists
+                        if re.search(r"^[ \t\n\r]", text) or re.search(
+                            r"[ \t\n\r]$", text
+                        ):
                             xml_space_attr = f"{{{self.XML_NAMESPACE}}}space"
                             if (
                                 xml_space_attr not in elem.attrib
                                 or elem.attrib[xml_space_attr] != "preserve"
                             ):
-                                # Show a preview of the text
                                 text_preview = (
                                     repr(text)[:50] + "..."
                                     if len(repr(text)) > 50
@@ -134,15 +110,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
             return True
     def validate_deletions(self):
-        """
-        Validate that w:t and w:instrText elements are not within w:del elements.
-        Inside w:del, use w:delText and w:delInstrText instead.
-        XSD validation does not catch this, so we do it manually.
-        """
         errors = []
         for xml_file in self.xml_files:
-            # Only check document.xml files
             if xml_file.name != "document.xml":
                 continue
@@ -150,10 +120,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                 root = lxml.etree.parse(str(xml_file)).getroot()
                 namespaces = {"w": self.WORD_2006_NAMESPACE}
-                # Find all w:t elements that are descendants of w:del elements
                 for t_elem in root.xpath(".//w:del//w:t", namespaces=namespaces):
                     if t_elem.text:
-                        # Show a preview of the text
                         text_preview = (
                             repr(t_elem.text)[:50] + "..."
                             if len(repr(t_elem.text)) > 50
@@ -164,9 +132,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                             f"Line {t_elem.sourceline}: <w:t> found within <w:del>: {text_preview}"
                         )
-                # Find all w:instrText elements that are descendants of w:del elements
-                # These should be w:delInstrText instead
-                for instr_elem in root.xpath(".//w:del//w:instrText", namespaces=namespaces):
+                for instr_elem in root.xpath(
+                    ".//w:del//w:instrText", namespaces=namespaces
+                ):
                     text_preview = (
                         repr(instr_elem.text or "")[:50] + "..."
                         if len(repr(instr_elem.text or "")) > 50
@@ -193,17 +161,14 @@ class DOCXSchemaValidator(BaseSchemaValidator):
             return True
     def count_paragraphs_in_unpacked(self):
-        """Count the number of paragraphs in the unpacked document."""
         count = 0
         for xml_file in self.xml_files:
-            # Only check document.xml files
             if xml_file.name != "document.xml":
                 continue
             try:
                 root = lxml.etree.parse(str(xml_file)).getroot()
-                # Count all w:p elements
                 paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                 count = len(paragraphs)
             except Exception as e:
@@ -212,21 +177,20 @@ class DOCXSchemaValidator(BaseSchemaValidator):
         return count
     def count_paragraphs_in_original(self):
-        """Count the number of paragraphs in the original docx file."""
+        original = self.original_file
+        if original is None:
+            return 0
         count = 0
         try:
-            # Create temporary directory to unpack original
             with tempfile.TemporaryDirectory() as temp_dir:
-                # Unpack original docx
-                with zipfile.ZipFile(self.original_file, "r") as zip_ref:
+                with zipfile.ZipFile(original, "r") as zip_ref:
                     zip_ref.extractall(temp_dir)
-                # Parse document.xml
                 doc_xml_path = temp_dir + "/word/document.xml"
                 root = lxml.etree.parse(doc_xml_path).getroot()
-                # Count all w:p elements
                 paragraphs = root.findall(f".//{{{self.WORD_2006_NAMESPACE}}}p")
                 count = len(paragraphs)
@@ -236,10 +200,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
         return count
     def validate_insertions(self):
-        """
-        Validate that w:delText elements are not within w:ins elements.
-        w:delText is only allowed in w:ins if nested within a w:del.
-        """
         errors = []
         for xml_file in self.xml_files:
@@ -250,7 +210,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                 root = lxml.etree.parse(str(xml_file)).getroot()
                 namespaces = {"w": self.WORD_2006_NAMESPACE}
-                # Find w:delText in w:ins that are NOT within w:del
                 invalid_elements = root.xpath(
                     ".//w:ins//w:delText[not(ancestor::w:del)]", namespaces=namespaces
                 )
@@ -282,7 +241,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
             return True
     def compare_paragraph_counts(self):
-        """Compare paragraph counts between original and new document."""
         original_count = self.count_paragraphs_in_original()
         new_count = self.count_paragraphs_in_unpacked()
@@ -291,24 +249,9 @@ class DOCXSchemaValidator(BaseSchemaValidator):
         print(f"\nParagraphs: {original_count} → {new_count} ({diff_str})")
     def _parse_id_value(self, val: str, base: int = 16) -> int:
-        """Parse an ID value as hex (base=16) or decimal (base=10).
-        Args:
-            val: The string value to parse
-            base: The numeric base (16 for hex, 10 for decimal)
-        Returns:
-            The parsed integer value
-        """
         return int(val, base)
     def validate_id_constraints(self):
-        """Validate paraId and durableId values per OOXML spec.
-        Checks:
-        - paraId < 0x80000000 (always hex)
-        - durableId < 0x7FFFFFFF (decimal in numbering.xml, hex elsewhere)
-        """
         errors = []
         para_id_attr = f"{{{self.W14_NAMESPACE}}}paraId"
         durable_id_attr = f"{{{self.W16CID_NAMESPACE}}}durableId"
@@ -316,7 +259,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
         for xml_file in self.xml_files:
             try:
                 for elem in lxml.etree.parse(str(xml_file)).iter():
-                    # paraId is always hex format
                     if val := elem.get(para_id_attr):
                         if self._parse_id_value(val, base=16) >= 0x80000000:
                             errors.append(
@@ -324,8 +266,6 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                             )
                     if val := elem.get(durable_id_attr):
-                        # durableId in numbering.xml must be decimal.
-                        # Word rejects hex-formatted durableIds in numbering.xml.
                         if xml_file.name == "numbering.xml":
                             try:
                                 if self._parse_id_value(val, base=10) >= 0x7FFFFFFF:
@@ -334,12 +274,10 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                                         f"durableId={val} >= 0x7FFFFFFF"
                                     )
                             except ValueError:
-                                # Contains non-decimal characters (e.g., hex letters A-F)
                                 errors.append(
                                     f"  {xml_file.name}:{elem.sourceline}: "
                                     f"durableId={val} must be decimal in numbering.xml"
                                 )
-                        # durableId in other files (e.g. commentsIds.xml) uses hex format
                         else:
                             if self._parse_id_value(val, base=16) >= 0x7FFFFFFF:
                                 errors.append(
@@ -358,16 +296,8 @@ class DOCXSchemaValidator(BaseSchemaValidator):
         return not errors
     def validate_comment_markers(self):
-        """Validate comment markers are properly paired and reference existing comments.
-        Checks:
-        - Every commentRangeStart has a matching commentRangeEnd
-        - Every commentRangeEnd has a matching commentRangeStart
-        - Every marker in document.xml references an existing comment
-        """
         errors = []
-        # Find document.xml and comments.xml
         document_xml = None
         comments_xml = None
         for xml_file in self.xml_files:
@@ -385,50 +315,59 @@ class DOCXSchemaValidator(BaseSchemaValidator):
             doc_root = lxml.etree.parse(str(document_xml)).getroot()
             namespaces = {"w": self.WORD_2006_NAMESPACE}
-            # Collect all comment marker IDs from document.xml
             range_starts = {
                 elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
-                for elem in doc_root.xpath(".//w:commentRangeStart", namespaces=namespaces)
+                for elem in doc_root.xpath(
+                    ".//w:commentRangeStart", namespaces=namespaces
+                )
             }
             range_ends = {
                 elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
-                for elem in doc_root.xpath(".//w:commentRangeEnd", namespaces=namespaces)
+                for elem in doc_root.xpath(
+                    ".//w:commentRangeEnd", namespaces=namespaces
+                )
             }
             references = {
                 elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
-                for elem in doc_root.xpath(".//w:commentReference", namespaces=namespaces)
+                for elem in doc_root.xpath(
+                    ".//w:commentReference", namespaces=namespaces
+                )
             }
-            # Check for orphaned commentRangeEnd (missing commentRangeStart)
             orphaned_ends = range_ends - range_starts
-            for comment_id in sorted(orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0):
+            for comment_id in sorted(
+                orphaned_ends, key=lambda x: int(x) if x and x.isdigit() else 0
+            ):
                 errors.append(
-                    f"  document.xml: commentRangeEnd id=\"{comment_id}\" has no matching commentRangeStart"
+                    f'  document.xml: commentRangeEnd id="{comment_id}" has no matching commentRangeStart'
                 )
-            # Check for orphaned commentRangeStart (missing commentRangeEnd)
             orphaned_starts = range_starts - range_ends
-            for comment_id in sorted(orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0):
+            for comment_id in sorted(
+                orphaned_starts, key=lambda x: int(x) if x and x.isdigit() else 0
+            ):
                 errors.append(
-                    f"  document.xml: commentRangeStart id=\"{comment_id}\" has no matching commentRangeEnd"
+                    f'  document.xml: commentRangeStart id="{comment_id}" has no matching commentRangeEnd'
                 )
-            # Get comment IDs from comments.xml if it exists
             comment_ids = set()
             if comments_xml and comments_xml.exists():
                 comments_root = lxml.etree.parse(str(comments_xml)).getroot()
                 comment_ids = {
                     elem.get(f"{{{self.WORD_2006_NAMESPACE}}}id")
-                    for elem in comments_root.xpath(".//w:comment", namespaces=namespaces)
+                    for elem in comments_root.xpath(
+                        ".//w:comment", namespaces=namespaces
+                    )
                 }
-                # Check for markers referencing non-existent comments
                 marker_ids = range_starts | range_ends | references
                 invalid_refs = marker_ids - comment_ids
-                for comment_id in sorted(invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0):
-                    if comment_id:  # Skip None values
+                for comment_id in sorted(
+                    invalid_refs, key=lambda x: int(x) if x and x.isdigit() else 0
+                ):
+                    if comment_id:
                         errors.append(
-                            f"  document.xml: marker id=\"{comment_id}\" references non-existent comment"
+                            f'  document.xml: marker id="{comment_id}" references non-existent comment'
                         )
         except (lxml.etree.XMLSyntaxError, Exception) as e:
@@ -445,22 +384,11 @@ class DOCXSchemaValidator(BaseSchemaValidator):
             return True
     def repair(self) -> int:
-        """Run DOCX-specific auto-repairs."""
         repairs = super().repair()
         repairs += self.repair_durableId()
         return repairs
     def repair_durableId(self) -> int:
-        """Fix invalid durableId values.
-        Repairs:
-        - durableId >= 0x7FFFFFFF (value out of range)
-        - durableId with hex letters in numbering.xml (wrong format)
-        Note: paraId is not auto-repaired because it may be referenced by
-        commentsExtended.xml, commentsIds.xml, and comment threading (paraIdParent).
-        Changing paraId without updating all references would break comment associations.
-        """
         repairs = 0
         for xml_file in self.xml_files:
@@ -476,28 +404,27 @@ class DOCXSchemaValidator(BaseSchemaValidator):
                     durable_id = elem.getAttribute("w16cid:durableId")
                     needs_repair = False
-                    # Check if durableId needs repair based on file type
                     if xml_file.name == "numbering.xml":
-                        # numbering.xml requires decimal format
                         try:
-                            needs_repair = self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
+                            needs_repair = (
+                                self._parse_id_value(durable_id, base=10) >= 0x7FFFFFFF
+                            )
                         except ValueError:
-                            # Contains non-decimal characters (e.g., hex letters A-F)
                             needs_repair = True
                     else:
-                        # Other files (e.g. commentsIds.xml) use hex format
                         try:
-                            needs_repair = self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
+                            needs_repair = (
+                                self._parse_id_value(durable_id, base=16) >= 0x7FFFFFFF
+                            )
                         except ValueError:
                             needs_repair = True
                     if needs_repair:
-                        # Generate new ID in the correct format for this file type
                         value = random.randint(1, 0x7FFFFFFE)
                         if xml_file.name == "numbering.xml":
-                            new_id = str(value)  # decimal for numbering.xml
+                            new_id = str(value)
                         else:
-                            new_id = f"{value:08X}"  # hex for other files
+                            new_id = f"{value:08X}"
                         elem.setAttribute("w16cid:durableId", new_id)
                         print(

package/skills/xlsx/scripts/office/validators/pptx.py CHANGED Viewed

@@ -8,14 +8,11 @@ from .base import BaseSchemaValidator
 class PPTXSchemaValidator(BaseSchemaValidator):
-    """Validator for PowerPoint presentation XML files against XSD schemas."""
-    # PowerPoint presentation namespace
     PRESENTATIONML_NAMESPACE = (
         "http://schemas.openxmlformats.org/presentationml/2006/main"
     )
-    # PowerPoint-specific element to relationship type mappings
     ELEMENT_RELATIONSHIP_TYPES = {
         "sldid": "slide",
         "sldmasterid": "slidemaster",
@@ -26,60 +23,46 @@ class PPTXSchemaValidator(BaseSchemaValidator):
     }
     def validate(self):
-        """Run all validation checks and return True if all pass."""
-        # Test 0: XML well-formedness
         if not self.validate_xml():
             return False
-        # Test 1: Namespace declarations
         all_valid = True
         if not self.validate_namespaces():
             all_valid = False
-        # Test 2: Unique IDs
         if not self.validate_unique_ids():
             all_valid = False
-        # Test 3: UUID ID validation
         if not self.validate_uuid_ids():
             all_valid = False
-        # Test 4: Relationship and file reference validation
         if not self.validate_file_references():
             all_valid = False
-        # Test 5: Slide layout ID validation
         if not self.validate_slide_layout_ids():
             all_valid = False
-        # Test 6: Content type declarations
         if not self.validate_content_types():
             all_valid = False
-        # Test 7: XSD schema validation
         if not self.validate_against_xsd():
             all_valid = False
-        # Test 8: Notes slide reference validation
         if not self.validate_notes_slide_references():
             all_valid = False
-        # Test 9: Relationship ID reference validation
         if not self.validate_all_relationship_ids():
             all_valid = False
-        # Test 10: Duplicate slide layout references validation
         if not self.validate_no_duplicate_slide_layouts():
             all_valid = False
         return all_valid
     def validate_uuid_ids(self):
-        """Validate that ID attributes that look like UUIDs contain only hex values."""
         import lxml.etree
         errors = []
-        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
         uuid_pattern = re.compile(
             r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
         )
@@ -88,15 +71,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
             try:
                 root = lxml.etree.parse(str(xml_file)).getroot()
-                # Check all elements for ID attributes
                 for elem in root.iter():
                     for attr, value in elem.attrib.items():
-                        # Check if this is an ID attribute
                         attr_name = attr.split("}")[-1].lower()
                         if attr_name == "id" or attr_name.endswith("id"):
-                            # Check if value looks like a UUID (has the right length and pattern structure)
                             if self._looks_like_uuid(value):
-                                # Validate that it contains only hex characters in the right positions
                                 if not uuid_pattern.match(value):
                                     errors.append(
                                         f"  {xml_file.relative_to(self.unpacked_dir)}: "
@@ -119,19 +98,14 @@ class PPTXSchemaValidator(BaseSchemaValidator):
             return True
     def _looks_like_uuid(self, value):
-        """Check if a value has the general structure of a UUID."""
-        # Remove common UUID delimiters
         clean_value = value.strip("{}()").replace("-", "")
-        # Check if it's 32 hex-like characters (could include invalid hex chars)
         return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
     def validate_slide_layout_ids(self):
-        """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
         import lxml.etree
         errors = []
-        # Find all slide master files
         slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
         if not slide_masters:
@@ -141,10 +115,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
         for slide_master in slide_masters:
             try:
-                # Parse the slide master file
                 root = lxml.etree.parse(str(slide_master)).getroot()
-                # Find the corresponding _rels file for this slide master
                 rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
                 if not rels_file.exists():
@@ -154,10 +126,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
                     )
                     continue
-                # Parse the relationships file
                 rels_root = lxml.etree.parse(str(rels_file)).getroot()
-                # Build a set of valid relationship IDs that point to slide layouts
                 valid_layout_rids = set()
                 for rel in rels_root.findall(
                     f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
@@ -166,7 +136,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
                     if "slideLayout" in rel_type:
                         valid_layout_rids.add(rel.get("Id"))
-                # Find all sldLayoutId elements in the slide master
                 for sld_layout_id in root.findall(
                     f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                 ):
@@ -201,7 +170,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
             return True
     def validate_no_duplicate_slide_layouts(self):
-        """Validate that each slide has exactly one slideLayout reference."""
         import lxml.etree
         errors = []
@@ -211,7 +179,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
             try:
                 root = lxml.etree.parse(str(rels_file)).getroot()
-                # Find all slideLayout relationships
                 layout_rels = [
                     rel
                     for rel in root.findall(
@@ -241,13 +208,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
             return True
     def validate_notes_slide_references(self):
-        """Validate that each notesSlide file is referenced by only one slide."""
         import lxml.etree
         errors = []
-        notes_slide_references = {}  # Track which slides reference each notesSlide
+        notes_slide_references = {}
-        # Find all slide relationship files
         slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
         if not slide_rels_files:
@@ -257,10 +222,8 @@ class PPTXSchemaValidator(BaseSchemaValidator):
         for rels_file in slide_rels_files:
             try:
-                # Parse the relationships file
                 root = lxml.etree.parse(str(rels_file)).getroot()
-                # Find all notesSlide relationships
                 for rel in root.findall(
                     f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                 ):
@@ -268,13 +231,11 @@ class PPTXSchemaValidator(BaseSchemaValidator):
                     if "notesSlide" in rel_type:
                         target = rel.get("Target", "")
                         if target:
-                            # Normalize the target path to handle relative paths
                             normalized_target = target.replace("../", "")
-                            # Track which slide references this notesSlide
                             slide_name = rels_file.stem.replace(
                                 ".xml", ""
-                            )  # e.g., "slide1"
+                            )
                             if normalized_target not in notes_slide_references:
                                 notes_slide_references[normalized_target] = []
@@ -287,7 +248,6 @@ class PPTXSchemaValidator(BaseSchemaValidator):
                     f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                 )
-        # Check for duplicate references
         for target, references in notes_slide_references.items():
             if len(references) > 1:
                 slide_names = [ref[0] for ref in references]