npm - @farazirfan/costar-server-executor - Versions diffs - 1.7.37 → 1.7.39 - Mend

@farazirfan/costar-server-executor 1.7.37 → 1.7.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (253) hide show

package/skills/pptx/scripts/ooxml/scripts/validation/pptx.py ADDED Viewed

@@ -0,0 +1,315 @@
+"""
+Validator for PowerPoint presentation XML files against XSD schemas.
+"""
+import re
+from .base import BaseSchemaValidator
+class PPTXSchemaValidator(BaseSchemaValidator):
+    """Validator for PowerPoint presentation XML files against XSD schemas."""
+    # PowerPoint presentation namespace
+    PRESENTATIONML_NAMESPACE = (
+        "http://schemas.openxmlformats.org/presentationml/2006/main"
+    )
+    # PowerPoint-specific element to relationship type mappings
+    ELEMENT_RELATIONSHIP_TYPES = {
+        "sldid": "slide",
+        "sldmasterid": "slidemaster",
+        "notesmasterid": "notesmaster",
+        "sldlayoutid": "slidelayout",
+        "themeid": "theme",
+        "tablestyleid": "tablestyles",
+    }
+    def validate(self):
+        """Run all validation checks and return True if all pass."""
+        # Test 0: XML well-formedness
+        if not self.validate_xml():
+            return False
+        # Test 1: Namespace declarations
+        all_valid = True
+        if not self.validate_namespaces():
+            all_valid = False
+        # Test 2: Unique IDs
+        if not self.validate_unique_ids():
+            all_valid = False
+        # Test 3: UUID ID validation
+        if not self.validate_uuid_ids():
+            all_valid = False
+        # Test 4: Relationship and file reference validation
+        if not self.validate_file_references():
+            all_valid = False
+        # Test 5: Slide layout ID validation
+        if not self.validate_slide_layout_ids():
+            all_valid = False
+        # Test 6: Content type declarations
+        if not self.validate_content_types():
+            all_valid = False
+        # Test 7: XSD schema validation
+        if not self.validate_against_xsd():
+            all_valid = False
+        # Test 8: Notes slide reference validation
+        if not self.validate_notes_slide_references():
+            all_valid = False
+        # Test 9: Relationship ID reference validation
+        if not self.validate_all_relationship_ids():
+            all_valid = False
+        # Test 10: Duplicate slide layout references validation
+        if not self.validate_no_duplicate_slide_layouts():
+            all_valid = False
+        return all_valid
+    def validate_uuid_ids(self):
+        """Validate that ID attributes that look like UUIDs contain only hex values."""
+        import lxml.etree
+        errors = []
+        # UUID pattern: 8-4-4-4-12 hex digits with optional braces/hyphens
+        uuid_pattern = re.compile(
+            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
+        )
+        for xml_file in self.xml_files:
+            try:
+                root = lxml.etree.parse(str(xml_file)).getroot()
+                # Check all elements for ID attributes
+                for elem in root.iter():
+                    for attr, value in elem.attrib.items():
+                        # Check if this is an ID attribute
+                        attr_name = attr.split("}")[-1].lower()
+                        if attr_name == "id" or attr_name.endswith("id"):
+                            # Check if value looks like a UUID (has the right length and pattern structure)
+                            if self._looks_like_uuid(value):
+                                # Validate that it contains only hex characters in the right positions
+                                if not uuid_pattern.match(value):
+                                    errors.append(
+                                        f"  {xml_file.relative_to(self.unpacked_dir)}: "
+                                        f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
+                                    )
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+        if errors:
+            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
+            for error in errors:
+                print(error)
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - All UUID-like IDs contain valid hex values")
+            return True
+    def _looks_like_uuid(self, value):
+        """Check if a value has the general structure of a UUID."""
+        # Remove common UUID delimiters
+        clean_value = value.strip("{}()").replace("-", "")
+        # Check if it's 32 hex-like characters (could include invalid hex chars)
+        return len(clean_value) == 32 and all(c.isalnum() for c in clean_value)
+    def validate_slide_layout_ids(self):
+        """Validate that sldLayoutId elements in slide masters reference valid slide layouts."""
+        import lxml.etree
+        errors = []
+        # Find all slide master files
+        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))
+        if not slide_masters:
+            if self.verbose:
+                print("PASSED - No slide masters found")
+            return True
+        for slide_master in slide_masters:
+            try:
+                # Parse the slide master file
+                root = lxml.etree.parse(str(slide_master)).getroot()
+                # Find the corresponding _rels file for this slide master
+                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"
+                if not rels_file.exists():
+                    errors.append(
+                        f"  {slide_master.relative_to(self.unpacked_dir)}: "
+                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
+                    )
+                    continue
+                # Parse the relationships file
+                rels_root = lxml.etree.parse(str(rels_file)).getroot()
+                # Build a set of valid relationship IDs that point to slide layouts
+                valid_layout_rids = set()
+                for rel in rels_root.findall(
+                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
+                ):
+                    rel_type = rel.get("Type", "")
+                    if "slideLayout" in rel_type:
+                        valid_layout_rids.add(rel.get("Id"))
+                # Find all sldLayoutId elements in the slide master
+                for sld_layout_id in root.findall(
+                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
+                ):
+                    r_id = sld_layout_id.get(
+                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
+                    )
+                    layout_id = sld_layout_id.get("id")
+                    if r_id and r_id not in valid_layout_rids:
+                        errors.append(
+                            f"  {slide_master.relative_to(self.unpacked_dir)}: "
+                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
+                            f"references r:id='{r_id}' which is not found in slide layout relationships"
+                        )
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+        if errors:
+            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
+            for error in errors:
+                print(error)
+            print(
+                "Remove invalid references or add missing slide layouts to the relationships file."
+            )
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - All slide layout IDs reference valid slide layouts")
+            return True
+    def validate_no_duplicate_slide_layouts(self):
+        """Validate that each slide has exactly one slideLayout reference."""
+        import lxml.etree
+        errors = []
+        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
+        for rels_file in slide_rels_files:
+            try:
+                root = lxml.etree.parse(str(rels_file)).getroot()
+                # Find all slideLayout relationships
+                layout_rels = [
+                    rel
+                    for rel in root.findall(
+                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
+                    )
+                    if "slideLayout" in rel.get("Type", "")
+                ]
+                if len(layout_rels) > 1:
+                    errors.append(
+                        f"  {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
+                    )
+            except Exception as e:
+                errors.append(
+                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+        if errors:
+            print("FAILED - Found slides with duplicate slideLayout references:")
+            for error in errors:
+                print(error)
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - All slides have exactly one slideLayout reference")
+            return True
+    def validate_notes_slide_references(self):
+        """Validate that each notesSlide file is referenced by only one slide."""
+        import lxml.etree
+        errors = []
+        notes_slide_references = {}  # Track which slides reference each notesSlide
+        # Find all slide relationship files
+        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))
+        if not slide_rels_files:
+            if self.verbose:
+                print("PASSED - No slide relationship files found")
+            return True
+        for rels_file in slide_rels_files:
+            try:
+                # Parse the relationships file
+                root = lxml.etree.parse(str(rels_file)).getroot()
+                # Find all notesSlide relationships
+                for rel in root.findall(
+                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
+                ):
+                    rel_type = rel.get("Type", "")
+                    if "notesSlide" in rel_type:
+                        target = rel.get("Target", "")
+                        if target:
+                            # Normalize the target path to handle relative paths
+                            normalized_target = target.replace("../", "")
+                            # Track which slide references this notesSlide
+                            slide_name = rels_file.stem.replace(
+                                ".xml", ""
+                            )  # e.g., "slide1"
+                            if normalized_target not in notes_slide_references:
+                                notes_slide_references[normalized_target] = []
+                            notes_slide_references[normalized_target].append(
+                                (slide_name, rels_file)
+                            )
+            except (lxml.etree.XMLSyntaxError, Exception) as e:
+                errors.append(
+                    f"  {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
+                )
+        # Check for duplicate references
+        for target, references in notes_slide_references.items():
+            if len(references) > 1:
+                slide_names = [ref[0] for ref in references]
+                errors.append(
+                    f"  Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
+                )
+                for slide_name, rels_file in references:
+                    errors.append(f"    - {rels_file.relative_to(self.unpacked_dir)}")
+        if errors:
+            print(
+                f"FAILED - Found {len([e for e in errors if not e.startswith('    ')])} notes slide reference validation errors:"
+            )
+            for error in errors:
+                print(error)
+            print("Each slide may optionally have its own slide file.")
+            return False
+        else:
+            if self.verbose:
+                print("PASSED - All notes slide references are unique")
+            return True
+# This module uses a package-relative import (from .base ...), so it is meant
+# to be imported as part of its package; running it as a script fails loudly.
+if __name__ == "__main__":
+    raise RuntimeError("This module should not be run directly.")

package/skills/pptx/scripts/ooxml/scripts/validation/redlining.py ADDED Viewed

@@ -0,0 +1,284 @@
+"""
+Validator for tracked changes in Word documents.
+"""
+import subprocess
+import tempfile
+import zipfile
+from pathlib import Path
+class RedliningValidator:
+    """Validator for tracked changes in Word documents."""
+    def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
+        self.unpacked_dir = Path(unpacked_dir)
+        self.original_docx = Path(original_docx)
+        self.verbose = verbose
+        self.author = author
+        self.namespaces = {
+            "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+        }
+    def repair(self) -> int:
+        """No auto-repairs for redlining validation. Returns 0."""
+        return 0
+    def validate(self):
+        """Main validation method that returns True if valid, False otherwise."""
+        # Verify unpacked directory exists and has correct structure
+        modified_file = self.unpacked_dir / "word" / "document.xml"
+        if not modified_file.exists():
+            print(f"FAILED - Modified document.xml not found at {modified_file}")
+            return False
+        # First, check if there are any tracked changes by the author to validate
+        try:
+            import xml.etree.ElementTree as ET
+            tree = ET.parse(modified_file)
+            root = tree.getroot()
+            # Check for w:del or w:ins tags by the specified author
+            del_elements = root.findall(".//w:del", self.namespaces)
+            ins_elements = root.findall(".//w:ins", self.namespaces)
+            # Filter to only include changes by the specified author
+            author_del_elements = [
+                elem
+                for elem in del_elements
+                if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
+            ]
+            author_ins_elements = [
+                elem
+                for elem in ins_elements
+                if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
+            ]
+            # Redlining validation is only needed if tracked changes by the author have been used.
+            if not author_del_elements and not author_ins_elements:
+                if self.verbose:
+                    print(f"PASSED - No tracked changes by {self.author} found.")
+                return True
+        except Exception:
+            # If we can't parse the XML, continue with full validation
+            pass
+        # Create temporary directory for unpacking original docx
+        with tempfile.TemporaryDirectory() as temp_dir:
+            temp_path = Path(temp_dir)
+            # Unpack original docx
+            try:
+                with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
+                    zip_ref.extractall(temp_path)
+            except Exception as e:
+                print(f"FAILED - Error unpacking original docx: {e}")
+                return False
+            original_file = temp_path / "word" / "document.xml"
+            if not original_file.exists():
+                print(
+                    f"FAILED - Original document.xml not found in {self.original_docx}"
+                )
+                return False
+            # Parse both XML files using xml.etree.ElementTree for redlining validation
+            try:
+                import xml.etree.ElementTree as ET
+                modified_tree = ET.parse(modified_file)
+                modified_root = modified_tree.getroot()
+                original_tree = ET.parse(original_file)
+                original_root = original_tree.getroot()
+            except ET.ParseError as e:
+                print(f"FAILED - Error parsing XML files: {e}")
+                return False
+            # Remove the author's tracked changes from both documents
+            self._remove_author_tracked_changes(original_root)
+            self._remove_author_tracked_changes(modified_root)
+            # Extract and compare text content
+            modified_text = self._extract_text_content(modified_root)
+            original_text = self._extract_text_content(original_root)
+            if modified_text != original_text:
+                # Show detailed character-level differences for each paragraph
+                error_message = self._generate_detailed_diff(
+                    original_text, modified_text
+                )
+                print(error_message)
+                return False
+            if self.verbose:
+                print(f"PASSED - All changes by {self.author} are properly tracked")
+            return True
+    def _generate_detailed_diff(self, original_text, modified_text):
+        """Generate detailed word-level differences using git word diff."""
+        error_parts = [
+            f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
+            "",
+            "Likely causes:",
+            "  1. Modified text inside another author's <w:ins> or <w:del> tags",
+            "  2. Made edits without proper tracked changes",
+            "  3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
+            "",
+            "For pre-redlined documents, use correct patterns:",
+            "  - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
+            "  - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
+            "",
+        ]
+        # Show git word diff
+        git_diff = self._get_git_word_diff(original_text, modified_text)
+        if git_diff:
+            error_parts.extend(["Differences:", "============", git_diff])
+        else:
+            error_parts.append("Unable to generate word diff (git not available)")
+        return "\n".join(error_parts)
+    def _get_git_word_diff(self, original_text, modified_text):
+        """Generate word diff using git with character-level precision."""
+        try:
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_path = Path(temp_dir)
+                # Create two files
+                original_file = temp_path / "original.txt"
+                modified_file = temp_path / "modified.txt"
+                original_file.write_text(original_text, encoding="utf-8")
+                modified_file.write_text(modified_text, encoding="utf-8")
+                # Try character-level diff first for precise differences
+                result = subprocess.run(
+                    [
+                        "git",
+                        "diff",
+                        "--word-diff=plain",
+                        "--word-diff-regex=.",  # Character-by-character diff
+                        "-U0",  # Zero lines of context - show only changed lines
+                        "--no-index",
+                        str(original_file),
+                        str(modified_file),
+                    ],
+                    capture_output=True,
+                    text=True,
+                )
+                if result.stdout.strip():
+                    # Clean up the output - remove git diff header lines
+                    lines = result.stdout.split("\n")
+                    # Skip the header lines (diff --git, index, +++, ---, @@)
+                    content_lines = []
+                    in_content = False
+                    for line in lines:
+                        if line.startswith("@@"):
+                            in_content = True
+                            continue
+                        if in_content and line.strip():
+                            content_lines.append(line)
+                    if content_lines:
+                        return "\n".join(content_lines)
+                # Fallback to word-level diff if character-level is too verbose
+                result = subprocess.run(
+                    [
+                        "git",
+                        "diff",
+                        "--word-diff=plain",
+                        "-U0",  # Zero lines of context
+                        "--no-index",
+                        str(original_file),
+                        str(modified_file),
+                    ],
+                    capture_output=True,
+                    text=True,
+                )
+                if result.stdout.strip():
+                    lines = result.stdout.split("\n")
+                    content_lines = []
+                    in_content = False
+                    for line in lines:
+                        if line.startswith("@@"):
+                            in_content = True
+                            continue
+                        if in_content and line.strip():
+                            content_lines.append(line)
+                    return "\n".join(content_lines)
+        except (subprocess.CalledProcessError, FileNotFoundError, Exception):
+            # Git not available or other error, return None to use fallback
+            pass
+        return None
+    def _remove_author_tracked_changes(self, root):
+        """Remove tracked changes authored by the specified author from the XML root."""
+        ins_tag = f"{{{self.namespaces['w']}}}ins"
+        del_tag = f"{{{self.namespaces['w']}}}del"
+        author_attr = f"{{{self.namespaces['w']}}}author"
+        # Remove w:ins elements
+        for parent in root.iter():
+            to_remove = []
+            for child in parent:
+                if child.tag == ins_tag and child.get(author_attr) == self.author:
+                    to_remove.append(child)
+            for elem in to_remove:
+                parent.remove(elem)
+        # Unwrap content in w:del elements where author matches
+        deltext_tag = f"{{{self.namespaces['w']}}}delText"
+        t_tag = f"{{{self.namespaces['w']}}}t"
+        for parent in root.iter():
+            to_process = []
+            for child in parent:
+                if child.tag == del_tag and child.get(author_attr) == self.author:
+                    to_process.append((child, list(parent).index(child)))
+            # Process in reverse order to maintain indices
+            for del_elem, del_index in reversed(to_process):
+                # Convert w:delText to w:t before moving
+                for elem in del_elem.iter():
+                    if elem.tag == deltext_tag:
+                        elem.tag = t_tag
+                # Move all children of w:del to its parent before removing w:del
+                for child in reversed(list(del_elem)):
+                    parent.insert(del_index, child)
+                parent.remove(del_elem)
+    def _extract_text_content(self, root):
+        """Extract text content from Word XML, preserving paragraph structure.
+        Empty paragraphs are skipped to avoid false positives when tracked
+        insertions add only structural elements without text content.
+        """
+        p_tag = f"{{{self.namespaces['w']}}}p"
+        t_tag = f"{{{self.namespaces['w']}}}t"
+        paragraphs = []
+        for p_elem in root.findall(f".//{p_tag}"):
+            # Get all text elements within this paragraph
+            text_parts = []
+            for t_elem in p_elem.findall(f".//{t_tag}"):
+                if t_elem.text:
+                    text_parts.append(t_elem.text)
+            paragraph_text = "".join(text_parts)
+            # Skip empty paragraphs - they don't affect content validation
+            if paragraph_text:
+                paragraphs.append(paragraph_text)
+        return "\n".join(paragraphs)
+# This module only defines RedliningValidator and is meant to be imported;
+# executing it as a script is rejected explicitly.
+if __name__ == "__main__":
+    raise RuntimeError("This module should not be run directly.")