npm - @heylemon/lemonade - Versions diffs - 0.0.4 → 0.0.6 - Mend

@heylemon/lemonade 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (106) hide show

package/skills/docx/scripts/office/validators/redlining.py CHANGED Viewed

@@ -9,7 +9,6 @@ from pathlib import Path
 class RedliningValidator:
-    """Validator for tracked changes in Word documents."""
     def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
         self.unpacked_dir = Path(unpacked_dir)
@@ -21,29 +20,23 @@ class RedliningValidator:
         }
     def repair(self) -> int:
-        """No auto-repairs for redlining validation. Returns 0."""
         return 0
     def validate(self):
-        """Main validation method that returns True if valid, False otherwise."""
-        # Verify unpacked directory exists and has correct structure
         modified_file = self.unpacked_dir / "word" / "document.xml"
         if not modified_file.exists():
             print(f"FAILED - Modified document.xml not found at {modified_file}")
             return False
-        # First, check if there are any tracked changes by the author to validate
         try:
             import xml.etree.ElementTree as ET
             tree = ET.parse(modified_file)
             root = tree.getroot()
-            # Check for w:del or w:ins tags by the specified author
             del_elements = root.findall(".//w:del", self.namespaces)
             ins_elements = root.findall(".//w:ins", self.namespaces)
-            # Filter to only include changes by the specified author
             author_del_elements = [
                 elem
                 for elem in del_elements
@@ -55,21 +48,17 @@ class RedliningValidator:
                 if elem.get(f"{{{self.namespaces['w']}}}author") == self.author
             ]
-            # Redlining validation is only needed if tracked changes by the author have been used.
             if not author_del_elements and not author_ins_elements:
                 if self.verbose:
                     print(f"PASSED - No tracked changes by {self.author} found.")
                 return True
         except Exception:
-            # If we can't parse the XML, continue with full validation
             pass
-        # Create temporary directory for unpacking original docx
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_path = Path(temp_dir)
-            # Unpack original docx
             try:
                 with zipfile.ZipFile(self.original_docx, "r") as zip_ref:
                     zip_ref.extractall(temp_path)
@@ -84,7 +73,6 @@ class RedliningValidator:
                 )
                 return False
-            # Parse both XML files using xml.etree.ElementTree for redlining validation
             try:
                 import xml.etree.ElementTree as ET
@@ -96,16 +84,13 @@ class RedliningValidator:
                 print(f"FAILED - Error parsing XML files: {e}")
                 return False
-            # Remove the author's tracked changes from both documents
             self._remove_author_tracked_changes(original_root)
             self._remove_author_tracked_changes(modified_root)
-            # Extract and compare text content
             modified_text = self._extract_text_content(modified_root)
             original_text = self._extract_text_content(original_root)
             if modified_text != original_text:
-                # Show detailed character-level differences for each paragraph
                 error_message = self._generate_detailed_diff(
                     original_text, modified_text
                 )
@@ -117,7 +102,6 @@ class RedliningValidator:
             return True
     def _generate_detailed_diff(self, original_text, modified_text):
-        """Generate detailed word-level differences using git word diff."""
         error_parts = [
             f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
             "",
@@ -132,7 +116,6 @@ class RedliningValidator:
             "",
         ]
-        # Show git word diff
         git_diff = self._get_git_word_diff(original_text, modified_text)
         if git_diff:
             error_parts.extend(["Differences:", "============", git_diff])
@@ -142,26 +125,23 @@ class RedliningValidator:
         return "\n".join(error_parts)
     def _get_git_word_diff(self, original_text, modified_text):
-        """Generate word diff using git with character-level precision."""
         try:
             with tempfile.TemporaryDirectory() as temp_dir:
                 temp_path = Path(temp_dir)
-                # Create two files
                 original_file = temp_path / "original.txt"
                 modified_file = temp_path / "modified.txt"
                 original_file.write_text(original_text, encoding="utf-8")
                 modified_file.write_text(modified_text, encoding="utf-8")
-                # Try character-level diff first for precise differences
                 result = subprocess.run(
                     [
                         "git",
                         "diff",
                         "--word-diff=plain",
-                        "--word-diff-regex=.",  # Character-by-character diff
-                        "-U0",  # Zero lines of context - show only changed lines
+                        "--word-diff-regex=.",
+                        "-U0",
                         "--no-index",
                         str(original_file),
                         str(modified_file),
@@ -171,9 +151,7 @@ class RedliningValidator:
                 )
                 if result.stdout.strip():
-                    # Clean up the output - remove git diff header lines
                     lines = result.stdout.split("\n")
-                    # Skip the header lines (diff --git, index, +++, ---, @@)
                     content_lines = []
                     in_content = False
                     for line in lines:
@@ -186,13 +164,12 @@ class RedliningValidator:
                     if content_lines:
                         return "\n".join(content_lines)
-                # Fallback to word-level diff if character-level is too verbose
                 result = subprocess.run(
                     [
                         "git",
                         "diff",
                         "--word-diff=plain",
-                        "-U0",  # Zero lines of context
+                        "-U0",
                         "--no-index",
                         str(original_file),
                         str(modified_file),
@@ -214,18 +191,15 @@ class RedliningValidator:
                     return "\n".join(content_lines)
         except (subprocess.CalledProcessError, FileNotFoundError, Exception):
-            # Git not available or other error, return None to use fallback
             pass
         return None
     def _remove_author_tracked_changes(self, root):
-        """Remove tracked changes authored by the specified author from the XML root."""
         ins_tag = f"{{{self.namespaces['w']}}}ins"
         del_tag = f"{{{self.namespaces['w']}}}del"
         author_attr = f"{{{self.namespaces['w']}}}author"
-        # Remove w:ins elements
         for parent in root.iter():
             to_remove = []
             for child in parent:
@@ -234,7 +208,6 @@ class RedliningValidator:
             for elem in to_remove:
                 parent.remove(elem)
-        # Unwrap content in w:del elements where author matches
         deltext_tag = f"{{{self.namespaces['w']}}}delText"
         t_tag = f"{{{self.namespaces['w']}}}t"
@@ -244,36 +217,26 @@ class RedliningValidator:
                 if child.tag == del_tag and child.get(author_attr) == self.author:
                     to_process.append((child, list(parent).index(child)))
-            # Process in reverse order to maintain indices
             for del_elem, del_index in reversed(to_process):
-                # Convert w:delText to w:t before moving
                 for elem in del_elem.iter():
                     if elem.tag == deltext_tag:
                         elem.tag = t_tag
-                # Move all children of w:del to its parent before removing w:del
                 for child in reversed(list(del_elem)):
                     parent.insert(del_index, child)
                 parent.remove(del_elem)
     def _extract_text_content(self, root):
-        """Extract text content from Word XML, preserving paragraph structure.
-        Empty paragraphs are skipped to avoid false positives when tracked
-        insertions add only structural elements without text content.
-        """
         p_tag = f"{{{self.namespaces['w']}}}p"
         t_tag = f"{{{self.namespaces['w']}}}t"
         paragraphs = []
         for p_elem in root.findall(f".//{p_tag}"):
-            # Get all text elements within this paragraph
             text_parts = []
             for t_elem in p_elem.findall(f".//{t_tag}"):
                 if t_elem.text:
                     text_parts.append(t_elem.text)
             paragraph_text = "".join(text_parts)
-            # Skip empty paragraphs - they don't affect content validation
             if paragraph_text:
                 paragraphs.append(paragraph_text)

package/skills/pdf/SKILL.md CHANGED Viewed

@@ -1,24 +1,11 @@
 ---
 name: pdf
-description: Comprehensive PDF manipulation toolkit for extracting text and tables, creating new PDFs, merging/splitting documents, and handling forms. When Claude needs to fill in a PDF form or programmatically process, generate, or analyze PDF documents at scale.
+description: Use this skill whenever the user wants to do anything with PDF files. This includes reading or extracting text/tables from PDFs, combining or merging multiple PDFs into one, splitting PDFs apart, rotating pages, adding watermarks, creating new PDFs, filling PDF forms, encrypting/decrypting PDFs, extracting images, and OCR on scanned PDFs to make them searchable. If the user mentions a .pdf file or asks to produce one, use this skill.
 license: Proprietary. LICENSE.txt has complete terms
 ---
 # PDF Processing Guide
-## Document Integrity Mode (CRITICAL)
-When the user asks to **fill an existing PDF** (especially official/government forms), preserve layout exactly:
-- Never recreate the document from scratch.
-- Never reflow, rewrite, or "clean up" page content.
-- Never convert PDF -> DOCX/Markdown -> PDF for form filling tasks.
-- Never replace or redesign page templates.
-- Always keep the original file unchanged and write to a new output file.
-- Use the workflow in `FORMS.md` exactly (fillable fields first, then fallback path).
-If the user asks for exact formatting, treat that as strict mode and prioritize minimal-delta edits only.
 ## Overview
 This guide covers essential PDF processing operations using Python libraries and command-line tools. For advanced features, JavaScript libraries, and detailed examples, see REFERENCE.md. If you need to fill out a PDF form, read forms.md and follow its instructions.
@@ -128,7 +115,7 @@ with pdfplumber.open("document.pdf") as pdf:
 # Combine all tables
 if all_tables:
     combined_df = pd.concat(all_tables, ignore_index=True)
-    combined_df.to_excel(os.path.expanduser("~/Desktop/extracted_tables.xlsx"), index=False)
+    combined_df.to_excel("extracted_tables.xlsx", index=False)
 ```
 ### reportlab - Create PDFs
@@ -179,6 +166,26 @@ story.append(Paragraph("Content for page 2", styles['Normal']))
 doc.build(story)
 ```
+#### Subscripts and Superscripts
+**IMPORTANT**: Never use Unicode subscript/superscript characters (₀₁₂₃₄₅₆₇₈₉, ⁰¹²³⁴⁵⁶⁷⁸⁹) in ReportLab PDFs. The built-in fonts do not include these glyphs, causing them to render as solid black boxes.
+Instead, use ReportLab's XML markup tags in Paragraph objects:
+```python
+from reportlab.platypus import Paragraph
+from reportlab.lib.styles import getSampleStyleSheet
+styles = getSampleStyleSheet()
+# Subscripts: use <sub> tag
+chemical = Paragraph("H<sub>2</sub>O", styles['Normal'])
+# Superscripts: use <super> tag
+squared = Paragraph("x<super>2</super> + y<super>2</super>", styles['Normal'])
+```
+For canvas-drawn text (not Paragraph objects), manually adjust the font size and position rather than using Unicode subscripts/superscripts.
 ## Command-Line Tools
 ### pdftotext (poppler-utils)

package/skills/pdf/{FORMS.md → forms.md} RENAMED Viewed

@@ -1,17 +1,5 @@
 **CRITICAL: You MUST complete these steps in order. Do not skip ahead to writing code.**
-## Safety Rules (Exact-Formatting Forms)
-For official/sensitive forms, follow these rules strictly:
-- Keep the original PDF untouched. Always write to a new file (for example, `original.filled.pdf`).
-- Do not overwrite the input file.
-- Do not regenerate pages or rebuild the PDF from extracted text.
-- Do not use "create PDF" workflows for form-filling requests.
-- Prefer true form-field filling whenever available; this preserves layout best.
-- If the file has no fillable fields, explain that non-fillable fallback uses overlays/annotations and may not be pixel-perfect in every viewer.
-- For non-fillable fallback, ask for a brief confirmation before writing output when exact legal formatting is required.
 If you need to fill out a PDF form, first check to see if the PDF has fillable form fields. Run this script from this file's directory:
  `python scripts/check_fillable_fields.py <file.pdf>`, and depending on the result go to either the "Fillable fields" or "Non-fillable fields" section and follow those instructions.
@@ -86,7 +74,6 @@ Then analyze the images to determine the purpose of each form field (make sure t
 - Run the `fill_fillable_fields.py` script from this file's directory to create a filled-in PDF:
 `python scripts/fill_fillable_fields.py <input pdf> <field_values.json> <output pdf>`
 This script will verify that the field IDs and values you provide are valid; if it prints error messages, correct the appropriate fields and try again.
-- Use a new output filename and keep the input unchanged.
 # Non-fillable fields
 If the PDF doesn't have fillable form fields, you'll add text annotations. First try to extract coordinates from the PDF structure (more accurate), then fall back to visual estimation if needed.
@@ -295,7 +282,6 @@ Fix any reported errors in fields.json before proceeding.
 The fill script auto-detects the coordinate system and handles conversion:
 `python scripts/fill_pdf_form_with_annotations.py <input.pdf> fields.json <output.pdf>`
-- Use a new output filename and keep the input unchanged.
 ## Step 4: Verify Output

package/skills/pdf/scripts/check_bounding_boxes.py CHANGED Viewed

@@ -3,8 +3,6 @@ import json
 import sys
-# Script to check that the `fields.json` file that Claude creates when analyzing PDFs
-# does not have overlapping bounding boxes. See FORMS.md.
 @dataclass
@@ -14,7 +12,6 @@ class RectAndField:
     field: dict
-# Returns a list of messages that are printed to stdout for Claude to read.
 def get_bounding_box_messages(fields_json_stream) -> list[str]:
     messages = []
     fields = json.load(fields_json_stream)
@@ -32,7 +29,6 @@ def get_bounding_box_messages(fields_json_stream) -> list[str]:
     has_error = False
     for i, ri in enumerate(rects_and_fields):
-        # This is O(N^2); we can optimize if it becomes a problem.
         for j in range(i + 1, len(rects_and_fields)):
             rj = rects_and_fields[j]
             if ri.field["page_number"] == rj.field["page_number"] and rects_intersect(ri.rect, rj.rect):
@@ -63,7 +59,6 @@ if __name__ == "__main__":
     if len(sys.argv) != 2:
         print("Usage: check_bounding_boxes.py [fields.json]")
         sys.exit(1)
-    # Input file should be in the `fields.json` format described in FORMS.md.
     with open(sys.argv[1]) as f:
         messages = get_bounding_box_messages(f)
     for msg in messages:

package/skills/pdf/scripts/check_fillable_fields.py CHANGED Viewed

@@ -2,7 +2,6 @@ import sys
 from pypdf import PdfReader
-# Script for Claude to run to determine whether a PDF has fillable form fields. See FORMS.md.
 reader = PdfReader(sys.argv[1])

package/skills/pdf/scripts/convert_pdf_to_images.py CHANGED Viewed

@@ -4,14 +4,12 @@ import sys
 from pdf2image import convert_from_path
-# Converts each page of a PDF to a PNG image.
 def convert(pdf_path, output_dir, max_dim=1000):
     images = convert_from_path(pdf_path, dpi=200)
     for i, image in enumerate(images):
-        # Scale image if needed to keep width/height under `max_dim`
         width, height = image.size
         if width > max_dim or height > max_dim:
             scale_factor = min(max_dim / width, max_dim / height)

package/skills/pdf/scripts/create_validation_image.py CHANGED Viewed

@@ -4,12 +4,9 @@ import sys
 from PIL import Image, ImageDraw
-# Creates "validation" images with rectangles for the bounding box information that
-# Claude creates when determining where to add text annotations in PDFs. See FORMS.md.
 def create_validation_image(page_number, fields_json_path, input_path, output_path):
-    # Input file should be in the `fields.json` format described in FORMS.md.
     with open(fields_json_path, 'r') as f:
         data = json.load(f)
@@ -21,7 +18,6 @@ def create_validation_image(page_number, fields_json_path, input_path, output_pa
             if field["page_number"] == page_number:
                 entry_box = field['entry_bounding_box']
                 label_box = field['label_bounding_box']
-                # Draw red rectangle over entry bounding box and blue rectangle over the label.
                 draw.rectangle(entry_box, outline='red', width=2)
                 draw.rectangle(label_box, outline='blue', width=2)
                 num_boxes += 2

package/skills/pdf/scripts/extract_form_field_info.py CHANGED Viewed

@@ -4,11 +4,8 @@ import sys
 from pypdf import PdfReader
-# Extracts data for the fillable form fields in a PDF and outputs JSON that
-# Claude uses to fill the fields. See FORMS.md.
-# This matches the format used by PdfReader `get_fields` and `update_page_form_field_values` methods.
 def get_full_annotation_field_id(annotation):
     components = []
     while annotation:
@@ -25,12 +22,9 @@ def make_field_dict(field, field_id):
     if ft == "/Tx":
         field_dict["type"] = "text"
     elif ft == "/Btn":
-        field_dict["type"] = "checkbox"  # radio groups handled separately
+        field_dict["type"] = "checkbox"
         states = field.get("/_States_", [])
         if len(states) == 2:
-            # "/Off" seems to always be the unchecked value, as suggested by
-            # https://opensource.adobe.com/dc-acrobat-sdk-docs/standards/pdfstandards/pdf/PDF32000_2008.pdf#page=448
-            # It can be either first or second in the "/_States_" list.
             if "/Off" in states:
                 field_dict["checked_value"] = states[0] if states[0] != "/Off" else states[1]
                 field_dict["unchecked_value"] = "/Off"
@@ -50,15 +44,6 @@ def make_field_dict(field, field_id):
     return field_dict
-# Returns a list of fillable PDF fields:
-# [
-#   {
-#     "field_id": "name",
-#     "page": 1,
-#     "type": ("text", "checkbox", "radio_group", or "choice")
-#     // Per-type additional fields described in FORMS.md
-#   },
-# ]
 def get_field_info(reader: PdfReader):
     fields = reader.get_fields()
@@ -66,19 +51,13 @@ def get_field_info(reader: PdfReader):
     possible_radio_names = set()
     for field_id, field in fields.items():
-        # Skip if this is a container field with children, except that it might be
-        # a parent group for radio button options.
         if field.get("/Kids"):
             if field.get("/FT") == "/Btn":
                 possible_radio_names.add(field_id)
             continue
         field_info_by_id[field_id] = make_field_dict(field, field_id)
-    # Bounding rects are stored in annotations in page objects.
-    # Radio button options have a separate annotation for each choice;
-    # all choices have the same field name.
-    # See https://westhealth.github.io/exploring-fillable-forms-with-pdfrw.html
     radio_fields_by_id = {}
     for page_index, page in enumerate(reader.pages):
@@ -90,8 +69,6 @@ def get_field_info(reader: PdfReader):
                 field_info_by_id[field_id]["rect"] = ann.get('/Rect')
             elif field_id in possible_radio_names:
                 try:
-                    # ann['/AP']['/N'] should have two items. One of them is '/Off',
-                    # the other is the active value.
                     on_values = [v for v in ann["/AP"]["/N"] if v != "/Off"]
                 except KeyError:
                     continue
@@ -104,17 +81,11 @@ def get_field_info(reader: PdfReader):
                             "page": page_index + 1,
                             "radio_options": [],
                         }
-                    # Note: at least on macOS 15.7, Preview.app doesn't show selected
-                    # radio buttons correctly. (It does if you remove the leading slash
-                    # from the value, but that causes them not to appear correctly in
-                    # Chrome/Firefox/Acrobat/etc).
                     radio_fields_by_id[field_id]["radio_options"].append({
                         "value": on_values[0],
                         "rect": rect,
                     })
-    # Some PDFs have form field definitions without corresponding annotations,
-    # so we can't tell where they are. Ignore these fields for now.
     fields_with_location = []
     for field_info in field_info_by_id.values():
         if "page" in field_info:
@@ -122,7 +93,6 @@ def get_field_info(reader: PdfReader):
         else:
             print(f"Unable to determine location for field id: {field_info.get('field_id')}, ignoring")
-    # Sort by page number, then Y position (flipped in PDF coordinate system), then X.
     def sort_key(f):
         if "radio_options" in f:
             rect = f["radio_options"][0]["rect"] or [0, 0, 0, 0]

package/skills/pdf/scripts/extract_form_structure.py CHANGED Viewed

@@ -1,4 +1,3 @@
-#!/usr/bin/env python3
 """
 Extract form structure from a non-fillable PDF.
@@ -19,7 +18,6 @@ import pdfplumber
 def extract_form_structure(pdf_path):
-    """Extract structural elements from a PDF form."""
     structure = {
         "pages": [],
         "labels": [],
@@ -30,14 +28,12 @@ def extract_form_structure(pdf_path):
     with pdfplumber.open(pdf_path) as pdf:
         for page_num, page in enumerate(pdf.pages, 1):
-            # Page info
             structure["pages"].append({
                 "page_number": page_num,
                 "width": float(page.width),
                 "height": float(page.height)
             })
-            # Extract text labels with positions
             words = page.extract_words()
             for word in words:
                 structure["labels"].append({
@@ -49,9 +45,7 @@ def extract_form_structure(pdf_path):
                     "bottom": round(float(word["bottom"]), 1)
                 })
-            # Extract horizontal lines (row separators)
             for line in page.lines:
-                # Horizontal lines span most of page width
                 if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
                     structure["lines"].append({
                         "page": page_num,
@@ -60,11 +54,9 @@ def extract_form_structure(pdf_path):
                         "x1": round(float(line["x1"]), 1)
                     })
-            # Extract checkboxes (small square rectangles)
             for rect in page.rects:
                 width = float(rect["x1"]) - float(rect["x0"])
                 height = float(rect["bottom"]) - float(rect["top"])
-                # Checkboxes are typically 5-15 points square
                 if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
                     structure["checkboxes"].append({
                         "page": page_num,
@@ -76,7 +68,6 @@ def extract_form_structure(pdf_path):
                         "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
                     })
-    # Calculate row boundaries from horizontal lines
     lines_by_page = {}
     for line in structure["lines"]:
         page = line["page"]

package/skills/pdf/scripts/fill_fillable_fields.py CHANGED Viewed

@@ -1,25 +1,16 @@
 import json
 import sys
-import os
 from pypdf import PdfReader, PdfWriter
 from extract_form_field_info import get_field_info
-# Fills fillable form fields in a PDF. See FORMS.md.
 def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
-    input_abs = os.path.abspath(input_pdf_path)
-    output_abs = os.path.abspath(output_pdf_path)
-    if input_abs == output_abs:
-        print("ERROR: Refusing to overwrite input PDF. Use a different output path.")
-        sys.exit(1)
     with open(fields_json_path) as f:
         fields = json.load(f)
-    # Group by page number.
     fields_by_page = {}
     for field in fields:
         if "value" in field:
@@ -55,8 +46,6 @@ def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path:
     for page, field_values in fields_by_page.items():
         writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
-    # This seems to be necessary for many PDF viewers to format the form values correctly.
-    # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
     writer.set_need_appearances_writer(True)
     with open(output_pdf_path, "wb") as f:
@@ -82,18 +71,6 @@ def validation_error_for_field_value(field_info, field_value):
     return None
-# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
-# In _writer.py around line 966:
-#
-# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
-#     txt = "\n".join(annotation.get_inherited(FA.Opt, []))
-#
-# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
-# [["value1", "Text 1"], ["value2", "Text 2"], ...]
-# This causes `join` to throw a TypeError because it expects an iterable of strings.
-# The horrible workaround is to patch `get_inherited` to return a list of the value strings.
-# We call the original method and adjust the return value only if the argument to `get_inherited`
-# is `FA.Opt` and if the return value is a list of two-element lists.
 def monkeypatch_pydpf_method():
     from pypdf.generic import DictionaryObject
     from pypdf.constants import FieldDictionaryAttributes