npm - @farazirfan/costar-server-executor - Versions diffs - 1.7.37 → 1.7.39 - Mend

@farazirfan/costar-server-executor 1.7.37 → 1.7.39

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (253) hide show

package/skills/pdf/scripts/extract_form_structure.py ADDED Viewed

@@ -0,0 +1,124 @@
+#!/usr/bin/env python3
+"""
+Extract form structure from a non-fillable PDF.
+This script analyzes the PDF to find:
+- Text labels with their exact coordinates
+- Horizontal lines (row boundaries)
+- Checkboxes (small rectangles)
+Output: A JSON file with the form structure that can be used to generate
+accurate field coordinates for filling.
+Usage: python extract_form_structure.py <input.pdf> <output.json>
+"""
+import json
+import sys
+import pdfplumber
+def extract_form_structure(pdf_path):
+    """Extract structural elements from a PDF form."""
+    structure = {
+        "pages": [],
+        "labels": [],
+        "lines": [],
+        "checkboxes": [],
+        "row_boundaries": []
+    }
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, 1):
+            # Page info
+            structure["pages"].append({
+                "page_number": page_num,
+                "width": float(page.width),
+                "height": float(page.height)
+            })
+            # Extract text labels with positions
+            words = page.extract_words()
+            for word in words:
+                structure["labels"].append({
+                    "page": page_num,
+                    "text": word["text"],
+                    "x0": round(float(word["x0"]), 1),
+                    "top": round(float(word["top"]), 1),
+                    "x1": round(float(word["x1"]), 1),
+                    "bottom": round(float(word["bottom"]), 1)
+                })
+            # Extract horizontal lines (row separators)
+            for line in page.lines:
+                # Horizontal lines span most of page width
+                if abs(float(line["x1"]) - float(line["x0"])) > page.width * 0.5:
+                    structure["lines"].append({
+                        "page": page_num,
+                        "y": round(float(line["top"]), 1),
+                        "x0": round(float(line["x0"]), 1),
+                        "x1": round(float(line["x1"]), 1)
+                    })
+            # Extract checkboxes (small square rectangles)
+            for rect in page.rects:
+                width = float(rect["x1"]) - float(rect["x0"])
+                height = float(rect["bottom"]) - float(rect["top"])
+                # Checkboxes are typically 5-15 points square
+                if 5 <= width <= 15 and 5 <= height <= 15 and abs(width - height) < 2:
+                    structure["checkboxes"].append({
+                        "page": page_num,
+                        "x0": round(float(rect["x0"]), 1),
+                        "top": round(float(rect["top"]), 1),
+                        "x1": round(float(rect["x1"]), 1),
+                        "bottom": round(float(rect["bottom"]), 1),
+                        "center_x": round((float(rect["x0"]) + float(rect["x1"])) / 2, 1),
+                        "center_y": round((float(rect["top"]) + float(rect["bottom"])) / 2, 1)
+                    })
+    # Calculate row boundaries from horizontal lines
+    lines_by_page = {}
+    for line in structure["lines"]:
+        page = line["page"]
+        if page not in lines_by_page:
+            lines_by_page[page] = []
+        lines_by_page[page].append(line["y"])
+    for page, y_coords in lines_by_page.items():
+        y_coords = sorted(set(y_coords))
+        for i in range(len(y_coords) - 1):
+            structure["row_boundaries"].append({
+                "page": page,
+                "row_top": y_coords[i],
+                "row_bottom": y_coords[i + 1],
+                "row_height": round(y_coords[i + 1] - y_coords[i], 1)
+            })
+    return structure
+def main():
+    if len(sys.argv) != 3:
+        print("Usage: extract_form_structure.py <input.pdf> <output.json>")
+        sys.exit(1)
+    pdf_path = sys.argv[1]
+    output_path = sys.argv[2]
+    print(f"Extracting structure from {pdf_path}...")
+    structure = extract_form_structure(pdf_path)
+    with open(output_path, "w") as f:
+        json.dump(structure, f, indent=2)
+    print(f"Found:")
+    print(f"  - {len(structure['pages'])} pages")
+    print(f"  - {len(structure['labels'])} text labels")
+    print(f"  - {len(structure['lines'])} horizontal lines")
+    print(f"  - {len(structure['checkboxes'])} checkboxes")
+    print(f"  - {len(structure['row_boundaries'])} row boundaries")
+    print(f"Saved to {output_path}")
+if __name__ == "__main__":
+    main()

package/skills/pdf/scripts/fill_fillable_fields.py ADDED Viewed

@@ -0,0 +1,116 @@
+import json
+import sys
+from pypdf import PdfReader, PdfWriter
+from extract_form_field_info import get_field_info
+# Fills fillable form fields in a PDF. See FORMS.md.
+def fill_pdf_fields(input_pdf_path: str, fields_json_path: str, output_pdf_path: str):
+    with open(fields_json_path) as f:
+        fields = json.load(f)
+    # Group by page number.
+    fields_by_page = {}
+    for field in fields:
+        if "value" in field:
+            field_id = field["field_id"]
+            page = field["page"]
+            if page not in fields_by_page:
+                fields_by_page[page] = {}
+            fields_by_page[page][field_id] = field["value"]
+    reader = PdfReader(input_pdf_path)
+    has_error = False
+    field_info = get_field_info(reader)
+    fields_by_ids = {f["field_id"]: f for f in field_info}
+    for field in fields:
+        existing_field = fields_by_ids.get(field["field_id"])
+        if not existing_field:
+            has_error = True
+            print(f"ERROR: `{field['field_id']}` is not a valid field ID")
+        elif field["page"] != existing_field["page"]:
+            has_error = True
+            print(f"ERROR: Incorrect page number for `{field['field_id']}` (got {field['page']}, expected {existing_field['page']})")
+        else:
+            if "value" in field:
+                err = validation_error_for_field_value(existing_field, field["value"])
+                if err:
+                    print(err)
+                    has_error = True
+    if has_error:
+        sys.exit(1)
+    writer = PdfWriter(clone_from=reader)
+    for page, field_values in fields_by_page.items():
+        writer.update_page_form_field_values(writer.pages[page - 1], field_values, auto_regenerate=False)
+    # This seems to be necessary for many PDF viewers to format the form values correctly.
+    # It may cause the viewer to show a "save changes" dialog even if the user doesn't make any changes.
+    writer.set_need_appearances_writer(True)
+    with open(output_pdf_path, "wb") as f:
+        writer.write(f)
+    print(f"PDF form filled and saved to {output_pdf_path}")
+def validation_error_for_field_value(field_info, field_value):
+    field_type = field_info["type"]
+    field_id = field_info["field_id"]
+    if field_type == "checkbox":
+        checked_val = field_info["checked_value"]
+        unchecked_val = field_info["unchecked_value"]
+        if field_value != checked_val and field_value != unchecked_val:
+            return f'ERROR: Invalid value "{field_value}" for checkbox field "{field_id}". The checked value is "{checked_val}" and the unchecked value is "{unchecked_val}"'
+    elif field_type == "radio_group":
+        option_values = [opt["value"] for opt in field_info["radio_options"]]
+        if field_value not in option_values:
+            return f'ERROR: Invalid value "{field_value}" for radio group field "{field_id}". Valid values are: {option_values}'
+    elif field_type == "choice":
+        choice_values = [opt["value"] for opt in field_info["choice_options"]]
+        if field_value not in choice_values:
+            return f'ERROR: Invalid value "{field_value}" for choice field "{field_id}". Valid values are: {choice_values}'
+    return None
+# pypdf (at least version 5.7.0) has a bug when setting the value for a selection list field.
+# In _writer.py around line 966:
+#
+# if field.get(FA.FT, "/Tx") == "/Ch" and field_flags & FA.FfBits.Combo == 0:
+#     txt = "\n".join(annotation.get_inherited(FA.Opt, []))
+#
+# The problem is that for selection lists, `get_inherited` returns a list of two-element lists like
+# [["value1", "Text 1"], ["value2", "Text 2"], ...]
+# This causes `join` to throw a TypeError because it expects an iterable of strings.
+# The horrible workaround is to patch `get_inherited` to return a list of the value strings.
+# We call the original method and adjust the return value only if the argument to `get_inherited`
+# is `FA.Opt` and if the return value is a list of two-element lists.
+def monkeypatch_pydpf_method():
+    from pypdf.generic import DictionaryObject
+    from pypdf.constants import FieldDictionaryAttributes
+    original_get_inherited = DictionaryObject.get_inherited
+    def patched_get_inherited(self, key: str, default = None):
+        result = original_get_inherited(self, key, default)
+        if key == FieldDictionaryAttributes.Opt:
+            if isinstance(result, list) and all(isinstance(v, list) and len(v) == 2 for v in result):
+                result = [r[0] for r in result]
+        return result
+    DictionaryObject.get_inherited = patched_get_inherited
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: fill_fillable_fields.py [input pdf] [field_values.json] [output pdf]")
+        sys.exit(1)
+    monkeypatch_pydpf_method()
+    input_pdf = sys.argv[1]
+    fields_json = sys.argv[2]
+    output_pdf = sys.argv[3]
+    fill_pdf_fields(input_pdf, fields_json, output_pdf)

package/skills/pdf/scripts/fill_pdf_form_with_annotations.py ADDED Viewed

@@ -0,0 +1,136 @@
+import json
+import sys
+from pypdf import PdfReader, PdfWriter
+from pypdf.annotations import FreeText
+# Fills a PDF by adding text annotations defined in `fields.json`. See FORMS.md.
+def transform_from_image_coords(bbox, image_width, image_height, pdf_width, pdf_height):
+    """Transform bounding box from image coordinates to PDF coordinates"""
+    # Image coordinates: origin at top-left, y increases downward
+    # PDF coordinates: origin at bottom-left, y increases upward
+    x_scale = pdf_width / image_width
+    y_scale = pdf_height / image_height
+    left = bbox[0] * x_scale
+    right = bbox[2] * x_scale
+    # Flip Y coordinates for PDF
+    top = pdf_height - (bbox[1] * y_scale)
+    bottom = pdf_height - (bbox[3] * y_scale)
+    return left, bottom, right, top
+def transform_from_pdf_coords(bbox, pdf_height):
+    """Transform bounding box from pdfplumber coordinates to pypdf coordinates.
+    pdfplumber uses y=0 at top, y increases downward (like images).
+    pypdf FreeText expects y=0 at bottom, y increases upward.
+    Both use the same scale (PDF points), so only Y needs flipping.
+    """
+    left = bbox[0]
+    right = bbox[2]
+    # bbox is [left, top, right, bottom] where top < bottom (y=0 at top)
+    # pypdf wants [left, bottom, right, top] where bottom < top (y=0 at bottom)
+    pypdf_top = pdf_height - bbox[1]      # flip the "top" value
+    pypdf_bottom = pdf_height - bbox[3]   # flip the "bottom" value
+    return left, pypdf_bottom, right, pypdf_top
+def fill_pdf_form(input_pdf_path, fields_json_path, output_pdf_path):
+    """Fill the PDF form with data from fields.json"""
+    # `fields.json` format described in FORMS.md.
+    with open(fields_json_path, "r") as f:
+        fields_data = json.load(f)
+    # Open the PDF
+    reader = PdfReader(input_pdf_path)
+    writer = PdfWriter()
+    # Copy all pages to writer
+    writer.append(reader)
+    # Get PDF dimensions for each page
+    pdf_dimensions = {}
+    for i, page in enumerate(reader.pages):
+        mediabox = page.mediabox
+        pdf_dimensions[i + 1] = [mediabox.width, mediabox.height]
+    # Process each form field
+    annotations = []
+    for field in fields_data["form_fields"]:
+        page_num = field["page_number"]
+        # Get page dimensions and transform coordinates.
+        page_info = next(p for p in fields_data["pages"] if p["page_number"] == page_num)
+        pdf_width, pdf_height = pdf_dimensions[page_num]
+        # Detect coordinate system: pdf_width/pdf_height = PDF coords, image_width/image_height = image coords
+        if "pdf_width" in page_info:
+            # PDF coordinates from structure extraction (pdfplumber style)
+            # Only need Y-flip, no scaling
+            transformed_entry_box = transform_from_pdf_coords(
+                field["entry_bounding_box"],
+                float(pdf_height)
+            )
+        else:
+            # Image coordinates - need scaling and Y-flip
+            image_width = page_info["image_width"]
+            image_height = page_info["image_height"]
+            transformed_entry_box = transform_from_image_coords(
+                field["entry_bounding_box"],
+                image_width, image_height,
+                float(pdf_width), float(pdf_height)
+            )
+        # Skip empty fields
+        if "entry_text" not in field or "text" not in field["entry_text"]:
+            continue
+        entry_text = field["entry_text"]
+        text = entry_text["text"]
+        if not text:
+            continue
+        font_name = entry_text.get("font", "Arial")
+        font_size = str(entry_text.get("font_size", 14)) + "pt"
+        font_color = entry_text.get("font_color", "000000")
+        # Font size/color seems to not work reliably across viewers:
+        # https://github.com/py-pdf/pypdf/issues/2084
+        annotation = FreeText(
+            text=text,
+            rect=transformed_entry_box,
+            font=font_name,
+            font_size=font_size,
+            font_color=font_color,
+            border_color=None,
+            background_color=None,
+        )
+        annotations.append(annotation)
+        # page_number is 0-based for pypdf
+        writer.add_annotation(page_number=page_num - 1, annotation=annotation)
+    # Save the filled PDF
+    with open(output_pdf_path, "wb") as output:
+        writer.write(output)
+    print(f"Successfully filled PDF form and saved to {output_pdf_path}")
+    print(f"Added {len(annotations)} text annotations")
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: fill_pdf_form_with_annotations.py [input pdf] [fields.json] [output pdf]")
+        sys.exit(1)
+    input_pdf = sys.argv[1]
+    fields_json = sys.argv[2]
+    output_pdf = sys.argv[3]
+    fill_pdf_form(input_pdf, fields_json, output_pdf)

package/skills/pptx/SKILL.md ADDED Viewed

@@ -0,0 +1,171 @@
+---
+name: pptx
+description: "Presentation creation, editing, and analysis. When Claude needs to work with presentations (.pptx files) for: (1) Creating new presentations, (2) Modifying or editing content, (3) Working with layouts, (4) Adding comments or speaker notes, or any other presentation tasks"
+license: Proprietary. LICENSE.txt has complete terms
+---
+# PPTX Skill
+## Quick Reference
+| Task | Guide |
+|------|-------|
+| Read/analyze content | `python -m markitdown presentation.pptx` |
+| Edit or create from template | Read [editing.md](editing.md) (Python) |
+| Create from scratch | Read [pptxgenjs.md](pptxgenjs.md) (Node.js) |
+---
+## Reading Content
+```bash
+# Text extraction
+python -m markitdown presentation.pptx
+# Visual overview
+python scripts/thumbnail.py presentation.pptx
+# Raw XML
+python scripts/unpack.py presentation.pptx unpacked/
+```
+---
+## Editing Workflow
+**Read [editing.md](editing.md) for full details.**
+1. Analyze template with `thumbnail.py`
+2. Unpack → manipulate slides → edit content → clean → pack
+---
+## Creating from Scratch
+**Read [pptxgenjs.md](pptxgenjs.md) for full details.**
+Use when no template or reference presentation is available.
+**Important:** PptxGenJS is a **Node.js/JavaScript** library. Create a `.js` file and run it with `node script.js`.
+---
+## Design Ideas
+**Don't create boring slides.** Plain bullets on a white background won't impress anyone. Consider ideas from this list for each slide.
+### Before Starting
+- **Pick a bold, content-informed color palette**: The palette should feel designed for THIS topic. If swapping your colors into a completely different presentation would still "work," you haven't made specific enough choices.
+- **Dominance over equality**: One color should dominate (60-70% visual weight), with 1-2 supporting tones and one sharp accent. Never give all colors equal weight.
+- **Dark/light contrast**: Dark backgrounds for title + conclusion slides, light for content ("sandwich" structure). Or commit to dark throughout for a premium feel.
+- **Commit to a visual motif**: Pick ONE distinctive element and repeat it — rounded image frames, icons in colored circles, thick single-side borders. Carry it across every slide.
+### Color Palettes
+Choose colors that match your topic — don't default to generic blue. Use these palettes as inspiration:
+| Theme | Primary | Secondary | Accent |
+|-------|---------|-----------|--------|
+| **Midnight Executive** | `1E2761` (navy) | `CADCFC` (ice blue) | `FFFFFF` (white) |
+| **Forest & Moss** | `2C5F2D` (forest) | `97BC62` (moss) | `F5F5F5` (cream) |
+| **Coral Energy** | `F96167` (coral) | `F9E795` (gold) | `2F3C7E` (navy) |
+| **Warm Terracotta** | `B85042` (terracotta) | `E7E8D1` (sand) | `A7BEAE` (sage) |
+| **Ocean Gradient** | `065A82` (deep blue) | `1C7293` (teal) | `21295C` (midnight) |
+| **Charcoal Minimal** | `36454F` (charcoal) | `F2F2F2` (off-white) | `212121` (black) |
+| **Teal Trust** | `028090` (teal) | `00A896` (seafoam) | `02C39A` (mint) |
+| **Berry & Cream** | `6D2E46` (berry) | `A26769` (dusty rose) | `ECE2D0` (cream) |
+| **Sage Calm** | `84B59F` (sage) | `69A297` (eucalyptus) | `50808E` (slate) |
+| **Cherry Bold** | `990011` (cherry) | `FCF6F5` (off-white) | `2F3C7E` (navy) |
+### For Each Slide
+**Every slide needs a visual element** — image, chart, icon, or shape. Text-only slides are forgettable.
+**Layout options:**
+- Two-column (text left, illustration on right)
+- Icon + text rows (icon in colored circle, bold header, description below)
+- 2x2 or 2x3 grid (image on one side, grid of content blocks on other)
+- Half-bleed image (full left or right side) with content overlay
+**Data display:**
+- Large stat callouts (big numbers 60-72pt with small labels below)
+- Comparison columns (before/after, pros/cons, side-by-side options)
+- Timeline or process flow (numbered steps, arrows)
+**Visual polish:**
+- Icons in small colored circles next to section headers
+- Italic accent text for key stats or taglines
+### Typography
+**Choose an interesting font pairing** — don't default to Arial. Pick a header font with personality and pair it with a clean body font.
+| Header Font | Body Font |
+|-------------|-----------|
+| Georgia | Calibri |
+| Arial Black | Arial |
+| Calibri | Calibri Light |
+| Cambria | Calibri |
+| Trebuchet MS | Calibri |
+| Impact | Arial |
+| Palatino | Garamond |
+| Consolas | Calibri |
+| Element | Size |
+|---------|------|
+| Slide title | 36-44pt bold |
+| Section header | 20-24pt bold |
+| Body text | 14-16pt |
+| Captions | 10-12pt muted |
+### Spacing
+- 0.5" minimum margins
+- 0.3-0.5" between content blocks
+- Leave breathing room—don't fill every inch
+### Avoid (Common Mistakes)
+- **Don't repeat the same layout** — vary columns, cards, and callouts across slides
+- **Don't center body text** — left-align paragraphs and lists; center only titles
+- **Don't skimp on size contrast** — titles need 36pt+ to stand out from 14-16pt body
+- **Don't default to blue** — pick colors that reflect the specific topic
+- **Don't mix spacing randomly** — choose 0.3" or 0.5" gaps and use consistently
+- **Don't style one slide and leave the rest plain** — commit fully or keep it simple throughout
+- **Don't create text-only slides** — add images, icons, charts, or visual elements; avoid plain title + bullets
+- **Don't forget text box padding** — when aligning lines or shapes with text edges, set `margin: 0` on the text box or offset the shape to account for padding
+- **Don't use low-contrast elements** — icons AND text need strong contrast against the background; avoid light text on light backgrounds or dark text on dark backgrounds
+- **NEVER use horizontal lines to separate title and body** — use whitespace or background color instead
+---
+## QA (Required)
+**Assume there are problems. Your job is to find them.**
+Your first render is almost never correct. Approach QA as a bug hunt, not a confirmation step. If you found zero issues on first inspection, you weren't looking hard enough.
+### Content QA
+```bash
+python -m markitdown output.pptx
+```
+Check for missing content, typos, wrong order.
+**When using templates, check for leftover placeholder text:**
+```bash
+python -m markitdown output.pptx | grep -iE "xxxx|lorem|ipsum|this.*(page|slide).*layout"
+```
+If grep returns results, fix them before declaring success.
+---
+## Dependencies
+- `pip install "markitdown[pptx]"` - text extraction
+- `pip install Pillow` - thumbnail grids
+- `npm install -g pptxgenjs` - creating from scratch