npm - @simplysm/sd-claude - Versions diffs - 14.0.46 → 14.0.48 - Mend

@simplysm/sd-claude 14.0.46 → 14.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127)

package/claude/skills/sd-doc-extract/_common.py CHANGED

@@ -65,7 +65,14 @@ def ext_from_content_type(content_type: str) -> str:
 def normalize_cell(text) -> str:
     if text is None:
         return ""
-    return str(text).strip().replace("\n", " ")
+    return (
+        str(text).strip()
+        .replace("\\", "\\\\")
+        .replace("|", "\\|")
+        .replace("\r\n", "<br>")
+        .replace("\n", "<br>")
+        .replace("\r", "<br>")
+    )
 def parse_heading_level(style_name: str) -> int | None:

package/claude/skills/sd-doc-extract/_extract_docx.py CHANGED

@@ -9,6 +9,8 @@ def extract(file_path):
     ensure_packages(PACKAGES)
     from docx import Document
     from docx.oxml.ns import qn
+    from docx.table import Table as DocxTable
+    from docx.text.paragraph import Paragraph
     doc = Document(file_path)
     text_parts = []
@@ -17,47 +19,85 @@ def extract(file_path):
     img_idx = 0
     emb_idx = 0
-    for para in doc.paragraphs:
-        para_img_markers = []
+    def _extract_drawing(drawing):
+        nonlocal img_idx
+        blip = drawing.find(f".//{qn('a:blip')}")
+        if blip is None:
+            return None
+        embed_id = blip.get(qn("r:embed"))
+        if not embed_id:
+            return None
+        rel = doc.part.rels.get(embed_id)
+        if not rel or not hasattr(rel, 'target_part'):
+            return None
+        ext = ext_from_content_type(rel.target_part.content_type)
+        img_idx += 1
+        doc_pr = drawing.find(f".//{qn('wp:docPr')}")
+        alt = ""
+        if doc_pr is not None:
+            alt = doc_pr.get("descr", "") or doc_pr.get("title", "")
+        images.append({
+            "data": rel.target_part.blob,
+            "ext": ext,
+            "context": alt or "paragraph image",
+        })
+        return img_idx
+    def _process_paragraph(element):
+        para = Paragraph(element, doc)
+        style = para.style.name if para.style else ""
+        prefix = ""
+        if "Heading" in style:
+            level = parse_heading_level(style)
+            prefix = "#" * (level or 2) + " "
+        parts = []
         for run in para.runs:
+            if run.text:
+                parts.append(run.text)
             drawings = (run._element.findall(f".//{qn('wp:inline')}") +
                         run._element.findall(f".//{qn('wp:anchor')}"))
-            for drawing in drawings:
-                blip = drawing.find(f".//{qn('a:blip')}")
-                if blip is not None:
-                    embed_id = blip.get(qn("r:embed"))
-                    if embed_id:
-                        rel = doc.part.rels.get(embed_id)
-                        if rel and hasattr(rel, 'target_part'):
-                            ext = ext_from_content_type(rel.target_part.content_type)
-                            img_idx += 1
-                            images.append({
-                                "data": rel.target_part.blob,
-                                "ext": ext,
-                                "context": "paragraph image",
-                            })
-                            para_img_markers.append(f"[IMG:{img_idx}]")
-        text = para.text.strip()
-        if text:
-            style = para.style.name if para.style else ""
-            prefix = ""
-            if "Heading" in style:
-                level = parse_heading_level(style)
-                if level is not None:
-                    prefix = "#" * level + " "
-                else:
-                    prefix = "## "
-            text_parts.append(f"{prefix}{text}")
+            for d in drawings:
+                idx = _extract_drawing(d)
+                if idx is not None:
+                    parts.append(f"[IMG:{idx}]")
-        for marker in para_img_markers:
-            text_parts.append(marker)
+        line = "".join(parts).strip()
+        if line:
+            text_parts.append(f"{prefix}{line}")
-    for t_idx, table in enumerate(doc.tables):
-        text_parts.append(f"\n### Table {t_idx + 1}\n")
-        for row in table.rows:
+    def _process_table(element):
+        table = DocxTable(element, doc)
+        rows = list(table.rows)
+        if not rows:
+            return
+        text_parts.append("")
+        for r_idx, row in enumerate(rows):
             cells = [normalize_cell(cell.text) for cell in row.cells]
             text_parts.append("| " + " | ".join(cells) + " |")
+            if r_idx == 0:
+                text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
+        text_parts.append("")
+    # Iterate body elements in document order (paragraphs and tables interleaved)
+    for child in doc.element.body:
+        tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
+        if tag == 'p':
+            _process_paragraph(child)
+        elif tag == 'tbl':
+            _process_table(child)
+    # Headers and footers
+    for sec_idx, section in enumerate(doc.sections):
+        h_parts = [p.text.strip() for p in section.header.paragraphs if p.text.strip()]
+        f_parts = [p.text.strip() for p in section.footer.paragraphs if p.text.strip()]
+        if h_parts or f_parts:
+            text_parts.append("")
+            text_parts.append(f"[Header/Footer — Section {sec_idx + 1}]")
+            if h_parts:
+                text_parts.append(f"Header: {' | '.join(h_parts)}")
+            if f_parts:
+                text_parts.append(f"Footer: {' | '.join(f_parts)}")
     # OLE embedded objects
     seen = set()

package/claude/skills/sd-doc-extract/_extract_pdf.py CHANGED

@@ -37,11 +37,22 @@ def extract(file_path):
                 if w <= 4 or h <= 4:
                     continue
+            # Get image position on page
+            try:
+                rects = page.get_image_rects(xref)
+                if rects:
+                    r = rects[0]
+                    bbox_str = f" bbox:({r.x0:.0f},{r.y0:.0f},{r.x1:.0f},{r.y1:.0f})"
+                else:
+                    bbox_str = ""
+            except Exception:
+                bbox_str = ""
             img_idx += 1
             images.append({
                 "data": data,
                 "ext": ext,
-                "context": f"Page {page_num}",
+                "context": f"Page {page_num}{bbox_str}",
             })
             page_img_indices[page_num].append(img_idx)

package/claude/skills/sd-doc-extract/_extract_pptx.py CHANGED

@@ -1,8 +1,17 @@
-"""PPTX handler: extract text, images, and OLE embedded objects."""
+"""PPTX handler: render slides to PNG via PowerPoint COM, extract text and OLE embedded.
-from _common import ensure_packages, ext_from_content_type
+Individual image/shape extraction is intentionally omitted — slide screenshots
+contain all visuals including overlay shapes (boxes, arrows, annotations) that
+lose their spatial relationship when decomposed. Requires Windows + Microsoft
+PowerPoint installed.
+"""
-PACKAGES = {"python-pptx": "pptx"}
+import tempfile
+from pathlib import Path
+from _common import ensure_packages
+PACKAGES = {"pywin32": "win32com.client", "python-pptx": "pptx"}
 def _emu_to_inches(emu):
@@ -15,37 +24,107 @@ def _pos(shape):
     return f"(left={_emu_to_inches(shape.left)}\", top={_emu_to_inches(shape.top)}\")"
+def _extract_shapes(shapes, text_parts):
+    for shape in shapes:
+        if shape.shape_type == 6:  # MSO_SHAPE_TYPE.GROUP
+            _extract_shapes(shape.shapes, text_parts)
+        elif shape.has_table:
+            tbl = shape.table
+            text_parts.append(f"[TABLE] {_pos(shape)}")
+            for r_idx, row in enumerate(tbl.rows):
+                cells = [
+                    cell.text.strip().replace("\\", "\\\\").replace("|", "\\|")
+                    .replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
+                    for cell in row.cells
+                ]
+                text_parts.append("| " + " | ".join(cells) + " |")
+                if r_idx == 0:
+                    text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
+        elif hasattr(shape, "text") and shape.text.strip():
+            text = shape.text.strip().replace("\n", "\n       ")
+            text_parts.append(f"[TXT] {_pos(shape)} {text}")
+def _render_slides_via_com(file_path: str, tmp_dir: Path, slide_count: int,
+                           width: int, height: int) -> list[bytes]:
+    import win32com.client
+    import pythoncom
+    pythoncom.CoInitialize()
+    try:
+        app = win32com.client.DispatchEx("PowerPoint.Application")
+        try:
+            try:
+                app.DisplayAlerts = 0
+            except Exception:
+                pass
+            abs_path = str(Path(file_path).resolve())
+            prs = app.Presentations.Open(abs_path, ReadOnly=True, Untitled=False,
+                                         WithWindow=False)
+            try:
+                results = []
+                for i in range(1, slide_count + 1):
+                    tmp_path = tmp_dir / f"__tmp_slide_{i}.png"
+                    prs.Slides(i).Export(str(tmp_path), "PNG", width, height)
+                    results.append(tmp_path.read_bytes())
+                    tmp_path.unlink()
+                return results
+            finally:
+                prs.Close()
+        finally:
+            app.Quit()
+    finally:
+        pythoncom.CoUninitialize()
 def extract(file_path):
     ensure_packages(PACKAGES)
     from pptx import Presentation
-    from pptx.enum.shapes import MSO_SHAPE_TYPE
     prs = Presentation(file_path)
+    slide_count = len(prs.slides)
+    target_width = 1920
+    if prs.slide_width and prs.slide_height:
+        target_height = int(target_width * prs.slide_height / prs.slide_width)
+    else:
+        target_height = 1080
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            slide_pngs = _render_slides_via_com(
+                file_path, Path(tmpdir), slide_count, target_width, target_height
+            )
+    except Exception as e:
+        raise RuntimeError(
+            f"PowerPoint COM rendering failed: {e}. "
+            "This extractor requires Windows with Microsoft PowerPoint installed."
+        ) from e
     text_parts = []
-    images = []
+    slide_images = []
     embedded = []
-    img_idx = 0
     emb_idx = 0
     for slide_num, slide in enumerate(prs.slides, 1):
         text_parts.append(f"[Slide {slide_num}]")
-        for shape in slide.shapes:
-            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                ext = ext_from_content_type(shape.image.content_type)
-                img_idx += 1
-                images.append({
-                    "data": shape.image.blob,
-                    "ext": ext,
-                    "context": f"Slide {slide_num} {_pos(shape)}",
-                })
-                text_parts.append(f"[IMG:{img_idx}]")
-            if hasattr(shape, "text") and shape.text.strip():
-                text = shape.text.strip().replace("\n", "\n       ")
-                text_parts.append(f"[TXT] {_pos(shape)} {text}")
-        # OLE embedded objects from slide relationships
+        slide_images.append({
+            "filename": f"slide_{slide_num:03d}.png",
+            "data": slide_pngs[slide_num - 1],
+        })
+        text_parts.append(f"[SLIDE:{slide_num}]")
+        _extract_shapes(slide.shapes, text_parts)
+        # Speaker notes
+        if slide.has_notes_slide:
+            notes_frame = slide.notes_slide.notes_text_frame
+            notes_text = notes_frame.text.strip() if notes_frame else ""
+            if notes_text:
+                notes_text = notes_text.replace("\n", "\n       ")
+                text_parts.append(f"[Notes] {notes_text}")
         seen = set()
         for rel in slide.part.rels.values():
             reltype = rel.reltype or ""
@@ -69,7 +148,8 @@ def extract(file_path):
     return {
         "text": "\n".join(text_parts),
-        "images": images,
+        "images": [],
         "embedded": embedded,
         "metadata": {},
+        "slide_images": slide_images,
     }

package/claude/skills/sd-doc-extract/_extract_xlsb.py CHANGED

@@ -1,8 +1,36 @@
-"""XLSB handler: extract cell data from binary Excel format."""
+"""XLSB handler: extract cell data and VBA macros from binary Excel format.
+Output format matches the XLSX handler: per sheet, a markdown table with
+Excel column letters as headers and the original row number in the first
+column.  VBA macros are extracted via oletools and appended as fenced code
+blocks.
+"""
 from _common import ensure_packages
-PACKAGES = {"pyxlsb": "pyxlsb"}
+PACKAGES = {"pyxlsb": "pyxlsb", "oletools": "oletools"}
+def _escape_md(v):
+    if v is None:
+        return ""
+    s = str(v).strip()
+    return (
+        s.replace("\\", "\\\\")
+        .replace("|", "\\|")
+        .replace("\r\n", "<br>")
+        .replace("\n", "<br>")
+        .replace("\r", "<br>")
+    )
+def _col_letter(n):
+    # 1-based column index → Excel letter (A, B, ..., Z, AA, AB, ...)
+    s = ""
+    while n > 0:
+        n, r = divmod(n - 1, 26)
+        s = chr(65 + r) + s
+    return s
 def extract(file_path):
@@ -14,10 +42,71 @@ def extract(file_path):
     with open_workbook(file_path) as wb:
         for sheet_name in wb.sheets:
             text_parts.append(f"[Sheet: {sheet_name}]")
+            text_parts.append("")
             with wb.get_sheet(sheet_name) as sheet:
+                rows_data = []
+                max_col = 0
                 for row in sheet.rows():
-                    cells = [str(cell.v) if cell.v is not None else "" for cell in row]
-                    text_parts.append(" | ".join(cells))
+                    if not row:
+                        continue
+                    row_num = row[0].r + 1  # pyxlsb is 0-based
+                    cells = [_escape_md(cell.v) for cell in row]
+                    if len(cells) > max_col:
+                        max_col = len(cells)
+                    rows_data.append((row_num, cells))
+                if not rows_data:
+                    text_parts.append("(empty sheet)")
+                    text_parts.append("")
+                    continue
+                headers = ["Row"] + [_col_letter(c) for c in range(1, max_col + 1)]
+                text_parts.append("| " + " | ".join(headers) + " |")
+                text_parts.append("|" + "|".join(["---"] * len(headers)) + "|")
+                for row_num, cells in rows_data:
+                    padded = list(cells) + [""] * (max_col - len(cells))
+                    text_parts.append(
+                        f"| {row_num} | " + " | ".join(padded[:max_col]) + " |"
+                    )
+                text_parts.append("")
+    # --- VBA macro extraction ---
+    vba_parts = []
+    try:
+        from oletools.olevba import VBA_Parser
+        vba_parser = VBA_Parser(file_path)
+        if vba_parser.detect_vba_macros():
+            vba_parts.append("[VBA Macros]")
+            vba_parts.append("")
+            for filename, stream_path, vba_filename, vba_code in vba_parser.extract_macros():
+                vba_parts.append(f"### {vba_filename}")
+                vba_parts.append(f"<!-- stream: {stream_path} -->")
+                vba_parts.append("")
+                vba_parts.append("```vb")
+                vba_parts.append(vba_code)
+                vba_parts.append("```")
+                vba_parts.append("")
+            analysis = vba_parser.analyze_macros()
+            suspicious = [e for e in analysis if e[0] in ("AutoExec", "Suspicious", "IOC")]
+            if suspicious:
+                vba_parts.append("### Analysis")
+                vba_parts.append("")
+                vba_parts.append("| Type | Keyword | Description |")
+                vba_parts.append("|------|---------|-------------|")
+                for entry_type, keyword, description in suspicious:
+                    vba_parts.append(f"| {entry_type} | `{keyword}` | {description} |")
+                vba_parts.append("")
+        vba_parser.close()
+    except Exception:
+        pass
+    if vba_parts:
+        text_parts.append("")
+        text_parts.extend(vba_parts)
     return {
         "text": "\n".join(text_parts),

package/claude/skills/sd-doc-extract/_extract_xlsx.py CHANGED

@@ -1,4 +1,14 @@
-"""XLSX handler: extract cell data, images, and embedded objects."""
+"""XLSX handler: extract cell data, images, and embedded objects.
+Output format: per sheet, cell data is rendered as a markdown table whose
+column headers are Excel column letters (A, B, C, ...) and whose first
+column is the original Excel row number.  When an image is anchored to a
+row, the current table chunk is flushed, the [IMG:N] placeholder is
+emitted, and a new table (re-rendering the header) resumes from the next
+row.  This preserves the spatial relationship between cell data and
+images while keeping each chunk a valid markdown table that LLMs parse
+natively.
+"""
 import zipfile
 from _common import ensure_packages
@@ -6,9 +16,35 @@ from _common import ensure_packages
 PACKAGES = {"openpyxl": "openpyxl"}
+def _escape_md(v):
+    if v is None:
+        return ""
+    s = str(v).strip()
+    return (
+        s.replace("\\", "\\\\")
+        .replace("|", "\\|")
+        .replace("\r\n", "<br>")
+        .replace("\n", "<br>")
+        .replace("\r", "<br>")
+    )
+def _render_chunk(chunk_rows, max_col, get_col_letter):
+    if not chunk_rows:
+        return []
+    headers = ["Row"] + [get_col_letter(c) for c in range(1, max_col + 1)]
+    out = ["| " + " | ".join(headers) + " |",
+           "|" + "|".join(["---"] * len(headers)) + "|"]
+    for row_num, cells in chunk_rows:
+        padded = list(cells) + [""] * (max_col - len(cells))
+        out.append(f"| {row_num} | " + " | ".join(padded[:max_col]) + " |")
+    return out
 def extract(file_path):
     ensure_packages(PACKAGES)
     from openpyxl import load_workbook
+    from openpyxl.utils import get_column_letter
     from openpyxl.worksheet.worksheet import Worksheet
     wb = load_workbook(file_path, data_only=True)
@@ -21,57 +57,83 @@ def extract(file_path):
     for sheet_name in wb.sheetnames:
         ws = wb[sheet_name]
         text_parts.append(f"[Sheet: {sheet_name}]")
+        text_parts.append("")
         if not isinstance(ws, Worksheet):
             text_parts.append(f"({type(ws).__name__} — 데이터 없음)")
+            text_parts.append("")
             continue
         if ws.max_row is None or ws.max_row == 0:
             text_parts.append("(empty sheet)")
+            text_parts.append("")
             continue
-        # Collect images for this sheet with anchor row info
-        ws_images = getattr(ws, '_images', [])
-        row_img_markers = {}  # row_number -> list of img_idx
+        # Merged cells annotation
+        merged = list(ws.merged_cells.ranges)
+        if merged:
+            text_parts.append(f"[Merged: {', '.join(str(r) for r in merged)}]")
+            text_parts.append("")
+        ws_images = getattr(ws, "_images", [])
+        row_img_markers = {}
         for img in ws_images:
-            data_fn = getattr(img, '_data', None)
+            data_fn = getattr(img, "_data", None)
             blob = data_fn() if callable(data_fn) else b""
-            if blob:
-                img_idx += 1
-                images.append({
-                    "data": blob,
-                    "ext": "png",
-                    "context": f"sheet '{sheet_name}'",
-                })
-                anchor = getattr(img, 'anchor', None)
-                anchor_row = None
-                if anchor:
-                    _from = getattr(anchor, '_from', None)
-                    if _from:
-                        anchor_row = getattr(_from, 'row', None)
-                        if anchor_row is not None:
-                            anchor_row += 1  # openpyxl anchor is 0-based
-                if anchor_row is None:
-                    anchor_row = ws.max_row or 1
-                row_img_markers.setdefault(anchor_row, []).append(img_idx)
-        # Output rows with inline image markers at anchor positions
+            if not blob:
+                continue
+            img_idx += 1
+            anchor = getattr(img, "anchor", None)
+            anchor_row = None
+            anchor_col = None
+            if anchor:
+                _from = getattr(anchor, "_from", None)
+                if _from:
+                    anchor_row = getattr(_from, "row", None)
+                    anchor_col = getattr(_from, "col", None)
+                    if anchor_row is not None:
+                        anchor_row += 1
+                    if anchor_col is not None:
+                        anchor_col += 1
+            if anchor_row is None:
+                anchor_row = ws.max_row or 1
+            cell_ref = ""
+            if anchor_col is not None:
+                cell_ref = f" anchor:{get_column_letter(anchor_col)}{anchor_row}"
+            else:
+                cell_ref = f" anchor:row {anchor_row}"
+            images.append({
+                "data": blob,
+                "ext": "png",
+                "context": f"sheet '{sheet_name}'{cell_ref}",
+            })
+            row_img_markers.setdefault(anchor_row, []).append(img_idx)
+        max_col = ws.max_column or 1
+        chunk = []
         for row in ws.iter_rows(values_only=False):
-            cells = []
-            for cell in row:
-                val = cell.value
-                cells.append(str(val).strip() if val is not None else "")
             row_num = row[0].row
-            text_parts.append(f"[{row[0].column_letter}{row_num}] " + " | ".join(cells))
-            for idx in row_img_markers.get(row_num, []):
-                text_parts.append(f"[IMG:{idx}]")
+            cells = [_escape_md(c.value) for c in row]
+            chunk.append((row_num, cells))
+            if row_num in row_img_markers:
+                text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
+                text_parts.append("")
+                for idx in row_img_markers[row_num]:
+                    text_parts.append(f"[IMG:{idx}]")
+                text_parts.append("")
+                chunk = []
+        if chunk:
+            text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
+            text_parts.append("")
-    # Embedded objects from XLSX ZIP
     try:
-        with zipfile.ZipFile(file_path, 'r') as zf:
+        with zipfile.ZipFile(file_path, "r") as zf:
             for name in zf.namelist():
-                if 'embeddings/' in name.lower():
-                    filename = name.split('/')[-1]
+                if "embeddings/" in name.lower():
+                    filename = name.split("/")[-1]
                     data = zf.read(name)
                     emb_idx += 1
                     embedded.append({"filename": filename, "data": data})

package/claude/skills/sd-doc-extract/extract.py CHANGED

@@ -44,6 +44,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
             "context": img.get("context", ""),
         })
+    # Save pre-named slide images (PPTX screenshots)
+    saved_slides = []
+    for s in result.get("slide_images", []):
+        slide_path = out_dir / s["filename"]
+        slide_path.write_bytes(s["data"])
+        saved_slides.append({"filename": s["filename"], "size": len(s["data"])})
     # Save embedded/attached files and recurse
     prefix = "attachment" if result.get("metadata", {}).get("email_headers") else "embedded"
     saved_embedded = []
@@ -68,12 +75,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
         saved_embedded.append(entry)
     # Generate {stem}.md index in parent of out_dir
-    _generate_index_md(out_dir, file_path, result, saved_images, saved_embedded)
+    _generate_index_md(out_dir, file_path, result, saved_images, saved_embedded, saved_slides)
 def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
-                       saved_images: list, saved_embedded: list):
+                       saved_images: list, saved_embedded: list,
+                       saved_slides: list | None = None):
     """Generate {stem}.md in parent of out_dir, summarizing extraction results.
     Images and embedded files are placed inline via [IMG:N]/[EMB:N] placeholders
@@ -120,7 +128,8 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
             referenced_imgs.add(idx)
             if 1 <= idx <= len(saved_images):
                 img = saved_images[idx - 1]
-                return f"![{img['filename']}]({rel_prefix}/{img['filename']})"
+                alt = img.get('context', '') or img['filename']
+                return f"![{alt}]({rel_prefix}/{img['filename']})"
             return m.group(0)
         def replace_emb(m):
@@ -136,8 +145,18 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
                     return f"> embedded: [{name}]({rel_prefix}/{name})"
             return m.group(0)
+        slides_list = saved_slides or []
+        def replace_slide(m):
+            idx = int(m.group(1))
+            if 1 <= idx <= len(slides_list):
+                fname = slides_list[idx - 1]["filename"]
+                return f"![{fname}]({rel_prefix}/{fname})"
+            return m.group(0)
         text = re.sub(r'\[IMG:(\d+)\]', replace_img, text)
         text = re.sub(r'\[EMB:(\d+)\]', replace_emb, text)
+        text = re.sub(r'\[SLIDE:(\d+)\]', replace_slide, text)
         if len(text) > 10000:
             body_path = out_dir / "body.txt"