PyPI - exstruct - Versions diffs - 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

exstruct/__init__.py +23 -12
exstruct/cli/main.py +20 -0
exstruct/core/backends/__init__.py +7 -0
exstruct/core/backends/base.py +42 -0
exstruct/core/backends/com_backend.py +230 -0
exstruct/core/backends/openpyxl_backend.py +191 -0
exstruct/core/cells.py +999 -483
exstruct/core/charts.py +243 -241
exstruct/core/integrate.py +42 -375
exstruct/core/logging_utils.py +16 -0
exstruct/core/modeling.py +87 -0
exstruct/core/pipeline.py +749 -0
exstruct/core/ranges.py +48 -0
exstruct/core/shapes.py +282 -36
exstruct/core/workbook.py +114 -0
exstruct/engine.py +51 -123
exstruct/errors.py +12 -1
exstruct/io/__init__.py +130 -138
exstruct/io/serialize.py +112 -0
exstruct/models/__init__.py +58 -8
exstruct/render/__init__.py +3 -7
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/METADATA +133 -18
exstruct-0.3.2.dist-info/RECORD +30 -0
exstruct-0.2.80.dist-info/RECORD +0 -20
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/WHEEL +0 -0
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/entry_points.txt +0 -0

exstruct/core/integrate.py CHANGED Viewed

@@ -1,388 +1,55 @@
 from __future__ import annotations
-import logging
-import os
 from pathlib import Path
-from typing import Any, Literal, cast
+from typing import Literal
-from openpyxl import load_workbook
-from openpyxl.utils import range_boundaries
-import xlwings as xw
-from ..models import CellRow, PrintArea, Shape, SheetData, WorkbookData
-from .cells import (
-    detect_tables,
-    detect_tables_openpyxl,
-    extract_sheet_cells,
-    extract_sheet_cells_with_links,
-)
-from .charts import get_charts
-from .shapes import get_shapes_with_position
-logger = logging.getLogger(__name__)
-_ALLOWED_MODES: set[str] = {"light", "standard", "verbose"}
-def _find_open_workbook(file_path: Path) -> xw.Book | None:
-    """Return an existing workbook if already open in Excel; otherwise None."""
-    try:
-        for app in xw.apps:
-            for wb in app.books:
-                try:
-                    if Path(wb.fullname).resolve() == file_path.resolve():
-                        return wb
-                except Exception:
-                    continue
-    except Exception:
-        return None
-    return None
-def _open_workbook(file_path: Path) -> tuple[xw.Book, bool]:
-    """
-    Open workbook:
-    - If already open, reuse and do not close Excel on exit.
-    - Otherwise create invisible Excel (visible=False) and close when done.
-    Returns (workbook, should_close_app).
-    """
-    existing = _find_open_workbook(file_path)
-    if existing:
-        return existing, False
-    app = xw.App(add_book=False, visible=False)
-    wb = app.books.open(str(file_path))
-    return wb, True
-def _parse_print_area_range(
-    range_str: str, *, zero_based: bool = True
-) -> tuple[int, int, int, int] | None:
-    """
-    Parse an Excel range string into (r1, c1, r2, c2). Returns None on failure.
-    """
-    cleaned = range_str.strip()
-    if not cleaned:
-        return None
-    if "!" in cleaned:
-        cleaned = cleaned.split("!", 1)[1]
-    try:
-        min_col, min_row, max_col, max_row = range_boundaries(cleaned)
-    except Exception:
-        return None
-    if zero_based:
-        return (min_row - 1, min_col - 1, max_row - 1, max_col - 1)
-    return (min_row, min_col, max_row, max_col)
-def _extract_print_areas_openpyxl(  # noqa: C901
-    file_path: Path,
-) -> dict[str, list[PrintArea]]:
-    """
-    Extract print areas per sheet using openpyxl defined names.
-    Returns {sheet_name: [PrintArea, ...]}.
-    """
-    try:
-        wb = load_workbook(file_path, data_only=True, read_only=True)
-    except Exception:
-        return {}
-    try:
-        defined = wb.defined_names.get("_xlnm.Print_Area")
-        areas: dict[str, list[PrintArea]] = {}
-        if defined:
-            for sheet_name, range_str in defined.destinations:
-                if sheet_name not in wb.sheetnames:
-                    continue
-                # A single destination can contain multiple comma-separated ranges.
-                for part in str(range_str).split(","):
-                    parsed = _parse_print_area_range(part)
-                    if not parsed:
-                        continue
-                    r1, c1, r2, c2 = parsed
-                    areas.setdefault(sheet_name, []).append(
-                        PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
-                    )
-        # Fallback: some files carry sheet-level print_area without defined name.
-        if not areas:
-            for ws in wb.worksheets:
-                pa = getattr(ws, "_print_area", None)
-                if not pa:
-                    continue
-                for part in str(pa).split(","):
-                    parsed = _parse_print_area_range(part)
-                    if not parsed:
-                        continue
-                    r1, c1, r2, c2 = parsed
-                    areas.setdefault(ws.title, []).append(
-                        PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
-                    )
-        return areas
-    finally:
-        try:
-            wb.close()
-        except Exception:
-            pass
-def _extract_print_areas_com(workbook: xw.Book) -> dict[str, list[PrintArea]]:
-    """
-    Extract print areas per sheet via xlwings/COM.
-    Uses Sheet.PageSetup.PrintArea which may contain comma-separated ranges.
-    """
-    areas: dict[str, list[PrintArea]] = {}
-    for sheet in workbook.sheets:
-        try:
-            raw = sheet.api.PageSetup.PrintArea or ""
-        except Exception:
-            continue
-        if not raw:
-            continue
-        parts = str(raw).split(",")
-        for part in parts:
-            parsed = _parse_print_area_range(part, zero_based=True)
-            if not parsed:
-                continue
-            r1, c1, r2, c2 = parsed
-            areas.setdefault(sheet.name, []).append(
-                PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
-            )
-    return areas
-def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
-    """
-    Strip sheet name from a range part when it matches the target sheet; otherwise None.
-    """
-    s = part.strip()
-    if "!" not in s:
-        return s
-    sheet, rng = s.rsplit("!", 1)
-    sheet = sheet.strip()
-    if sheet.startswith("'") and sheet.endswith("'"):
-        sheet = sheet[1:-1].replace("''", "'")
-    return rng if sheet == ws_name else None
-def _split_csv_respecting_quotes(raw: str) -> list[str]:
-    """
-    Split a CSV-like string while keeping commas inside single quotes intact.
-    """
-    parts: list[str] = []
-    buf: list[str] = []
-    in_quote = False
-    i = 0
-    while i < len(raw):
-        ch = raw[i]
-        if ch == "'":
-            if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
-                buf.append("''")
-                i += 2
-                continue
-            in_quote = not in_quote
-            buf.append(ch)
-            i += 1
-            continue
-        if ch == "," and not in_quote:
-            parts.append("".join(buf).strip())
-            buf = []
-            i += 1
-            continue
-        buf.append(ch)
-        i += 1
-    if buf:
-        parts.append("".join(buf).strip())
-    return [p for p in parts if p]
-def _compute_auto_page_break_areas(workbook: xw.Book) -> dict[str, list[PrintArea]]:
-    """
-    Compute auto page-break rectangles per sheet using Excel COM.
-    Falls back to empty dict on failure.
-    """
-    results: dict[str, list[PrintArea]] = {}
-    for sheet in workbook.sheets:
-        try:
-            ws_api = cast(Any, sheet.api)  # xlwings COM API; treated as Any
-            original_display: bool | None = ws_api.DisplayPageBreaks
-            ws_api.DisplayPageBreaks = True
-            print_area = ws_api.PageSetup.PrintArea or ws_api.UsedRange.Address
-            parts_raw = _split_csv_respecting_quotes(str(print_area))
-            area_parts: list[str] = []
-            for part in parts_raw:
-                rng = _normalize_area_for_sheet(part, sheet.name)
-                if rng:
-                    area_parts.append(rng)
-            hpb = cast(Any, ws_api.HPageBreaks)
-            vpb = cast(Any, ws_api.VPageBreaks)
-            h_break_rows = [
-                hpb.Item(i).Location.Row for i in range(1, int(hpb.Count) + 1)
-            ]
-            v_break_cols = [
-                vpb.Item(i).Location.Column for i in range(1, int(vpb.Count) + 1)
-            ]
-            for addr in area_parts:
-                range_obj = cast(Any, ws_api.Range(addr))
-                min_row = int(range_obj.Row)
-                max_row = min_row + int(range_obj.Rows.Count) - 1
-                min_col = int(range_obj.Column)
-                max_col = min_col + int(range_obj.Columns.Count) - 1
-                rows = (
-                    [min_row]
-                    + [r for r in h_break_rows if min_row < r <= max_row]
-                    + [max_row + 1]
-                )
-                cols = (
-                    [min_col]
-                    + [c for c in v_break_cols if min_col < c <= max_col]
-                    + [max_col + 1]
-                )
-                for i in range(len(rows) - 1):
-                    r1, r2 = rows[i], rows[i + 1] - 1
-                    for j in range(len(cols) - 1):
-                        c1, c2 = cols[j], cols[j + 1] - 1
-                        c1_0 = c1 - 1
-                        c2_0 = c2 - 1
-                        results.setdefault(sheet.name, []).append(
-                            PrintArea(r1=r1, c1=c1_0, r2=r2, c2=c2_0)
-                        )
-            if original_display is not None:
-                ws_api.DisplayPageBreaks = original_display
-        except Exception:
-            try:
-                if original_display is not None:
-                    ws_api.DisplayPageBreaks = original_display
-            except Exception:
-                pass
-            continue
-    return results
-def integrate_sheet_content(
-    cell_data: dict[str, list[CellRow]],
-    shape_data: dict[str, list[Shape]],
-    workbook: xw.Book,
-    mode: Literal["light", "standard", "verbose"] = "standard",
-    print_area_data: dict[str, list[PrintArea]] | None = None,
-    auto_page_break_data: dict[str, list[PrintArea]] | None = None,
-) -> dict[str, SheetData]:
-    """Integrate cells, shapes, charts, and tables into SheetData per sheet."""
-    result: dict[str, SheetData] = {}
-    for sheet_name, rows in cell_data.items():
-        sheet_shapes = shape_data.get(sheet_name, [])
-        sheet = workbook.sheets[sheet_name]
-        sheet_model = SheetData(
-            rows=rows,
-            shapes=sheet_shapes,
-            charts=[] if mode == "light" else get_charts(sheet, mode=mode),
-            table_candidates=detect_tables(sheet),
-            print_areas=print_area_data.get(sheet_name, []) if print_area_data else [],
-            auto_print_areas=auto_page_break_data.get(sheet_name, [])
-            if auto_page_break_data
-            else [],
-        )
-        result[sheet_name] = sheet_model
-    return result
+from ..models import WorkbookData
+from .pipeline import resolve_extraction_inputs, run_extraction_pipeline
 def extract_workbook(  # noqa: C901
     file_path: str | Path,
     mode: Literal["light", "standard", "verbose"] = "standard",
     *,
-    include_cell_links: bool = False,
-    include_print_areas: bool = True,
+    include_cell_links: bool | None = None,
+    include_print_areas: bool | None = None,
     include_auto_page_breaks: bool = False,
+    include_colors_map: bool | None = None,
+    include_default_background: bool = False,
+    ignore_colors: set[str] | None = None,
+    include_merged_cells: bool | None = None,
 ) -> WorkbookData:
-    """Extract workbook and return WorkbookData; fallback to cells+tables if Excel COM is unavailable."""
-    if mode not in _ALLOWED_MODES:
-        raise ValueError(f"Unsupported mode: {mode}")
-    normalized_file_path = file_path if isinstance(file_path, Path) else Path(file_path)
-    cell_data = (
-        extract_sheet_cells_with_links(normalized_file_path)
-        if include_cell_links
-        else extract_sheet_cells(normalized_file_path)
+    """Extract workbook and return WorkbookData.
+    Falls back to cells+tables if Excel COM is unavailable.
+    Args:
+        file_path: Workbook path.
+        mode: Extraction mode.
+        include_cell_links: Whether to include cell hyperlinks; None uses mode defaults.
+        include_print_areas: Whether to include print areas; None defaults to True.
+        include_auto_page_breaks: Whether to include auto page breaks.
+        include_colors_map: Whether to include colors map; None uses mode defaults.
+        include_default_background: Whether to include default background color.
+        ignore_colors: Optional set of color keys to ignore.
+        include_merged_cells: Whether to include merged cell ranges; None uses mode defaults.
+    Returns:
+        Extracted WorkbookData.
+    Raises:
+        ValueError: If mode is unsupported.
+    """
+    inputs = resolve_extraction_inputs(
+        file_path,
+        mode=mode,
+        include_cell_links=include_cell_links,
+        include_print_areas=include_print_areas,
+        include_auto_page_breaks=include_auto_page_breaks,
+        include_colors_map=include_colors_map,
+        include_default_background=include_default_background,
+        ignore_colors=ignore_colors,
+        include_merged_cells=include_merged_cells,
     )
-    print_area_data: dict[str, list[PrintArea]] = {}
-    if include_print_areas:
-        print_area_data = _extract_print_areas_openpyxl(normalized_file_path)
-    auto_page_break_data: dict[str, list[PrintArea]] = {}
-    def _cells_and_tables_only(reason: str) -> WorkbookData:
-        sheets: dict[str, SheetData] = {}
-        for sheet_name, rows in cell_data.items():
-            try:
-                tables = detect_tables_openpyxl(normalized_file_path, sheet_name)
-            except Exception:
-                tables = []
-            sheets[sheet_name] = SheetData(
-                rows=rows,
-                shapes=[],
-                charts=[],
-                table_candidates=tables,
-                print_areas=print_area_data.get(sheet_name, [])
-                if include_print_areas
-                else [],
-                auto_print_areas=[],
-            )
-        logger.warning(
-            "%s Falling back to cells+tables only; shapes and charts will be empty.",
-            reason,
-        )
-        return WorkbookData(book_name=normalized_file_path.name, sheets=sheets)
-    if mode == "light":
-        return _cells_and_tables_only("Light mode selected.")
-    if os.getenv("SKIP_COM_TESTS"):
-        return _cells_and_tables_only(
-            "SKIP_COM_TESTS is set; skipping COM/xlwings access."
-        )
-    try:
-        wb, close_app = _open_workbook(normalized_file_path)
-    except Exception as e:
-        return _cells_and_tables_only(f"xlwings/Excel COM is unavailable. ({e!r})")
-    try:
-        try:
-            shape_data = get_shapes_with_position(wb, mode=mode)
-            if include_print_areas and not print_area_data:
-                # openpyxl couldn't read (e.g., .xls). Try COM as a fallback.
-                try:
-                    print_area_data = _extract_print_areas_com(wb)
-                except Exception:
-                    print_area_data = {}
-            if include_auto_page_breaks:
-                try:
-                    auto_page_break_data = _compute_auto_page_break_areas(wb)
-                except Exception:
-                    auto_page_break_data = {}
-            merged = integrate_sheet_content(
-                cell_data,
-                shape_data,
-                wb,
-                mode=mode,
-                print_area_data=print_area_data if include_print_areas else None,
-                auto_page_break_data=auto_page_break_data
-                if include_auto_page_breaks
-                else None,
-            )
-            return WorkbookData(book_name=normalized_file_path.name, sheets=merged)
-        except Exception as e:
-            logger.warning(
-                "Shape extraction failed; falling back to cells+tables. (%r)", e
-            )
-            return _cells_and_tables_only(f"Shape extraction failed ({e!r}).")
-    finally:
-        # Close only if we created the app to avoid shutting user sessions.
-        try:
-            if close_app:
-                app = wb.app
-                wb.close()
-                app.quit()
-        except Exception:
-            pass
+    result = run_extraction_pipeline(inputs)
+    return result.workbook

exstruct/core/logging_utils.py ADDED Viewed

@@ -0,0 +1,16 @@
+from __future__ import annotations
+import logging
+from ..errors import FallbackReason
+def log_fallback(logger: logging.Logger, reason: FallbackReason, message: str) -> None:
+    """Log a standardized fallback warning.
+    Args:
+        logger: Logger instance to emit the warning.
+        reason: Fallback reason code.
+        message: Human-readable detail message.
+    """
+    logger.warning("[%s] %s", reason.value, message)

exstruct/core/modeling.py ADDED Viewed

@@ -0,0 +1,87 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from ..models import (
+    Arrow,
+    CellRow,
+    Chart,
+    MergedCell,
+    PrintArea,
+    Shape,
+    SheetData,
+    SmartArt,
+    WorkbookData,
+)
+@dataclass(frozen=True)
+class SheetRawData:
+    """Raw, extracted sheet data before model conversion.
+    Attributes:
+        rows: Extracted cell rows.
+        shapes: Extracted shapes.
+        charts: Extracted charts.
+        table_candidates: Detected table ranges.
+        print_areas: Extracted print areas.
+        auto_print_areas: Extracted auto page-break areas.
+        colors_map: Mapping of color keys to (row, column) positions.
+        merged_cells: Extracted merged cell ranges.
+    """
+    rows: list[CellRow]
+    shapes: list[Shape | Arrow | SmartArt]
+    charts: list[Chart]
+    table_candidates: list[str]
+    print_areas: list[PrintArea]
+    auto_print_areas: list[PrintArea]
+    colors_map: dict[str, list[tuple[int, int]]]
+    merged_cells: list[MergedCell]
+@dataclass(frozen=True)
+class WorkbookRawData:
+    """Raw, extracted workbook data before model conversion.
+    Attributes:
+        book_name: Workbook file name.
+        sheets: Mapping of sheet name to raw sheet data.
+    """
+    book_name: str
+    sheets: dict[str, SheetRawData]
+def build_sheet_data(raw: SheetRawData) -> SheetData:
+    """Build a SheetData model from raw sheet data.
+    Args:
+        raw: Raw sheet data.
+    Returns:
+        SheetData model instance.
+    """
+    return SheetData(
+        rows=raw.rows,
+        shapes=raw.shapes,
+        charts=raw.charts,
+        table_candidates=raw.table_candidates,
+        print_areas=raw.print_areas,
+        auto_print_areas=raw.auto_print_areas,
+        colors_map=raw.colors_map,
+        merged_cells=raw.merged_cells,
+    )
+def build_workbook_data(raw: WorkbookRawData) -> WorkbookData:
+    """Build a WorkbookData model from raw workbook data.
+    Args:
+        raw: Raw workbook data.
+    Returns:
+        WorkbookData model instance.
+    """
+    sheets = {name: build_sheet_data(sheet) for name, sheet in raw.sheets.items()}
+    return WorkbookData(book_name=raw.book_name, sheets=sheets)

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl