PyPI - exstruct - Versions diffs - 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl - Mend

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

exstruct/__init__.py +23 -12
exstruct/cli/main.py +20 -0
exstruct/core/backends/__init__.py +7 -0
exstruct/core/backends/base.py +42 -0
exstruct/core/backends/com_backend.py +230 -0
exstruct/core/backends/openpyxl_backend.py +191 -0
exstruct/core/cells.py +999 -483
exstruct/core/charts.py +243 -241
exstruct/core/integrate.py +42 -375
exstruct/core/logging_utils.py +16 -0
exstruct/core/modeling.py +87 -0
exstruct/core/pipeline.py +749 -0
exstruct/core/ranges.py +48 -0
exstruct/core/shapes.py +282 -36
exstruct/core/workbook.py +114 -0
exstruct/engine.py +51 -123
exstruct/errors.py +12 -1
exstruct/io/__init__.py +130 -138
exstruct/io/serialize.py +112 -0
exstruct/models/__init__.py +58 -8
exstruct/render/__init__.py +3 -7
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/METADATA +133 -18
exstruct-0.3.2.dist-info/RECORD +30 -0
exstruct-0.2.80.dist-info/RECORD +0 -20
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/WHEEL +0 -0
{exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/entry_points.txt +0 -0

exstruct/__init__.py CHANGED Viewed

@@ -7,9 +7,11 @@ from typing import Literal, TextIO
 from .core.cells import set_table_detection_params
 from .core.integrate import extract_workbook
 from .engine import (
+    ColorsOptions,
     DestinationOptions,
     ExStructEngine,
     FilterOptions,
+    FormatOptions,
     OutputOptions,
     StructOptions,
 )
@@ -75,7 +77,9 @@ __all__ = [
     "StructOptions",
     "OutputOptions",
     "FilterOptions",
+    "FormatOptions",
     "DestinationOptions",
+    "ColorsOptions",
     "serialize_workbook",
     "export_auto_page_breaks",
 ]
@@ -93,7 +97,7 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> Workboo
         mode: "light" / "standard" / "verbose"
             - light: cells + table detection only (no COM, shapes/charts empty). Print areas via openpyxl.
             - standard: texted shapes + arrows + charts (COM if available), print areas included. Shape/chart size is kept but hidden by default in output.
-            - verbose: all shapes (including textless) with size, charts with size.
+            - verbose: all shapes (including textless) with size, charts with size, and colors_map.
     Returns:
         WorkbookData containing sheets, rows, shapes, charts, and print areas.
@@ -110,8 +114,13 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> Workboo
         ['A1:B5']
     """
     include_links = True if mode == "verbose" else False
+    include_colors_map = True if mode == "verbose" else None
     engine = ExStructEngine(
-        options=StructOptions(mode=mode, include_cell_links=include_links)
+        options=StructOptions(
+            mode=mode,
+            include_cell_links=include_links,
+            include_colors_map=include_colors_map,
+        )
     )
     return engine.extract(file_path, mode=mode)
@@ -358,16 +367,18 @@ def process_excel(
     engine = ExStructEngine(
         options=StructOptions(mode=mode),
         output=OutputOptions(
-            fmt=out_fmt,
-            pretty=pretty,
-            indent=indent,
-            sheets_dir=sheets_dir,
-            print_areas_dir=print_areas_dir,
-            auto_page_breaks_dir=auto_page_breaks_dir,
-            include_print_areas=None if mode == "light" else True,
-            include_shape_size=True if mode == "verbose" else False,
-            include_chart_size=True if mode == "verbose" else False,
-            stream=stream,
+            format=FormatOptions(fmt=out_fmt, pretty=pretty, indent=indent),
+            filters=FilterOptions(
+                include_print_areas=None if mode == "light" else True,
+                include_shape_size=True if mode == "verbose" else False,
+                include_chart_size=True if mode == "verbose" else False,
+            ),
+            destinations=DestinationOptions(
+                sheets_dir=sheets_dir,
+                print_areas_dir=print_areas_dir,
+                auto_page_breaks_dir=auto_page_breaks_dir,
+                stream=stream,
+            ),
         ),
     )
     engine.process(

exstruct/cli/main.py CHANGED Viewed

@@ -2,11 +2,30 @@ from __future__ import annotations
 import argparse
 from pathlib import Path
+import sys
 from exstruct import process_excel
 from exstruct.cli.availability import ComAvailability, get_com_availability
+def _ensure_utf8_stdout() -> None:
+    """Reconfigure stdout to UTF-8 when supported.
+    Windows consoles default to cp932 and can raise encoding errors when piping
+    non-ASCII characters. Reconfiguring prevents failures without affecting
+    environments that already default to UTF-8.
+    """
+    stdout = sys.stdout
+    if not hasattr(stdout, "reconfigure"):
+        return
+    reconfigure = stdout.reconfigure
+    try:
+        reconfigure(encoding="utf-8", errors="replace")
+    except (AttributeError, ValueError):
+        return
 def _add_auto_page_breaks_argument(
     parser: argparse.ArgumentParser, availability: ComAvailability
 ) -> None:
@@ -102,6 +121,7 @@ def main(argv: list[str] | None = None) -> int:
     Returns:
         Exit code (0 for success, 1 for failure).
     """
+    _ensure_utf8_stdout()
     parser = build_parser()
     args = parser.parse_args(argv)

exstruct/core/backends/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+from __future__ import annotations
+from .base import Backend
+from .com_backend import ComBackend
+from .openpyxl_backend import OpenpyxlBackend
+__all__ = ["Backend", "ComBackend", "OpenpyxlBackend"]

exstruct/core/backends/base.py ADDED Viewed

@@ -0,0 +1,42 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Protocol
+from ...models import CellRow, MergedCell, PrintArea
+from ..cells import WorkbookColorsMap
+CellData = dict[str, list[CellRow]]
+PrintAreaData = dict[str, list[PrintArea]]
+MergedCellData = dict[str, list[MergedCell]]
+@dataclass(frozen=True)
+class BackendConfig:
+    """Configuration options shared across backends.
+    Attributes:
+        include_default_background: Whether to include default background colors.
+        ignore_colors: Optional set of color keys to ignore.
+    """
+    include_default_background: bool
+    ignore_colors: set[str] | None
+class Backend(Protocol):
+    """Protocol for backend implementations."""
+    def extract_cells(self, *, include_links: bool) -> CellData:
+        """Extract cell rows from the workbook."""
+    def extract_print_areas(self) -> PrintAreaData:
+        """Extract print areas from the workbook."""
+    def extract_colors_map(
+        self, *, include_default_background: bool, ignore_colors: set[str] | None
+    ) -> WorkbookColorsMap | None:
+        """Extract colors map from the workbook."""
+    def extract_merged_cells(self) -> MergedCellData:
+        """Extract merged cell ranges from the workbook."""

exstruct/core/backends/com_backend.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""COM backend for Excel workbook extraction via xlwings."""
+from __future__ import annotations
+from dataclasses import dataclass
+import logging
+from typing import Any, cast
+import xlwings as xw
+from ...models import PrintArea
+from ..cells import WorkbookColorsMap, extract_sheet_colors_map_com
+from ..ranges import parse_range_zero_based
+from .base import MergedCellData, PrintAreaData
+logger = logging.getLogger(__name__)
+@dataclass(frozen=True)
+class ComBackend:
+    """COM-based backend for extraction tasks.
+    Attributes:
+        workbook: xlwings workbook instance.
+    """
+    workbook: xw.Book
+    def extract_print_areas(self) -> PrintAreaData:
+        """Extract print areas per sheet via xlwings/COM.
+        Returns:
+            Mapping of sheet name to print area list.
+        """
+        areas: PrintAreaData = {}
+        for sheet in self.workbook.sheets:
+            raw = ""
+            try:
+                raw = sheet.api.PageSetup.PrintArea or ""
+            except Exception as exc:
+                logger.warning(
+                    "Failed to read print area via COM for sheet '%s'. (%r)",
+                    sheet.name,
+                    exc,
+                )
+            if not raw:
+                continue
+            for part in str(raw).split(","):
+                parsed = _parse_print_area_range(part)
+                if not parsed:
+                    continue
+                r1, c1, r2, c2 = parsed
+                areas.setdefault(sheet.name, []).append(
+                    PrintArea(r1=r1 + 1, c1=c1, r2=r2 + 1, c2=c2)
+                )
+        return areas
+    def extract_colors_map(
+        self, *, include_default_background: bool, ignore_colors: set[str] | None
+    ) -> WorkbookColorsMap | None:
+        """Extract colors_map via COM; logs and skips on failure.
+        Args:
+            include_default_background: Whether to include default backgrounds.
+            ignore_colors: Optional set of color keys to ignore.
+        Returns:
+            WorkbookColorsMap or None when extraction fails.
+        """
+        try:
+            return extract_sheet_colors_map_com(
+                self.workbook,
+                include_default_background=include_default_background,
+                ignore_colors=ignore_colors,
+            )
+        except Exception as exc:
+            logger.warning(
+                "COM color map extraction failed; falling back to openpyxl. (%r)",
+                exc,
+            )
+            return None
+    def extract_auto_page_breaks(self) -> PrintAreaData:
+        """Compute auto page-break rectangles per sheet using Excel COM.
+        Returns:
+            Mapping of sheet name to auto page-break areas.
+        """
+        results: PrintAreaData = {}
+        for sheet in self.workbook.sheets:
+            ws_api: Any | None = None
+            original_display: bool | None = None
+            failed = False
+            try:
+                ws_api = cast(Any, sheet.api)
+                original_display = ws_api.DisplayPageBreaks
+                ws_api.DisplayPageBreaks = True
+                print_area = ws_api.PageSetup.PrintArea or ws_api.UsedRange.Address
+                parts_raw = _split_csv_respecting_quotes(str(print_area))
+                area_parts: list[str] = []
+                for part in parts_raw:
+                    rng = _normalize_area_for_sheet(part, sheet.name)
+                    if rng:
+                        area_parts.append(rng)
+                hpb = cast(Any, ws_api.HPageBreaks)
+                vpb = cast(Any, ws_api.VPageBreaks)
+                h_break_rows = [
+                    hpb.Item(i).Location.Row for i in range(1, int(hpb.Count) + 1)
+                ]
+                v_break_cols = [
+                    vpb.Item(i).Location.Column for i in range(1, int(vpb.Count) + 1)
+                ]
+                for addr in area_parts:
+                    range_obj = cast(Any, ws_api.Range(addr))
+                    min_row = int(range_obj.Row)
+                    max_row = min_row + int(range_obj.Rows.Count) - 1
+                    min_col = int(range_obj.Column)
+                    max_col = min_col + int(range_obj.Columns.Count) - 1
+                    rows = (
+                        [min_row]
+                        + [r for r in h_break_rows if min_row < r <= max_row]
+                        + [max_row + 1]
+                    )
+                    cols = (
+                        [min_col]
+                        + [c for c in v_break_cols if min_col < c <= max_col]
+                        + [max_col + 1]
+                    )
+                    for i in range(len(rows) - 1):
+                        r1, r2 = rows[i], rows[i + 1] - 1
+                        for j in range(len(cols) - 1):
+                            c1, c2 = cols[j], cols[j + 1] - 1
+                            results.setdefault(sheet.name, []).append(
+                                PrintArea(r1=r1, c1=c1 - 1, r2=r2, c2=c2 - 1)
+                            )
+            except Exception as exc:
+                logger.warning(
+                    "Failed to extract auto page breaks via COM for sheet '%s'. (%r)",
+                    sheet.name,
+                    exc,
+                )
+                failed = True
+            finally:
+                if ws_api is not None and original_display is not None:
+                    try:
+                        ws_api.DisplayPageBreaks = original_display
+                    except Exception as exc:
+                        logger.debug(
+                            "Failed to restore DisplayPageBreaks for sheet '%s'. (%r)",
+                            sheet.name,
+                            exc,
+                        )
+            if failed:
+                continue
+        return results
+    def extract_merged_cells(self) -> MergedCellData:
+        """Extract merged cell ranges via COM (not implemented)."""
+        raise NotImplementedError("COM merged cell extraction is not implemented.")
+def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None:
+    """Parse an Excel range string into zero-based coordinates.
+    Args:
+        range_str: Excel range string.
+    Returns:
+        Zero-based (r1, c1, r2, c2) tuple or None on failure.
+    """
+    bounds = parse_range_zero_based(range_str)
+    if bounds is None:
+        return None
+    return (bounds.r1, bounds.c1, bounds.r2, bounds.c2)
+def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
+    """Strip sheet name from a range part when it matches the target sheet.
+    Args:
+        part: Raw range string part.
+        ws_name: Target worksheet name.
+    Returns:
+        Range without sheet prefix, or None if not matching.
+    """
+    s = part.strip()
+    if "!" not in s:
+        return s
+    sheet, rng = s.rsplit("!", 1)
+    sheet = sheet.strip()
+    if sheet.startswith("'") and sheet.endswith("'"):
+        sheet = sheet[1:-1].replace("''", "'")
+    return rng if sheet == ws_name else None
+def _split_csv_respecting_quotes(raw: str) -> list[str]:
+    """Split a CSV-like string while keeping commas inside single quotes intact.
+    Args:
+        raw: Raw CSV-like string.
+    Returns:
+        List of split parts.
+    """
+    parts: list[str] = []
+    buf: list[str] = []
+    in_quote = False
+    i = 0
+    while i < len(raw):
+        ch = raw[i]
+        if ch == "'":
+            if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
+                buf.append("''")
+                i += 2
+                continue
+            in_quote = not in_quote
+            buf.append(ch)
+            i += 1
+            continue
+        if ch == "," and not in_quote:
+            parts.append("".join(buf).strip())
+            buf = []
+            i += 1
+            continue
+        buf.append(ch)
+        i += 1
+    if buf:
+        parts.append("".join(buf).strip())
+    return [p for p in parts if p]

exstruct/core/backends/openpyxl_backend.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""Openpyxl backend for Excel workbook extraction."""
+from __future__ import annotations
+from dataclasses import dataclass
+import logging
+from pathlib import Path
+from ...models import PrintArea
+from ..cells import (
+    WorkbookColorsMap,
+    detect_tables_openpyxl,
+    extract_sheet_cells,
+    extract_sheet_cells_with_links,
+    extract_sheet_colors_map,
+    extract_sheet_merged_cells,
+)
+from ..ranges import parse_range_zero_based
+from ..workbook import openpyxl_workbook
+from .base import CellData, MergedCellData, PrintAreaData
+logger = logging.getLogger(__name__)
+@dataclass(frozen=True)
+class OpenpyxlBackend:
+    """Openpyxl-based backend for extraction tasks.
+    Attributes:
+        file_path: Path to the workbook file.
+    """
+    file_path: Path
+    def extract_cells(self, *, include_links: bool) -> CellData:
+        """Extract cell rows from the workbook.
+        Args:
+            include_links: Whether to include hyperlinks.
+        Returns:
+            Mapping of sheet name to cell rows.
+        """
+        return (
+            extract_sheet_cells_with_links(self.file_path)
+            if include_links
+            else extract_sheet_cells(self.file_path)
+        )
+    def extract_print_areas(self) -> PrintAreaData:
+        """Extract print areas per sheet using openpyxl defined names.
+        Returns:
+            Mapping of sheet name to print area list.
+        """
+        try:
+            with openpyxl_workbook(
+                self.file_path, data_only=True, read_only=False
+            ) as wb:
+                areas = _extract_print_areas_from_defined_names(wb)
+                if not areas:
+                    areas = _extract_print_areas_from_sheet_props(wb)
+                return areas
+        except Exception:
+            return {}
+    def extract_colors_map(
+        self, *, include_default_background: bool, ignore_colors: set[str] | None
+    ) -> WorkbookColorsMap | None:
+        """Extract colors_map using openpyxl.
+        Args:
+            include_default_background: Whether to include default background colors.
+            ignore_colors: Optional set of color keys to ignore.
+        Returns:
+            WorkbookColorsMap or None when extraction fails.
+        """
+        try:
+            return extract_sheet_colors_map(
+                self.file_path,
+                include_default_background=include_default_background,
+                ignore_colors=ignore_colors,
+            )
+        except Exception as exc:
+            logger.warning(
+                "Color map extraction failed; skipping colors_map. (%r)", exc
+            )
+            return None
+    def extract_merged_cells(self) -> MergedCellData:
+        """Extract merged cell ranges per sheet.
+        Returns:
+            Mapping of sheet name to merged cell ranges.
+        """
+        try:
+            return extract_sheet_merged_cells(self.file_path)
+        except Exception:
+            return {}
+    def detect_tables(self, sheet_name: str) -> list[str]:
+        """Detect table candidates for a single sheet.
+        Args:
+            sheet_name: Target worksheet name.
+        Returns:
+            List of table candidate ranges.
+        """
+        try:
+            return detect_tables_openpyxl(self.file_path, sheet_name)
+        except Exception:
+            return []
+def _extract_print_areas_from_defined_names(workbook: object) -> PrintAreaData:
+    """Extract print areas from defined names in an openpyxl workbook.
+    Args:
+        workbook: openpyxl workbook instance.
+    Returns:
+        Mapping of sheet name to print area list.
+    """
+    defined = getattr(workbook, "defined_names", None)
+    if defined is None:
+        return {}
+    defined_area = defined.get("_xlnm.Print_Area")
+    if not defined_area:
+        return {}
+    areas: PrintAreaData = {}
+    sheetnames = set(getattr(workbook, "sheetnames", []))
+    for sheet_name, range_str in defined_area.destinations:
+        if sheet_name not in sheetnames:
+            continue
+        _append_print_areas(areas, sheet_name, str(range_str))
+    return areas
+def _extract_print_areas_from_sheet_props(workbook: object) -> PrintAreaData:
+    """Extract print areas from sheet-level print area properties.
+    Args:
+        workbook: openpyxl workbook instance.
+    Returns:
+        Mapping of sheet name to print area list.
+    """
+    areas: PrintAreaData = {}
+    worksheets = getattr(workbook, "worksheets", [])
+    for ws in worksheets:
+        pa = getattr(ws, "_print_area", None)
+        if not pa:
+            continue
+        _append_print_areas(areas, str(getattr(ws, "title", "")), str(pa))
+    return areas
+def _append_print_areas(areas: PrintAreaData, sheet_name: str, range_str: str) -> None:
+    """Append parsed print areas to the mapping.
+    Args:
+        areas: Mapping to update.
+        sheet_name: Target sheet name.
+        range_str: Raw range string, possibly comma-separated.
+    """
+    for part in str(range_str).split(","):
+        parsed = _parse_print_area_range(part)
+        if not parsed:
+            continue
+        r1, c1, r2, c2 = parsed
+        areas.setdefault(sheet_name, []).append(
+            PrintArea(r1=r1 + 1, c1=c1, r2=r2 + 1, c2=c2)
+        )
+def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None:
+    """Parse an Excel range string into zero-based coordinates.
+    Args:
+        range_str: Excel range string.
+    Returns:
+        Zero-based (r1, c1, r2, c2) tuple or None on failure.
+    """
+    bounds = parse_range_zero_based(range_str)
+    if bounds is None:
+        return None
+    return (bounds.r1, bounds.c1, bounds.r2, bounds.c2)

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl