PyPI - exstruct - Versions diffs - 0.2.80__py3-none-any.whl - Mend

exstruct 0.2.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

exstruct/__init__.py +387 -0
exstruct/cli/availability.py +49 -0
exstruct/cli/main.py +134 -0
exstruct/core/__init__.py +0 -0
exstruct/core/cells.py +1039 -0
exstruct/core/charts.py +241 -0
exstruct/core/integrate.py +388 -0
exstruct/core/shapes.py +275 -0
exstruct/engine.py +643 -0
exstruct/errors.py +35 -0
exstruct/io/__init__.py +555 -0
exstruct/models/__init__.py +335 -0
exstruct/models/maps.py +335 -0
exstruct/models/types.py +8 -0
exstruct/py.typed +0 -0
exstruct/render/__init__.py +118 -0
exstruct-0.2.80.dist-info/METADATA +435 -0
exstruct-0.2.80.dist-info/RECORD +20 -0
exstruct-0.2.80.dist-info/WHEEL +4 -0
exstruct-0.2.80.dist-info/entry_points.txt +3 -0

exstruct/core/charts.py ADDED Viewed

@@ -0,0 +1,241 @@
+from __future__ import annotations
+import logging
+from typing import Literal
+import xlwings as xw
+from ..models import Chart, ChartSeries
+from ..models.maps import XL_CHART_TYPE_MAP
+logger = logging.getLogger(__name__)
+def _extract_series_args_text(formula: str) -> str | None:  # noqa: C901
+    """Extract the outer argument text from '=SERIES(...)'; return None if unmatched."""
+    if not formula:
+        return None
+    s = formula.strip()
+    if not s.upper().startswith("=SERIES"):
+        return None
+    try:
+        open_idx = s.index("(", s.upper().index("=SERIES"))
+    except ValueError:
+        return None
+    depth_paren = 0
+    depth_brace = 0
+    in_str = False
+    i = open_idx + 1
+    start = i
+    while i < len(s):
+        ch = s[i]
+        if in_str:
+            if ch == '"':
+                if i + 1 < len(s) and s[i + 1] == '"':
+                    i += 2
+                    continue
+                else:
+                    in_str = False
+                    i += 1
+                    continue
+            else:
+                i += 1
+                continue
+        else:
+            if ch == '"':
+                in_str = True
+                i += 1
+                continue
+            elif ch == "(":
+                depth_paren += 1
+            elif ch == ")":
+                if depth_paren == 0:
+                    return s[start:i].strip()
+                depth_paren -= 1
+            elif ch == "{":
+                depth_brace += 1
+            elif ch == "}":
+                if depth_brace > 0:
+                    depth_brace -= 1
+            i += 1
+    return None
+def _split_top_level_args(args_text: str) -> list[str]:  # noqa: C901
+    """Split SERIES arguments at top-level separators (',' or ';')."""
+    if args_text is None:
+        return []
+    use_semicolon = (";" in args_text) and ("," not in args_text.split('"')[0])
+    sep_chars = (";",) if use_semicolon else (",",)
+    args: list[str] = []
+    buf: list[str] = []
+    depth_paren = 0
+    depth_brace = 0
+    in_str = False
+    i = 0
+    while i < len(args_text):
+        ch = args_text[i]
+        if in_str:
+            if ch == '"':
+                if i + 1 < len(args_text) and args_text[i + 1] == '"':
+                    buf.append('"')
+                    i += 2
+                    continue
+                else:
+                    in_str = False
+                    i += 1
+                    continue
+            else:
+                buf.append(ch)
+                i += 1
+                continue
+        else:
+            if ch == '"':
+                in_str = True
+                i += 1
+                continue
+            elif ch == "(":
+                depth_paren += 1
+                buf.append(ch)
+                i += 1
+                continue
+            elif ch == ")":
+                depth_paren = max(0, depth_paren - 1)
+                buf.append(ch)
+                i += 1
+                continue
+            elif ch == "{":
+                depth_brace += 1
+                buf.append(ch)
+                i += 1
+                continue
+            elif ch == "}":
+                depth_brace = max(0, depth_brace - 1)
+                buf.append(ch)
+                i += 1
+                continue
+            elif (ch in sep_chars) and depth_paren == 0 and depth_brace == 0:
+                args.append("".join(buf).strip())
+                buf = []
+                i += 1
+                continue
+            else:
+                buf.append(ch)
+                i += 1
+                continue
+    if buf or (args and args_text.endswith(sep_chars)):
+        args.append("".join(buf).strip())
+    return args
+def _unquote_excel_string(s: str | None) -> str | None:
+    """Decode Excel-style quoted string; return None if not quoted."""
+    if s is None:
+        return None
+    st = s.strip()
+    if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
+        inner = st[1:-1]
+        return inner.replace('""', '"')
+    return None
+def parse_series_formula(formula: str) -> dict[str, str | None] | None:
+    """Parse =SERIES into a dict of references; return None on failure."""
+    args_text = _extract_series_args_text(formula)
+    if args_text is None:
+        return None
+    parts = _split_top_level_args(args_text)
+    name_part = parts[0].strip() if len(parts) >= 1 and parts[0].strip() != "" else None
+    x_part = parts[1].strip() if len(parts) >= 2 and parts[1].strip() != "" else None
+    y_part = parts[2].strip() if len(parts) >= 3 and parts[2].strip() != "" else None
+    plot_order_part = (
+        parts[3].strip() if len(parts) >= 4 and parts[3].strip() != "" else None
+    )
+    bubble_part = (
+        parts[4].strip() if len(parts) >= 5 and parts[4].strip() != "" else None
+    )
+    name_literal = _unquote_excel_string(name_part)
+    name_range = None if name_literal is not None else name_part
+    return {
+        "name_range": name_range,
+        "x_range": x_part,
+        "y_range": y_part,
+        "plot_order": plot_order_part,
+        "bubble_size_range": bubble_part,
+        "name_literal": name_literal,
+    }
+def get_charts(
+    sheet: xw.Sheet, mode: Literal["light", "standard", "verbose"] = "standard"
+) -> list[Chart]:
+    """Parse charts in a sheet into Chart models; failed charts carry an error field."""
+    charts: list[Chart] = []
+    for ch in sheet.charts:
+        series_list: list[ChartSeries] = []
+        y_axis_title: str = ""
+        y_axis_range: list[int] = []
+        chart_type_label: str = "unknown"
+        error: str | None = None
+        try:
+            chart_com = sheet.api.ChartObjects(ch.name).Chart
+            chart_type_num = chart_com.ChartType
+            chart_type_label = XL_CHART_TYPE_MAP.get(
+                chart_type_num, f"unknown_{chart_type_num}"
+            )
+            chart_width: int | None = None
+            chart_height: int | None = None
+            try:
+                chart_width = int(ch.width)
+                chart_height = int(ch.height)
+            except Exception:
+                chart_width = None
+                chart_height = None
+            for s in chart_com.SeriesCollection():
+                parsed = parse_series_formula(getattr(s, "Formula", ""))
+                name_range = parsed["name_range"] if parsed else None
+                x_range = parsed["x_range"] if parsed else None
+                y_range = parsed["y_range"] if parsed else None
+                series_list.append(
+                    ChartSeries(
+                        name=s.Name,
+                        name_range=name_range,
+                        x_range=x_range,
+                        y_range=y_range,
+                    )
+                )
+            try:
+                y_axis = chart_com.Axes(2, 1)
+                if y_axis.HasTitle:
+                    y_axis_title = y_axis.AxisTitle.Text
+                y_axis_range = [y_axis.MinimumScale, y_axis.MaximumScale]
+            except Exception:
+                y_axis_title = ""
+                y_axis_range = []
+            title = chart_com.ChartTitle.Text if chart_com.HasTitle else None
+        except Exception:
+            logger.warning("Failed to parse chart; returning with error string.")
+            title = None
+            error = "Failed to build chart JSON structure"
+        charts.append(
+            Chart(
+                name=ch.name,
+                chart_type=chart_type_label,
+                title=title,
+                y_axis_title=y_axis_title,
+                y_axis_range=[float(v) for v in y_axis_range],
+                w=chart_width,
+                h=chart_height,
+                series=series_list,
+                l=int(ch.left),
+                t=int(ch.top),
+                error=error,
+            )
+        )
+    return charts

exstruct/core/integrate.py ADDED Viewed

@@ -0,0 +1,388 @@
+from __future__ import annotations
+import logging
+import os
+from pathlib import Path
+from typing import Any, Literal, cast
+from openpyxl import load_workbook
+from openpyxl.utils import range_boundaries
+import xlwings as xw
+from ..models import CellRow, PrintArea, Shape, SheetData, WorkbookData
+from .cells import (
+    detect_tables,
+    detect_tables_openpyxl,
+    extract_sheet_cells,
+    extract_sheet_cells_with_links,
+)
+from .charts import get_charts
+from .shapes import get_shapes_with_position
+logger = logging.getLogger(__name__)
+_ALLOWED_MODES: set[str] = {"light", "standard", "verbose"}
+def _find_open_workbook(file_path: Path) -> xw.Book | None:
+    """Return an existing workbook if already open in Excel; otherwise None."""
+    try:
+        for app in xw.apps:
+            for wb in app.books:
+                try:
+                    if Path(wb.fullname).resolve() == file_path.resolve():
+                        return wb
+                except Exception:
+                    continue
+    except Exception:
+        return None
+    return None
+def _open_workbook(file_path: Path) -> tuple[xw.Book, bool]:
+    """
+    Open workbook:
+    - If already open, reuse and do not close Excel on exit.
+    - Otherwise create invisible Excel (visible=False) and close when done.
+    Returns (workbook, should_close_app).
+    """
+    existing = _find_open_workbook(file_path)
+    if existing:
+        return existing, False
+    app = xw.App(add_book=False, visible=False)
+    wb = app.books.open(str(file_path))
+    return wb, True
+def _parse_print_area_range(
+    range_str: str, *, zero_based: bool = True
+) -> tuple[int, int, int, int] | None:
+    """
+    Parse an Excel range string into (r1, c1, r2, c2). Returns None on failure.
+    """
+    cleaned = range_str.strip()
+    if not cleaned:
+        return None
+    if "!" in cleaned:
+        cleaned = cleaned.split("!", 1)[1]
+    try:
+        min_col, min_row, max_col, max_row = range_boundaries(cleaned)
+    except Exception:
+        return None
+    if zero_based:
+        return (min_row - 1, min_col - 1, max_row - 1, max_col - 1)
+    return (min_row, min_col, max_row, max_col)
+def _extract_print_areas_openpyxl(  # noqa: C901
+    file_path: Path,
+) -> dict[str, list[PrintArea]]:
+    """
+    Extract print areas per sheet using openpyxl defined names.
+    Returns {sheet_name: [PrintArea, ...]}.
+    """
+    try:
+        wb = load_workbook(file_path, data_only=True, read_only=True)
+    except Exception:
+        return {}
+    try:
+        defined = wb.defined_names.get("_xlnm.Print_Area")
+        areas: dict[str, list[PrintArea]] = {}
+        if defined:
+            for sheet_name, range_str in defined.destinations:
+                if sheet_name not in wb.sheetnames:
+                    continue
+                # A single destination can contain multiple comma-separated ranges.
+                for part in str(range_str).split(","):
+                    parsed = _parse_print_area_range(part)
+                    if not parsed:
+                        continue
+                    r1, c1, r2, c2 = parsed
+                    areas.setdefault(sheet_name, []).append(
+                        PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
+                    )
+        # Fallback: some files carry sheet-level print_area without defined name.
+        if not areas:
+            for ws in wb.worksheets:
+                pa = getattr(ws, "_print_area", None)
+                if not pa:
+                    continue
+                for part in str(pa).split(","):
+                    parsed = _parse_print_area_range(part)
+                    if not parsed:
+                        continue
+                    r1, c1, r2, c2 = parsed
+                    areas.setdefault(ws.title, []).append(
+                        PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
+                    )
+        return areas
+    finally:
+        try:
+            wb.close()
+        except Exception:
+            pass
+def _extract_print_areas_com(workbook: xw.Book) -> dict[str, list[PrintArea]]:
+    """
+    Extract print areas per sheet via xlwings/COM.
+    Uses Sheet.PageSetup.PrintArea which may contain comma-separated ranges.
+    """
+    areas: dict[str, list[PrintArea]] = {}
+    for sheet in workbook.sheets:
+        try:
+            raw = sheet.api.PageSetup.PrintArea or ""
+        except Exception:
+            continue
+        if not raw:
+            continue
+        parts = str(raw).split(",")
+        for part in parts:
+            parsed = _parse_print_area_range(part, zero_based=True)
+            if not parsed:
+                continue
+            r1, c1, r2, c2 = parsed
+            areas.setdefault(sheet.name, []).append(
+                PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
+            )
+    return areas
+def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
+    """
+    Strip sheet name from a range part when it matches the target sheet; otherwise None.
+    """
+    s = part.strip()
+    if "!" not in s:
+        return s
+    sheet, rng = s.rsplit("!", 1)
+    sheet = sheet.strip()
+    if sheet.startswith("'") and sheet.endswith("'"):
+        sheet = sheet[1:-1].replace("''", "'")
+    return rng if sheet == ws_name else None
+def _split_csv_respecting_quotes(raw: str) -> list[str]:
+    """
+    Split a CSV-like string while keeping commas inside single quotes intact.
+    """
+    parts: list[str] = []
+    buf: list[str] = []
+    in_quote = False
+    i = 0
+    while i < len(raw):
+        ch = raw[i]
+        if ch == "'":
+            if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
+                buf.append("''")
+                i += 2
+                continue
+            in_quote = not in_quote
+            buf.append(ch)
+            i += 1
+            continue
+        if ch == "," and not in_quote:
+            parts.append("".join(buf).strip())
+            buf = []
+            i += 1
+            continue
+        buf.append(ch)
+        i += 1
+    if buf:
+        parts.append("".join(buf).strip())
+    return [p for p in parts if p]
+def _compute_auto_page_break_areas(workbook: xw.Book) -> dict[str, list[PrintArea]]:
+    """
+    Compute auto page-break rectangles per sheet using Excel COM.
+    Falls back to empty dict on failure.
+    """
+    results: dict[str, list[PrintArea]] = {}
+    for sheet in workbook.sheets:
+        try:
+            ws_api = cast(Any, sheet.api)  # xlwings COM API; treated as Any
+            original_display: bool | None = ws_api.DisplayPageBreaks
+            ws_api.DisplayPageBreaks = True
+            print_area = ws_api.PageSetup.PrintArea or ws_api.UsedRange.Address
+            parts_raw = _split_csv_respecting_quotes(str(print_area))
+            area_parts: list[str] = []
+            for part in parts_raw:
+                rng = _normalize_area_for_sheet(part, sheet.name)
+                if rng:
+                    area_parts.append(rng)
+            hpb = cast(Any, ws_api.HPageBreaks)
+            vpb = cast(Any, ws_api.VPageBreaks)
+            h_break_rows = [
+                hpb.Item(i).Location.Row for i in range(1, int(hpb.Count) + 1)
+            ]
+            v_break_cols = [
+                vpb.Item(i).Location.Column for i in range(1, int(vpb.Count) + 1)
+            ]
+            for addr in area_parts:
+                range_obj = cast(Any, ws_api.Range(addr))
+                min_row = int(range_obj.Row)
+                max_row = min_row + int(range_obj.Rows.Count) - 1
+                min_col = int(range_obj.Column)
+                max_col = min_col + int(range_obj.Columns.Count) - 1
+                rows = (
+                    [min_row]
+                    + [r for r in h_break_rows if min_row < r <= max_row]
+                    + [max_row + 1]
+                )
+                cols = (
+                    [min_col]
+                    + [c for c in v_break_cols if min_col < c <= max_col]
+                    + [max_col + 1]
+                )
+                for i in range(len(rows) - 1):
+                    r1, r2 = rows[i], rows[i + 1] - 1
+                    for j in range(len(cols) - 1):
+                        c1, c2 = cols[j], cols[j + 1] - 1
+                        c1_0 = c1 - 1
+                        c2_0 = c2 - 1
+                        results.setdefault(sheet.name, []).append(
+                            PrintArea(r1=r1, c1=c1_0, r2=r2, c2=c2_0)
+                        )
+            if original_display is not None:
+                ws_api.DisplayPageBreaks = original_display
+        except Exception:
+            try:
+                if original_display is not None:
+                    ws_api.DisplayPageBreaks = original_display
+            except Exception:
+                pass
+            continue
+    return results
+def integrate_sheet_content(
+    cell_data: dict[str, list[CellRow]],
+    shape_data: dict[str, list[Shape]],
+    workbook: xw.Book,
+    mode: Literal["light", "standard", "verbose"] = "standard",
+    print_area_data: dict[str, list[PrintArea]] | None = None,
+    auto_page_break_data: dict[str, list[PrintArea]] | None = None,
+) -> dict[str, SheetData]:
+    """Integrate cells, shapes, charts, and tables into SheetData per sheet."""
+    result: dict[str, SheetData] = {}
+    for sheet_name, rows in cell_data.items():
+        sheet_shapes = shape_data.get(sheet_name, [])
+        sheet = workbook.sheets[sheet_name]
+        sheet_model = SheetData(
+            rows=rows,
+            shapes=sheet_shapes,
+            charts=[] if mode == "light" else get_charts(sheet, mode=mode),
+            table_candidates=detect_tables(sheet),
+            print_areas=print_area_data.get(sheet_name, []) if print_area_data else [],
+            auto_print_areas=auto_page_break_data.get(sheet_name, [])
+            if auto_page_break_data
+            else [],
+        )
+        result[sheet_name] = sheet_model
+    return result
+def extract_workbook(  # noqa: C901
+    file_path: str | Path,
+    mode: Literal["light", "standard", "verbose"] = "standard",
+    *,
+    include_cell_links: bool = False,
+    include_print_areas: bool = True,
+    include_auto_page_breaks: bool = False,
+) -> WorkbookData:
+    """Extract workbook and return WorkbookData; fallback to cells+tables if Excel COM is unavailable."""
+    if mode not in _ALLOWED_MODES:
+        raise ValueError(f"Unsupported mode: {mode}")
+    normalized_file_path = file_path if isinstance(file_path, Path) else Path(file_path)
+    cell_data = (
+        extract_sheet_cells_with_links(normalized_file_path)
+        if include_cell_links
+        else extract_sheet_cells(normalized_file_path)
+    )
+    print_area_data: dict[str, list[PrintArea]] = {}
+    if include_print_areas:
+        print_area_data = _extract_print_areas_openpyxl(normalized_file_path)
+    auto_page_break_data: dict[str, list[PrintArea]] = {}
+    def _cells_and_tables_only(reason: str) -> WorkbookData:
+        sheets: dict[str, SheetData] = {}
+        for sheet_name, rows in cell_data.items():
+            try:
+                tables = detect_tables_openpyxl(normalized_file_path, sheet_name)
+            except Exception:
+                tables = []
+            sheets[sheet_name] = SheetData(
+                rows=rows,
+                shapes=[],
+                charts=[],
+                table_candidates=tables,
+                print_areas=print_area_data.get(sheet_name, [])
+                if include_print_areas
+                else [],
+                auto_print_areas=[],
+            )
+        logger.warning(
+            "%s Falling back to cells+tables only; shapes and charts will be empty.",
+            reason,
+        )
+        return WorkbookData(book_name=normalized_file_path.name, sheets=sheets)
+    if mode == "light":
+        return _cells_and_tables_only("Light mode selected.")
+    if os.getenv("SKIP_COM_TESTS"):
+        return _cells_and_tables_only(
+            "SKIP_COM_TESTS is set; skipping COM/xlwings access."
+        )
+    try:
+        wb, close_app = _open_workbook(normalized_file_path)
+    except Exception as e:
+        return _cells_and_tables_only(f"xlwings/Excel COM is unavailable. ({e!r})")
+    try:
+        try:
+            shape_data = get_shapes_with_position(wb, mode=mode)
+            if include_print_areas and not print_area_data:
+                # openpyxl couldn't read (e.g., .xls). Try COM as a fallback.
+                try:
+                    print_area_data = _extract_print_areas_com(wb)
+                except Exception:
+                    print_area_data = {}
+            if include_auto_page_breaks:
+                try:
+                    auto_page_break_data = _compute_auto_page_break_areas(wb)
+                except Exception:
+                    auto_page_break_data = {}
+            merged = integrate_sheet_content(
+                cell_data,
+                shape_data,
+                wb,
+                mode=mode,
+                print_area_data=print_area_data if include_print_areas else None,
+                auto_page_break_data=auto_page_break_data
+                if include_auto_page_breaks
+                else None,
+            )
+            return WorkbookData(book_name=normalized_file_path.name, sheets=merged)
+        except Exception as e:
+            logger.warning(
+                "Shape extraction failed; falling back to cells+tables. (%r)", e
+            )
+            return _cells_and_tables_only(f"Shape extraction failed ({e!r}).")
+    finally:
+        # Close only if we created the app to avoid shutting user sessions.
+        try:
+            if close_app:
+                app = wb.app
+                wb.close()
+                app.quit()
+        except Exception:
+            pass