exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exstruct/__init__.py CHANGED
@@ -7,9 +7,11 @@ from typing import Literal, TextIO
7
7
  from .core.cells import set_table_detection_params
8
8
  from .core.integrate import extract_workbook
9
9
  from .engine import (
10
+ ColorsOptions,
10
11
  DestinationOptions,
11
12
  ExStructEngine,
12
13
  FilterOptions,
14
+ FormatOptions,
13
15
  OutputOptions,
14
16
  StructOptions,
15
17
  )
@@ -75,7 +77,9 @@ __all__ = [
75
77
  "StructOptions",
76
78
  "OutputOptions",
77
79
  "FilterOptions",
80
+ "FormatOptions",
78
81
  "DestinationOptions",
82
+ "ColorsOptions",
79
83
  "serialize_workbook",
80
84
  "export_auto_page_breaks",
81
85
  ]
@@ -93,7 +97,7 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> Workboo
93
97
  mode: "light" / "standard" / "verbose"
94
98
  - light: cells + table detection only (no COM, shapes/charts empty). Print areas via openpyxl.
95
99
  - standard: texted shapes + arrows + charts (COM if available), print areas included. Shape/chart size is kept but hidden by default in output.
96
- - verbose: all shapes (including textless) with size, charts with size.
100
+ - verbose: all shapes (including textless) with size, charts with size, and colors_map.
97
101
 
98
102
  Returns:
99
103
  WorkbookData containing sheets, rows, shapes, charts, and print areas.
@@ -110,8 +114,13 @@ def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> Workboo
110
114
  ['A1:B5']
111
115
  """
112
116
  include_links = True if mode == "verbose" else False
117
+ include_colors_map = True if mode == "verbose" else None
113
118
  engine = ExStructEngine(
114
- options=StructOptions(mode=mode, include_cell_links=include_links)
119
+ options=StructOptions(
120
+ mode=mode,
121
+ include_cell_links=include_links,
122
+ include_colors_map=include_colors_map,
123
+ )
115
124
  )
116
125
  return engine.extract(file_path, mode=mode)
117
126
 
@@ -358,16 +367,18 @@ def process_excel(
358
367
  engine = ExStructEngine(
359
368
  options=StructOptions(mode=mode),
360
369
  output=OutputOptions(
361
- fmt=out_fmt,
362
- pretty=pretty,
363
- indent=indent,
364
- sheets_dir=sheets_dir,
365
- print_areas_dir=print_areas_dir,
366
- auto_page_breaks_dir=auto_page_breaks_dir,
367
- include_print_areas=None if mode == "light" else True,
368
- include_shape_size=True if mode == "verbose" else False,
369
- include_chart_size=True if mode == "verbose" else False,
370
- stream=stream,
370
+ format=FormatOptions(fmt=out_fmt, pretty=pretty, indent=indent),
371
+ filters=FilterOptions(
372
+ include_print_areas=None if mode == "light" else True,
373
+ include_shape_size=True if mode == "verbose" else False,
374
+ include_chart_size=True if mode == "verbose" else False,
375
+ ),
376
+ destinations=DestinationOptions(
377
+ sheets_dir=sheets_dir,
378
+ print_areas_dir=print_areas_dir,
379
+ auto_page_breaks_dir=auto_page_breaks_dir,
380
+ stream=stream,
381
+ ),
371
382
  ),
372
383
  )
373
384
  engine.process(
exstruct/cli/main.py CHANGED
@@ -2,11 +2,30 @@ from __future__ import annotations
2
2
 
3
3
  import argparse
4
4
  from pathlib import Path
5
+ import sys
5
6
 
6
7
  from exstruct import process_excel
7
8
  from exstruct.cli.availability import ComAvailability, get_com_availability
8
9
 
9
10
 
11
+ def _ensure_utf8_stdout() -> None:
12
+ """Reconfigure stdout to UTF-8 when supported.
13
+
14
+ Windows consoles default to cp932 and can raise encoding errors when piping
15
+ non-ASCII characters. Reconfiguring prevents failures without affecting
16
+ environments that already default to UTF-8.
17
+ """
18
+
19
+ stdout = sys.stdout
20
+ if not hasattr(stdout, "reconfigure"):
21
+ return
22
+ reconfigure = stdout.reconfigure
23
+ try:
24
+ reconfigure(encoding="utf-8", errors="replace")
25
+ except (AttributeError, ValueError):
26
+ return
27
+
28
+
10
29
  def _add_auto_page_breaks_argument(
11
30
  parser: argparse.ArgumentParser, availability: ComAvailability
12
31
  ) -> None:
@@ -102,6 +121,7 @@ def main(argv: list[str] | None = None) -> int:
102
121
  Returns:
103
122
  Exit code (0 for success, 1 for failure).
104
123
  """
124
+ _ensure_utf8_stdout()
105
125
  parser = build_parser()
106
126
  args = parser.parse_args(argv)
107
127
 
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+
3
+ from .base import Backend
4
+ from .com_backend import ComBackend
5
+ from .openpyxl_backend import OpenpyxlBackend
6
+
7
+ __all__ = ["Backend", "ComBackend", "OpenpyxlBackend"]
@@ -0,0 +1,42 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Protocol
5
+
6
+ from ...models import CellRow, MergedCell, PrintArea
7
+ from ..cells import WorkbookColorsMap
8
+
9
+ CellData = dict[str, list[CellRow]]
10
+ PrintAreaData = dict[str, list[PrintArea]]
11
+ MergedCellData = dict[str, list[MergedCell]]
12
+
13
+
14
@dataclass(frozen=True)
class BackendConfig:
    """Immutable bundle of the options that all backends have in common.

    Attributes:
        include_default_background: Whether default background colors are
            included.
        ignore_colors: Set of color keys to ignore, or ``None`` when no
            colors are ignored.
    """

    # Whether default background colors are included.
    include_default_background: bool
    # Color keys to ignore; ``None`` means nothing is ignored.
    ignore_colors: set[str] | None
25
+
26
+
27
class Backend(Protocol):
    """Structural interface describing what an extraction backend offers."""

    def extract_cells(self, *, include_links: bool) -> CellData:
        """Return the workbook's cell rows, keyed by sheet name.

        Args:
            include_links: Keyword-only flag passed by the caller to request
                link information along with the cells.
        """
        ...

    def extract_print_areas(self) -> PrintAreaData:
        """Return the workbook's print areas, keyed by sheet name."""
        ...

    def extract_colors_map(
        self, *, include_default_background: bool, ignore_colors: set[str] | None
    ) -> WorkbookColorsMap | None:
        """Return the workbook's colors map, or ``None`` when there is none.

        Args:
            include_default_background: Whether default background colors are
                included.
            ignore_colors: Optional set of color keys to ignore.
        """
        ...

    def extract_merged_cells(self) -> MergedCellData:
        """Return the workbook's merged cell ranges, keyed by sheet name."""
        ...
@@ -0,0 +1,230 @@
1
+ """COM backend for Excel workbook extraction via xlwings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import logging
7
+ from typing import Any, cast
8
+
9
+ import xlwings as xw
10
+
11
+ from ...models import PrintArea
12
+ from ..cells import WorkbookColorsMap, extract_sheet_colors_map_com
13
+ from ..ranges import parse_range_zero_based
14
+ from .base import MergedCellData, PrintAreaData
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
@dataclass(frozen=True)
class ComBackend:
    """Extraction backend that drives Excel over COM through xlwings.

    Attributes:
        workbook: xlwings workbook instance.
    """

    workbook: xw.Book

    def extract_print_areas(self) -> PrintAreaData:
        """Collect the print areas configured on each sheet via xlwings/COM.

        A sheet whose print area cannot be read is logged and skipped. Rows
        in the parsed zero-based bounds are shifted by one before being
        stored; columns are stored as parsed.

        Returns:
            Mapping of sheet name to print area list.
        """
        collected: PrintAreaData = {}
        for sheet in self.workbook.sheets:
            try:
                raw_area = sheet.api.PageSetup.PrintArea or ""
            except Exception as exc:
                logger.warning(
                    "Failed to read print area via COM for sheet '%s'. (%r)",
                    sheet.name,
                    exc,
                )
                raw_area = ""
            if not raw_area:
                continue
            for chunk in str(raw_area).split(","):
                bounds = _parse_print_area_range(chunk)
                if bounds:
                    top, left, bottom, right = bounds
                    collected.setdefault(sheet.name, []).append(
                        PrintArea(r1=top + 1, c1=left, r2=bottom + 1, c2=right)
                    )
        return collected

    def extract_colors_map(
        self, *, include_default_background: bool, ignore_colors: set[str] | None
    ) -> WorkbookColorsMap | None:
        """Build the colors_map through COM, logging instead of raising.

        Args:
            include_default_background: Whether to include default backgrounds.
            ignore_colors: Optional set of color keys to ignore.

        Returns:
            WorkbookColorsMap or None when extraction fails.
        """
        try:
            colors_map = extract_sheet_colors_map_com(
                self.workbook,
                include_default_background=include_default_background,
                ignore_colors=ignore_colors,
            )
        except Exception as exc:
            logger.warning(
                "COM color map extraction failed; falling back to openpyxl. (%r)",
                exc,
            )
            return None
        return colors_map

    def extract_auto_page_breaks(self) -> PrintAreaData:
        """Derive the rectangles produced by Excel's automatic page breaks.

        For every sheet the print area (or the used range when no print area
        is set) is cut at the horizontal and vertical page breaks reported by
        COM, producing one rectangle per resulting page tile. A sheet that
        fails is logged and skipped; rectangles already gathered are kept.

        Returns:
            Mapping of sheet name to auto page-break areas.
        """
        page_areas: PrintAreaData = {}
        for sheet in self.workbook.sheets:
            api: Any | None = None
            saved_display: bool | None = None
            try:
                api = cast(Any, sheet.api)
                # Remember the current setting so it can be restored below.
                saved_display = api.DisplayPageBreaks
                api.DisplayPageBreaks = True
                area_source = api.PageSetup.PrintArea or api.UsedRange.Address
                # Keep only the parts that belong to this sheet, without the
                # sheet prefix.
                addresses: list[str] = [
                    local
                    for piece in _split_csv_respecting_quotes(str(area_source))
                    if (local := _normalize_area_for_sheet(piece, sheet.name))
                ]
                h_breaks = cast(Any, api.HPageBreaks)
                v_breaks = cast(Any, api.VPageBreaks)
                # COM collections are indexed from 1.
                break_rows = [
                    h_breaks.Item(k).Location.Row
                    for k in range(1, int(h_breaks.Count) + 1)
                ]
                break_cols = [
                    v_breaks.Item(k).Location.Column
                    for k in range(1, int(v_breaks.Count) + 1)
                ]
                for address in addresses:
                    rng = cast(Any, api.Range(address))
                    first_row = int(rng.Row)
                    last_row = first_row + int(rng.Rows.Count) - 1
                    first_col = int(rng.Column)
                    last_col = first_col + int(rng.Columns.Count) - 1
                    # Edges mark where each tile starts; the final entry is
                    # one past the end of the range.
                    row_edges = [
                        first_row,
                        *(r for r in break_rows if first_row < r <= last_row),
                        last_row + 1,
                    ]
                    col_edges = [
                        first_col,
                        *(c for c in break_cols if first_col < c <= last_col),
                        last_col + 1,
                    ]
                    for top, next_top in zip(row_edges, row_edges[1:]):
                        for left, next_left in zip(col_edges, col_edges[1:]):
                            # Rows are stored as reported by COM; columns are
                            # shifted down by one.
                            page_areas.setdefault(sheet.name, []).append(
                                PrintArea(
                                    r1=top,
                                    c1=left - 1,
                                    r2=next_top - 1,
                                    c2=next_left - 2,
                                )
                            )
            except Exception as exc:
                logger.warning(
                    "Failed to extract auto page breaks via COM for sheet '%s'. (%r)",
                    sheet.name,
                    exc,
                )
            finally:
                if api is not None and saved_display is not None:
                    try:
                        api.DisplayPageBreaks = saved_display
                    except Exception as exc:
                        logger.debug(
                            "Failed to restore DisplayPageBreaks for sheet '%s'. (%r)",
                            sheet.name,
                            exc,
                        )
        return page_areas

    def extract_merged_cells(self) -> MergedCellData:
        """Extract merged cell ranges via COM (not implemented).

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("COM merged cell extraction is not implemented.")
160
+
161
+
162
def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None:
    """Convert an Excel range string into a plain tuple of zero-based bounds.

    Args:
        range_str: Excel range string.

    Returns:
        ``(r1, c1, r2, c2)`` taken from the parsed bounds, or ``None`` when
        ``parse_range_zero_based`` returns ``None``.
    """
    parsed = parse_range_zero_based(range_str)
    return None if parsed is None else (parsed.r1, parsed.c1, parsed.r2, parsed.c2)
175
+
176
+
177
+ def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
178
+ """Strip sheet name from a range part when it matches the target sheet.
179
+
180
+ Args:
181
+ part: Raw range string part.
182
+ ws_name: Target worksheet name.
183
+
184
+ Returns:
185
+ Range without sheet prefix, or None if not matching.
186
+ """
187
+ s = part.strip()
188
+ if "!" not in s:
189
+ return s
190
+ sheet, rng = s.rsplit("!", 1)
191
+ sheet = sheet.strip()
192
+ if sheet.startswith("'") and sheet.endswith("'"):
193
+ sheet = sheet[1:-1].replace("''", "'")
194
+ return rng if sheet == ws_name else None
195
+
196
+
197
+ def _split_csv_respecting_quotes(raw: str) -> list[str]:
198
+ """Split a CSV-like string while keeping commas inside single quotes intact.
199
+
200
+ Args:
201
+ raw: Raw CSV-like string.
202
+
203
+ Returns:
204
+ List of split parts.
205
+ """
206
+ parts: list[str] = []
207
+ buf: list[str] = []
208
+ in_quote = False
209
+ i = 0
210
+ while i < len(raw):
211
+ ch = raw[i]
212
+ if ch == "'":
213
+ if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
214
+ buf.append("''")
215
+ i += 2
216
+ continue
217
+ in_quote = not in_quote
218
+ buf.append(ch)
219
+ i += 1
220
+ continue
221
+ if ch == "," and not in_quote:
222
+ parts.append("".join(buf).strip())
223
+ buf = []
224
+ i += 1
225
+ continue
226
+ buf.append(ch)
227
+ i += 1
228
+ if buf:
229
+ parts.append("".join(buf).strip())
230
+ return [p for p in parts if p]
@@ -0,0 +1,191 @@
1
+ """Openpyxl backend for Excel workbook extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from ...models import PrintArea
10
+ from ..cells import (
11
+ WorkbookColorsMap,
12
+ detect_tables_openpyxl,
13
+ extract_sheet_cells,
14
+ extract_sheet_cells_with_links,
15
+ extract_sheet_colors_map,
16
+ extract_sheet_merged_cells,
17
+ )
18
+ from ..ranges import parse_range_zero_based
19
+ from ..workbook import openpyxl_workbook
20
+ from .base import CellData, MergedCellData, PrintAreaData
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
@dataclass(frozen=True)
class OpenpyxlBackend:
    """Openpyxl-based backend for extraction tasks.

    Every method except ``extract_cells`` is best effort: a failure is
    logged and an empty result (or ``None`` for the colors map) is returned
    instead of raising.

    Attributes:
        file_path: Path to the workbook file.
    """

    file_path: Path

    def extract_cells(self, *, include_links: bool) -> CellData:
        """Extract cell rows from the workbook.

        Args:
            include_links: Whether to include hyperlinks.

        Returns:
            Mapping of sheet name to cell rows.
        """
        return (
            extract_sheet_cells_with_links(self.file_path)
            if include_links
            else extract_sheet_cells(self.file_path)
        )

    def extract_print_areas(self) -> PrintAreaData:
        """Extract print areas per sheet using openpyxl.

        Defined names are consulted first; sheet-level print area properties
        are used only when the defined names yield nothing.

        Returns:
            Mapping of sheet name to print area list; empty when extraction
            fails.
        """
        try:
            with openpyxl_workbook(
                self.file_path, data_only=True, read_only=False
            ) as wb:
                areas = _extract_print_areas_from_defined_names(wb)
                if not areas:
                    areas = _extract_print_areas_from_sheet_props(wb)
                return areas
        except Exception as exc:
            # Stay best effort, but leave a trace so missing print areas can
            # be diagnosed.
            logger.warning(
                "Print area extraction via openpyxl failed; skipping print areas. (%r)",
                exc,
            )
            return {}

    def extract_colors_map(
        self, *, include_default_background: bool, ignore_colors: set[str] | None
    ) -> WorkbookColorsMap | None:
        """Extract colors_map using openpyxl.

        Args:
            include_default_background: Whether to include default background colors.
            ignore_colors: Optional set of color keys to ignore.

        Returns:
            WorkbookColorsMap or None when extraction fails.
        """
        try:
            return extract_sheet_colors_map(
                self.file_path,
                include_default_background=include_default_background,
                ignore_colors=ignore_colors,
            )
        except Exception as exc:
            logger.warning(
                "Color map extraction failed; skipping colors_map. (%r)", exc
            )
            return None

    def extract_merged_cells(self) -> MergedCellData:
        """Extract merged cell ranges per sheet.

        Returns:
            Mapping of sheet name to merged cell ranges; empty when
            extraction fails.
        """
        try:
            return extract_sheet_merged_cells(self.file_path)
        except Exception as exc:
            logger.warning(
                "Merged cell extraction failed; skipping merged cells. (%r)", exc
            )
            return {}

    def detect_tables(self, sheet_name: str) -> list[str]:
        """Detect table candidates for a single sheet.

        Args:
            sheet_name: Target worksheet name.

        Returns:
            List of table candidate ranges; empty when detection fails.
        """
        try:
            return detect_tables_openpyxl(self.file_path, sheet_name)
        except Exception as exc:
            logger.warning(
                "Table detection failed for sheet '%s'; skipping tables. (%r)",
                sheet_name,
                exc,
            )
            return []
115
+
116
+
117
+ def _extract_print_areas_from_defined_names(workbook: object) -> PrintAreaData:
118
+ """Extract print areas from defined names in an openpyxl workbook.
119
+
120
+ Args:
121
+ workbook: openpyxl workbook instance.
122
+
123
+ Returns:
124
+ Mapping of sheet name to print area list.
125
+ """
126
+ defined = getattr(workbook, "defined_names", None)
127
+ if defined is None:
128
+ return {}
129
+ defined_area = defined.get("_xlnm.Print_Area")
130
+ if not defined_area:
131
+ return {}
132
+
133
+ areas: PrintAreaData = {}
134
+ sheetnames = set(getattr(workbook, "sheetnames", []))
135
+ for sheet_name, range_str in defined_area.destinations:
136
+ if sheet_name not in sheetnames:
137
+ continue
138
+ _append_print_areas(areas, sheet_name, str(range_str))
139
+ return areas
140
+
141
+
142
+ def _extract_print_areas_from_sheet_props(workbook: object) -> PrintAreaData:
143
+ """Extract print areas from sheet-level print area properties.
144
+
145
+ Args:
146
+ workbook: openpyxl workbook instance.
147
+
148
+ Returns:
149
+ Mapping of sheet name to print area list.
150
+ """
151
+ areas: PrintAreaData = {}
152
+ worksheets = getattr(workbook, "worksheets", [])
153
+ for ws in worksheets:
154
+ pa = getattr(ws, "_print_area", None)
155
+ if not pa:
156
+ continue
157
+ _append_print_areas(areas, str(getattr(ws, "title", "")), str(pa))
158
+ return areas
159
+
160
+
161
def _append_print_areas(areas: PrintAreaData, sheet_name: str, range_str: str) -> None:
    """Parse ``range_str`` and add the resulting print areas to ``areas``.

    The string is split on commas and every part that parses is appended to
    the list stored under ``sheet_name``. Parts that fail to parse are
    skipped. Rows in the parsed zero-based bounds are shifted by one before
    being stored; columns are stored as parsed.

    Args:
        areas: Mapping to update in place.
        sheet_name: Target sheet name.
        range_str: Raw range string, possibly comma-separated.
    """
    for piece in str(range_str).split(","):
        bounds = _parse_print_area_range(piece)
        if bounds:
            top, left, bottom, right = bounds
            areas.setdefault(sheet_name, []).append(
                PrintArea(r1=top + 1, c1=left, r2=bottom + 1, c2=right)
            )
177
+
178
+
179
def _parse_print_area_range(range_str: str) -> tuple[int, int, int, int] | None:
    """Turn an Excel range string into a zero-based coordinate tuple.

    Args:
        range_str: Excel range string.

    Returns:
        ``(r1, c1, r2, c2)`` read from the parsed bounds, or ``None`` when
        ``parse_range_zero_based`` returns ``None``.
    """
    parsed = parse_range_zero_based(range_str)
    if parsed is not None:
        return (parsed.r1, parsed.c1, parsed.r2, parsed.c2)
    return None