exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,388 +1,55 @@
1
1
  from __future__ import annotations
2
2
 
3
- import logging
4
- import os
5
3
  from pathlib import Path
6
- from typing import Any, Literal, cast
4
+ from typing import Literal
7
5
 
8
- from openpyxl import load_workbook
9
- from openpyxl.utils import range_boundaries
10
- import xlwings as xw
11
-
12
- from ..models import CellRow, PrintArea, Shape, SheetData, WorkbookData
13
- from .cells import (
14
- detect_tables,
15
- detect_tables_openpyxl,
16
- extract_sheet_cells,
17
- extract_sheet_cells_with_links,
18
- )
19
- from .charts import get_charts
20
- from .shapes import get_shapes_with_position
21
-
22
- logger = logging.getLogger(__name__)
23
- _ALLOWED_MODES: set[str] = {"light", "standard", "verbose"}
24
-
25
-
26
- def _find_open_workbook(file_path: Path) -> xw.Book | None:
27
- """Return an existing workbook if already open in Excel; otherwise None."""
28
- try:
29
- for app in xw.apps:
30
- for wb in app.books:
31
- try:
32
- if Path(wb.fullname).resolve() == file_path.resolve():
33
- return wb
34
- except Exception:
35
- continue
36
- except Exception:
37
- return None
38
- return None
39
-
40
-
41
- def _open_workbook(file_path: Path) -> tuple[xw.Book, bool]:
42
- """
43
- Open workbook:
44
- - If already open, reuse and do not close Excel on exit.
45
- - Otherwise create invisible Excel (visible=False) and close when done.
46
- Returns (workbook, should_close_app).
47
- """
48
- existing = _find_open_workbook(file_path)
49
- if existing:
50
- return existing, False
51
- app = xw.App(add_book=False, visible=False)
52
- wb = app.books.open(str(file_path))
53
- return wb, True
54
-
55
-
56
- def _parse_print_area_range(
57
- range_str: str, *, zero_based: bool = True
58
- ) -> tuple[int, int, int, int] | None:
59
- """
60
- Parse an Excel range string into (r1, c1, r2, c2). Returns None on failure.
61
- """
62
- cleaned = range_str.strip()
63
- if not cleaned:
64
- return None
65
- if "!" in cleaned:
66
- cleaned = cleaned.split("!", 1)[1]
67
- try:
68
- min_col, min_row, max_col, max_row = range_boundaries(cleaned)
69
- except Exception:
70
- return None
71
- if zero_based:
72
- return (min_row - 1, min_col - 1, max_row - 1, max_col - 1)
73
- return (min_row, min_col, max_row, max_col)
74
-
75
-
76
- def _extract_print_areas_openpyxl( # noqa: C901
77
- file_path: Path,
78
- ) -> dict[str, list[PrintArea]]:
79
- """
80
- Extract print areas per sheet using openpyxl defined names.
81
-
82
- Returns {sheet_name: [PrintArea, ...]}.
83
- """
84
- try:
85
- wb = load_workbook(file_path, data_only=True, read_only=True)
86
- except Exception:
87
- return {}
88
-
89
- try:
90
- defined = wb.defined_names.get("_xlnm.Print_Area")
91
- areas: dict[str, list[PrintArea]] = {}
92
- if defined:
93
- for sheet_name, range_str in defined.destinations:
94
- if sheet_name not in wb.sheetnames:
95
- continue
96
- # A single destination can contain multiple comma-separated ranges.
97
- for part in str(range_str).split(","):
98
- parsed = _parse_print_area_range(part)
99
- if not parsed:
100
- continue
101
- r1, c1, r2, c2 = parsed
102
- areas.setdefault(sheet_name, []).append(
103
- PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
104
- )
105
- # Fallback: some files carry sheet-level print_area without defined name.
106
- if not areas:
107
- for ws in wb.worksheets:
108
- pa = getattr(ws, "_print_area", None)
109
- if not pa:
110
- continue
111
- for part in str(pa).split(","):
112
- parsed = _parse_print_area_range(part)
113
- if not parsed:
114
- continue
115
- r1, c1, r2, c2 = parsed
116
- areas.setdefault(ws.title, []).append(
117
- PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
118
- )
119
- return areas
120
- finally:
121
- try:
122
- wb.close()
123
- except Exception:
124
- pass
125
-
126
-
127
- def _extract_print_areas_com(workbook: xw.Book) -> dict[str, list[PrintArea]]:
128
- """
129
- Extract print areas per sheet via xlwings/COM.
130
-
131
- Uses Sheet.PageSetup.PrintArea which may contain comma-separated ranges.
132
- """
133
- areas: dict[str, list[PrintArea]] = {}
134
- for sheet in workbook.sheets:
135
- try:
136
- raw = sheet.api.PageSetup.PrintArea or ""
137
- except Exception:
138
- continue
139
- if not raw:
140
- continue
141
- parts = str(raw).split(",")
142
- for part in parts:
143
- parsed = _parse_print_area_range(part, zero_based=True)
144
- if not parsed:
145
- continue
146
- r1, c1, r2, c2 = parsed
147
- areas.setdefault(sheet.name, []).append(
148
- PrintArea(r1=r1, c1=c1, r2=r2, c2=c2)
149
- )
150
- return areas
151
-
152
-
153
- def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
154
- """
155
- Strip sheet name from a range part when it matches the target sheet; otherwise None.
156
- """
157
- s = part.strip()
158
- if "!" not in s:
159
- return s
160
- sheet, rng = s.rsplit("!", 1)
161
- sheet = sheet.strip()
162
- if sheet.startswith("'") and sheet.endswith("'"):
163
- sheet = sheet[1:-1].replace("''", "'")
164
- return rng if sheet == ws_name else None
165
-
166
-
167
- def _split_csv_respecting_quotes(raw: str) -> list[str]:
168
- """
169
- Split a CSV-like string while keeping commas inside single quotes intact.
170
- """
171
- parts: list[str] = []
172
- buf: list[str] = []
173
- in_quote = False
174
- i = 0
175
- while i < len(raw):
176
- ch = raw[i]
177
- if ch == "'":
178
- if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
179
- buf.append("''")
180
- i += 2
181
- continue
182
- in_quote = not in_quote
183
- buf.append(ch)
184
- i += 1
185
- continue
186
- if ch == "," and not in_quote:
187
- parts.append("".join(buf).strip())
188
- buf = []
189
- i += 1
190
- continue
191
- buf.append(ch)
192
- i += 1
193
- if buf:
194
- parts.append("".join(buf).strip())
195
- return [p for p in parts if p]
196
-
197
-
198
- def _compute_auto_page_break_areas(workbook: xw.Book) -> dict[str, list[PrintArea]]:
199
- """
200
- Compute auto page-break rectangles per sheet using Excel COM.
201
- Falls back to empty dict on failure.
202
- """
203
- results: dict[str, list[PrintArea]] = {}
204
- for sheet in workbook.sheets:
205
- try:
206
- ws_api = cast(Any, sheet.api) # xlwings COM API; treated as Any
207
- original_display: bool | None = ws_api.DisplayPageBreaks
208
- ws_api.DisplayPageBreaks = True
209
- print_area = ws_api.PageSetup.PrintArea or ws_api.UsedRange.Address
210
- parts_raw = _split_csv_respecting_quotes(str(print_area))
211
- area_parts: list[str] = []
212
- for part in parts_raw:
213
- rng = _normalize_area_for_sheet(part, sheet.name)
214
- if rng:
215
- area_parts.append(rng)
216
- hpb = cast(Any, ws_api.HPageBreaks)
217
- vpb = cast(Any, ws_api.VPageBreaks)
218
- h_break_rows = [
219
- hpb.Item(i).Location.Row for i in range(1, int(hpb.Count) + 1)
220
- ]
221
- v_break_cols = [
222
- vpb.Item(i).Location.Column for i in range(1, int(vpb.Count) + 1)
223
- ]
224
- for addr in area_parts:
225
- range_obj = cast(Any, ws_api.Range(addr))
226
- min_row = int(range_obj.Row)
227
- max_row = min_row + int(range_obj.Rows.Count) - 1
228
- min_col = int(range_obj.Column)
229
- max_col = min_col + int(range_obj.Columns.Count) - 1
230
- rows = (
231
- [min_row]
232
- + [r for r in h_break_rows if min_row < r <= max_row]
233
- + [max_row + 1]
234
- )
235
- cols = (
236
- [min_col]
237
- + [c for c in v_break_cols if min_col < c <= max_col]
238
- + [max_col + 1]
239
- )
240
- for i in range(len(rows) - 1):
241
- r1, r2 = rows[i], rows[i + 1] - 1
242
- for j in range(len(cols) - 1):
243
- c1, c2 = cols[j], cols[j + 1] - 1
244
- c1_0 = c1 - 1
245
- c2_0 = c2 - 1
246
- results.setdefault(sheet.name, []).append(
247
- PrintArea(r1=r1, c1=c1_0, r2=r2, c2=c2_0)
248
- )
249
- if original_display is not None:
250
- ws_api.DisplayPageBreaks = original_display
251
- except Exception:
252
- try:
253
- if original_display is not None:
254
- ws_api.DisplayPageBreaks = original_display
255
- except Exception:
256
- pass
257
- continue
258
- return results
259
-
260
-
261
- def integrate_sheet_content(
262
- cell_data: dict[str, list[CellRow]],
263
- shape_data: dict[str, list[Shape]],
264
- workbook: xw.Book,
265
- mode: Literal["light", "standard", "verbose"] = "standard",
266
- print_area_data: dict[str, list[PrintArea]] | None = None,
267
- auto_page_break_data: dict[str, list[PrintArea]] | None = None,
268
- ) -> dict[str, SheetData]:
269
- """Integrate cells, shapes, charts, and tables into SheetData per sheet."""
270
- result: dict[str, SheetData] = {}
271
- for sheet_name, rows in cell_data.items():
272
- sheet_shapes = shape_data.get(sheet_name, [])
273
- sheet = workbook.sheets[sheet_name]
274
-
275
- sheet_model = SheetData(
276
- rows=rows,
277
- shapes=sheet_shapes,
278
- charts=[] if mode == "light" else get_charts(sheet, mode=mode),
279
- table_candidates=detect_tables(sheet),
280
- print_areas=print_area_data.get(sheet_name, []) if print_area_data else [],
281
- auto_print_areas=auto_page_break_data.get(sheet_name, [])
282
- if auto_page_break_data
283
- else [],
284
- )
285
-
286
- result[sheet_name] = sheet_model
287
- return result
6
+ from ..models import WorkbookData
7
+ from .pipeline import resolve_extraction_inputs, run_extraction_pipeline
288
8
 
289
9
 
290
10
  def extract_workbook( # noqa: C901
291
11
  file_path: str | Path,
292
12
  mode: Literal["light", "standard", "verbose"] = "standard",
293
13
  *,
294
- include_cell_links: bool = False,
295
- include_print_areas: bool = True,
14
+ include_cell_links: bool | None = None,
15
+ include_print_areas: bool | None = None,
296
16
  include_auto_page_breaks: bool = False,
17
+ include_colors_map: bool | None = None,
18
+ include_default_background: bool = False,
19
+ ignore_colors: set[str] | None = None,
20
+ include_merged_cells: bool | None = None,
297
21
  ) -> WorkbookData:
298
- """Extract workbook and return WorkbookData; fallback to cells+tables if Excel COM is unavailable."""
299
- if mode not in _ALLOWED_MODES:
300
- raise ValueError(f"Unsupported mode: {mode}")
301
-
302
- normalized_file_path = file_path if isinstance(file_path, Path) else Path(file_path)
303
-
304
- cell_data = (
305
- extract_sheet_cells_with_links(normalized_file_path)
306
- if include_cell_links
307
- else extract_sheet_cells(normalized_file_path)
22
+ """Extract workbook and return WorkbookData.
23
+
24
+ Falls back to cells+tables if Excel COM is unavailable.
25
+
26
+ Args:
27
+ file_path: Workbook path.
28
+ mode: Extraction mode.
29
+ include_cell_links: Whether to include cell hyperlinks; None uses mode defaults.
30
+ include_print_areas: Whether to include print areas; None defaults to True.
31
+ include_auto_page_breaks: Whether to include auto page breaks.
32
+ include_colors_map: Whether to include colors map; None uses mode defaults.
33
+ include_default_background: Whether to include default background color.
34
+ ignore_colors: Optional set of color keys to ignore.
35
+ include_merged_cells: Whether to include merged cell ranges; None uses mode defaults.
36
+
37
+ Returns:
38
+ Extracted WorkbookData.
39
+
40
+ Raises:
41
+ ValueError: If mode is unsupported.
42
+ """
43
+ inputs = resolve_extraction_inputs(
44
+ file_path,
45
+ mode=mode,
46
+ include_cell_links=include_cell_links,
47
+ include_print_areas=include_print_areas,
48
+ include_auto_page_breaks=include_auto_page_breaks,
49
+ include_colors_map=include_colors_map,
50
+ include_default_background=include_default_background,
51
+ ignore_colors=ignore_colors,
52
+ include_merged_cells=include_merged_cells,
308
53
  )
309
- print_area_data: dict[str, list[PrintArea]] = {}
310
- if include_print_areas:
311
- print_area_data = _extract_print_areas_openpyxl(normalized_file_path)
312
- auto_page_break_data: dict[str, list[PrintArea]] = {}
313
-
314
- def _cells_and_tables_only(reason: str) -> WorkbookData:
315
- sheets: dict[str, SheetData] = {}
316
- for sheet_name, rows in cell_data.items():
317
- try:
318
- tables = detect_tables_openpyxl(normalized_file_path, sheet_name)
319
- except Exception:
320
- tables = []
321
- sheets[sheet_name] = SheetData(
322
- rows=rows,
323
- shapes=[],
324
- charts=[],
325
- table_candidates=tables,
326
- print_areas=print_area_data.get(sheet_name, [])
327
- if include_print_areas
328
- else [],
329
- auto_print_areas=[],
330
- )
331
- logger.warning(
332
- "%s Falling back to cells+tables only; shapes and charts will be empty.",
333
- reason,
334
- )
335
- return WorkbookData(book_name=normalized_file_path.name, sheets=sheets)
336
-
337
- if mode == "light":
338
- return _cells_and_tables_only("Light mode selected.")
339
-
340
- if os.getenv("SKIP_COM_TESTS"):
341
- return _cells_and_tables_only(
342
- "SKIP_COM_TESTS is set; skipping COM/xlwings access."
343
- )
344
-
345
- try:
346
- wb, close_app = _open_workbook(normalized_file_path)
347
- except Exception as e:
348
- return _cells_and_tables_only(f"xlwings/Excel COM is unavailable. ({e!r})")
349
-
350
- try:
351
- try:
352
- shape_data = get_shapes_with_position(wb, mode=mode)
353
- if include_print_areas and not print_area_data:
354
- # openpyxl couldn't read (e.g., .xls). Try COM as a fallback.
355
- try:
356
- print_area_data = _extract_print_areas_com(wb)
357
- except Exception:
358
- print_area_data = {}
359
- if include_auto_page_breaks:
360
- try:
361
- auto_page_break_data = _compute_auto_page_break_areas(wb)
362
- except Exception:
363
- auto_page_break_data = {}
364
- merged = integrate_sheet_content(
365
- cell_data,
366
- shape_data,
367
- wb,
368
- mode=mode,
369
- print_area_data=print_area_data if include_print_areas else None,
370
- auto_page_break_data=auto_page_break_data
371
- if include_auto_page_breaks
372
- else None,
373
- )
374
- return WorkbookData(book_name=normalized_file_path.name, sheets=merged)
375
- except Exception as e:
376
- logger.warning(
377
- "Shape extraction failed; falling back to cells+tables. (%r)", e
378
- )
379
- return _cells_and_tables_only(f"Shape extraction failed ({e!r}).")
380
- finally:
381
- # Close only if we created the app to avoid shutting user sessions.
382
- try:
383
- if close_app:
384
- app = wb.app
385
- wb.close()
386
- app.quit()
387
- except Exception:
388
- pass
54
+ result = run_extraction_pipeline(inputs)
55
+ return result.workbook
@@ -0,0 +1,16 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+
5
+ from ..errors import FallbackReason
6
+
7
+
8
+ def log_fallback(logger: logging.Logger, reason: FallbackReason, message: str) -> None:
9
+ """Log a standardized fallback warning.
10
+
11
+ Args:
12
+ logger: Logger instance to emit the warning.
13
+ reason: Fallback reason code.
14
+ message: Human-readable detail message.
15
+ """
16
+ logger.warning("[%s] %s", reason.value, message)
@@ -0,0 +1,87 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from ..models import (
6
+ Arrow,
7
+ CellRow,
8
+ Chart,
9
+ MergedCell,
10
+ PrintArea,
11
+ Shape,
12
+ SheetData,
13
+ SmartArt,
14
+ WorkbookData,
15
+ )
16
+
17
+
18
+ @dataclass(frozen=True)
19
+ class SheetRawData:
20
+ """Raw, extracted sheet data before model conversion.
21
+
22
+ Attributes:
23
+ rows: Extracted cell rows.
24
+ shapes: Extracted shapes.
25
+ charts: Extracted charts.
26
+ table_candidates: Detected table ranges.
27
+ print_areas: Extracted print areas.
28
+ auto_print_areas: Extracted auto page-break areas.
29
+ colors_map: Mapping of color keys to (row, column) positions.
30
+ merged_cells: Extracted merged cell ranges.
31
+ """
32
+
33
+ rows: list[CellRow]
34
+ shapes: list[Shape | Arrow | SmartArt]
35
+ charts: list[Chart]
36
+ table_candidates: list[str]
37
+ print_areas: list[PrintArea]
38
+ auto_print_areas: list[PrintArea]
39
+ colors_map: dict[str, list[tuple[int, int]]]
40
+ merged_cells: list[MergedCell]
41
+
42
+
43
+ @dataclass(frozen=True)
44
+ class WorkbookRawData:
45
+ """Raw, extracted workbook data before model conversion.
46
+
47
+ Attributes:
48
+ book_name: Workbook file name.
49
+ sheets: Mapping of sheet name to raw sheet data.
50
+ """
51
+
52
+ book_name: str
53
+ sheets: dict[str, SheetRawData]
54
+
55
+
56
+ def build_sheet_data(raw: SheetRawData) -> SheetData:
57
+ """Build a SheetData model from raw sheet data.
58
+
59
+ Args:
60
+ raw: Raw sheet data.
61
+
62
+ Returns:
63
+ SheetData model instance.
64
+ """
65
+ return SheetData(
66
+ rows=raw.rows,
67
+ shapes=raw.shapes,
68
+ charts=raw.charts,
69
+ table_candidates=raw.table_candidates,
70
+ print_areas=raw.print_areas,
71
+ auto_print_areas=raw.auto_print_areas,
72
+ colors_map=raw.colors_map,
73
+ merged_cells=raw.merged_cells,
74
+ )
75
+
76
+
77
+ def build_workbook_data(raw: WorkbookRawData) -> WorkbookData:
78
+ """Build a WorkbookData model from raw workbook data.
79
+
80
+ Args:
81
+ raw: Raw workbook data.
82
+
83
+ Returns:
84
+ WorkbookData model instance.
85
+ """
86
+ sheets = {name: build_sheet_data(sheet) for name, sheet in raw.sheets.items()}
87
+ return WorkbookData(book_name=raw.book_name, sheets=sheets)