exstruct 0.2.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,241 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from typing import Literal
5
+
6
+ import xlwings as xw
7
+
8
+ from ..models import Chart, ChartSeries
9
+ from ..models.maps import XL_CHART_TYPE_MAP
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ def _extract_series_args_text(formula: str) -> str | None: # noqa: C901
15
+ """Extract the outer argument text from '=SERIES(...)'; return None if unmatched."""
16
+ if not formula:
17
+ return None
18
+ s = formula.strip()
19
+ if not s.upper().startswith("=SERIES"):
20
+ return None
21
+ try:
22
+ open_idx = s.index("(", s.upper().index("=SERIES"))
23
+ except ValueError:
24
+ return None
25
+ depth_paren = 0
26
+ depth_brace = 0
27
+ in_str = False
28
+ i = open_idx + 1
29
+ start = i
30
+ while i < len(s):
31
+ ch = s[i]
32
+ if in_str:
33
+ if ch == '"':
34
+ if i + 1 < len(s) and s[i + 1] == '"':
35
+ i += 2
36
+ continue
37
+ else:
38
+ in_str = False
39
+ i += 1
40
+ continue
41
+ else:
42
+ i += 1
43
+ continue
44
+ else:
45
+ if ch == '"':
46
+ in_str = True
47
+ i += 1
48
+ continue
49
+ elif ch == "(":
50
+ depth_paren += 1
51
+ elif ch == ")":
52
+ if depth_paren == 0:
53
+ return s[start:i].strip()
54
+ depth_paren -= 1
55
+ elif ch == "{":
56
+ depth_brace += 1
57
+ elif ch == "}":
58
+ if depth_brace > 0:
59
+ depth_brace -= 1
60
+ i += 1
61
+ return None
62
+
63
+
64
+ def _split_top_level_args(args_text: str) -> list[str]: # noqa: C901
65
+ """Split SERIES arguments at top-level separators (',' or ';')."""
66
+ if args_text is None:
67
+ return []
68
+ use_semicolon = (";" in args_text) and ("," not in args_text.split('"')[0])
69
+ sep_chars = (";",) if use_semicolon else (",",)
70
+ args: list[str] = []
71
+ buf: list[str] = []
72
+ depth_paren = 0
73
+ depth_brace = 0
74
+ in_str = False
75
+ i = 0
76
+ while i < len(args_text):
77
+ ch = args_text[i]
78
+ if in_str:
79
+ if ch == '"':
80
+ if i + 1 < len(args_text) and args_text[i + 1] == '"':
81
+ buf.append('"')
82
+ i += 2
83
+ continue
84
+ else:
85
+ in_str = False
86
+ i += 1
87
+ continue
88
+ else:
89
+ buf.append(ch)
90
+ i += 1
91
+ continue
92
+ else:
93
+ if ch == '"':
94
+ in_str = True
95
+ i += 1
96
+ continue
97
+ elif ch == "(":
98
+ depth_paren += 1
99
+ buf.append(ch)
100
+ i += 1
101
+ continue
102
+ elif ch == ")":
103
+ depth_paren = max(0, depth_paren - 1)
104
+ buf.append(ch)
105
+ i += 1
106
+ continue
107
+ elif ch == "{":
108
+ depth_brace += 1
109
+ buf.append(ch)
110
+ i += 1
111
+ continue
112
+ elif ch == "}":
113
+ depth_brace = max(0, depth_brace - 1)
114
+ buf.append(ch)
115
+ i += 1
116
+ continue
117
+ elif (ch in sep_chars) and depth_paren == 0 and depth_brace == 0:
118
+ args.append("".join(buf).strip())
119
+ buf = []
120
+ i += 1
121
+ continue
122
+ else:
123
+ buf.append(ch)
124
+ i += 1
125
+ continue
126
+ if buf or (args and args_text.endswith(sep_chars)):
127
+ args.append("".join(buf).strip())
128
+ return args
129
+
130
+
131
+ def _unquote_excel_string(s: str | None) -> str | None:
132
+ """Decode Excel-style quoted string; return None if not quoted."""
133
+ if s is None:
134
+ return None
135
+ st = s.strip()
136
+ if len(st) >= 2 and st[0] == '"' and st[-1] == '"':
137
+ inner = st[1:-1]
138
+ return inner.replace('""', '"')
139
+ return None
140
+
141
+
142
def parse_series_formula(formula: str) -> dict[str, str | None] | None:
    """Parse =SERIES into a dict of references; return None on failure.

    The arguments are read positionally: name, x values, y values, plot
    order, bubble sizes. Missing or blank arguments are reported as None.
    The name goes to ``name_literal`` (decoded) when it is a double-quoted
    string, otherwise to ``name_range`` as written.

    Returns:
        A dict with keys ``name_range``, ``x_range``, ``y_range``,
        ``plot_order``, ``bubble_size_range`` and ``name_literal``, or None
        when the SERIES argument text cannot be extracted.
    """
    args_text = _extract_series_args_text(formula)
    if args_text is None:
        return None
    parts = _split_top_level_args(args_text)

    def _arg(position: int) -> str | None:
        # Stripped argument at `position`, or None when absent or blank.
        if position >= len(parts):
            return None
        return parts[position].strip() or None

    name_part = _arg(0)
    name_literal = _unquote_excel_string(name_part)
    return {
        "name_range": name_part if name_literal is None else None,
        "x_range": _arg(1),
        "y_range": _arg(2),
        "plot_order": _arg(3),
        "bubble_size_range": _arg(4),
        "name_literal": name_literal,
    }
167
+
168
+
169
def get_charts(
    sheet: xw.Sheet, mode: Literal["light", "standard", "verbose"] = "standard"
) -> list[Chart]:
    """Parse charts in a sheet into Chart models; failed charts carry an error field.

    Args:
        sheet: xlwings sheet whose charts are read through the COM API.
        mode: Extraction mode; accepted for interface symmetry and not
            consulted inside this function.

    Returns:
        One Chart per entry of ``sheet.charts``. When reading a chart fails,
        the chart is still returned with whatever was collected so far and
        ``error`` set to a fixed message.
    """
    charts: list[Chart] = []
    for ch in sheet.charts:
        # Every per-chart value gets a default up front so an early COM
        # failure can neither leave a name unbound nor leak a value from the
        # previous chart into this one.
        series_list: list[ChartSeries] = []
        y_axis_title: str = ""
        y_axis_range: list[float] = []
        chart_type_label: str = "unknown"
        title: str | None = None
        chart_width: int | None = None
        chart_height: int | None = None
        error: str | None = None

        try:
            chart_com = sheet.api.ChartObjects(ch.name).Chart
            chart_type_num = chart_com.ChartType
            chart_type_label = XL_CHART_TYPE_MAP.get(
                chart_type_num, f"unknown_{chart_type_num}"
            )
            try:
                chart_width = int(ch.width)
                chart_height = int(ch.height)
            except Exception:
                # Size is optional; report neither value when either fails.
                chart_width = None
                chart_height = None

            for s in chart_com.SeriesCollection():
                parsed = parse_series_formula(getattr(s, "Formula", ""))
                name_range = parsed["name_range"] if parsed else None
                x_range = parsed["x_range"] if parsed else None
                y_range = parsed["y_range"] if parsed else None

                series_list.append(
                    ChartSeries(
                        name=s.Name,
                        name_range=name_range,
                        x_range=x_range,
                        y_range=y_range,
                    )
                )

            try:
                # Axes(2, 1): value axis (xlValue) of the primary group (xlPrimary).
                y_axis = chart_com.Axes(2, 1)
                if y_axis.HasTitle:
                    y_axis_title = y_axis.AxisTitle.Text
                # Convert here so a non-numeric scale is handled like any
                # other axis failure instead of raising later.
                y_axis_range = [
                    float(y_axis.MinimumScale),
                    float(y_axis.MaximumScale),
                ]
            except Exception:
                y_axis_title = ""
                y_axis_range = []

            title = chart_com.ChartTitle.Text if chart_com.HasTitle else None
        except Exception as e:
            logger.warning(
                "Failed to parse chart; returning with error string. (%r)", e
            )
            title = None
            error = "Failed to build chart JSON structure"

        charts.append(
            Chart(
                name=ch.name,
                chart_type=chart_type_label,
                title=title,
                y_axis_title=y_axis_title,
                y_axis_range=y_axis_range,
                w=chart_width,
                h=chart_height,
                series=series_list,
                l=int(ch.left),
                t=int(ch.top),
                error=error,
            )
        )
    return charts
@@ -0,0 +1,388 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ from pathlib import Path
6
+ from typing import Any, Literal, cast
7
+
8
+ from openpyxl import load_workbook
9
+ from openpyxl.utils import range_boundaries
10
+ import xlwings as xw
11
+
12
+ from ..models import CellRow, PrintArea, Shape, SheetData, WorkbookData
13
+ from .cells import (
14
+ detect_tables,
15
+ detect_tables_openpyxl,
16
+ extract_sheet_cells,
17
+ extract_sheet_cells_with_links,
18
+ )
19
+ from .charts import get_charts
20
+ from .shapes import get_shapes_with_position
21
+
22
+ logger = logging.getLogger(__name__)
23
+ _ALLOWED_MODES: set[str] = {"light", "standard", "verbose"}
24
+
25
+
26
def _find_open_workbook(file_path: Path) -> xw.Book | None:
    """Return an existing workbook if already open in Excel; otherwise None.

    Every book of every running xlwings app is compared by resolved path.
    Books whose path cannot be read or resolved are skipped, and any failure
    while enumerating apps yields None.
    """
    try:
        # Resolve once; the target does not change while scanning.
        target = file_path.resolve()
        for app in xw.apps:
            for wb in app.books:
                try:
                    if Path(wb.fullname).resolve() == target:
                        return wb
                except Exception:
                    continue
    except Exception:
        return None
    return None
39
+
40
+
41
def _open_workbook(file_path: Path) -> tuple[xw.Book, bool]:
    """
    Open workbook:
    - If already open, reuse and do not close Excel on exit.
    - Otherwise create invisible Excel (visible=False) and close when done.
    Returns (workbook, should_close_app).

    Raises whatever xlwings raises when Excel cannot be started or the file
    cannot be opened; in the latter case the app created here is quit first
    so no hidden Excel instance is left behind.
    """
    existing = _find_open_workbook(file_path)
    if existing is not None:
        return existing, False
    app = xw.App(add_book=False, visible=False)
    try:
        wb = app.books.open(str(file_path))
    except Exception:
        # The caller never receives this app, so it must be shut down here.
        try:
            app.quit()
        except Exception as quit_error:
            logger.debug(
                "Failed to quit Excel after open error. (%r)", quit_error
            )
        raise
    return wb, True
54
+
55
+
56
def _parse_print_area_range(
    range_str: str, *, zero_based: bool = True
) -> tuple[int, int, int, int] | None:
    """
    Parse an Excel range string into (r1, c1, r2, c2). Returns None on failure.

    A leading sheet qualifier ("Sheet1!" or "'My Sheet'!") is dropped before
    parsing. With ``zero_based`` every coordinate is shifted down by one;
    otherwise the values are returned as parsed.

    None is returned for blank input, for text that is not a valid range, and
    for whole-row / whole-column ranges (e.g. "A:C" or "1:5"), which have no
    bounded rectangle.
    """
    cleaned = range_str.strip()
    if not cleaned:
        return None
    if "!" in cleaned:
        # Split at the last "!": the range part never contains one, while a
        # quoted sheet name may.
        cleaned = cleaned.rpartition("!")[2]
    try:
        min_col, min_row, max_col, max_row = range_boundaries(cleaned)
    except Exception:
        return None
    # range_boundaries reports unbounded sides as None; arithmetic on those
    # would raise instead of honouring the "None on failure" contract.
    if None in (min_col, min_row, max_col, max_row):
        return None
    if zero_based:
        return (min_row - 1, min_col - 1, max_row - 1, max_col - 1)
    return (min_row, min_col, max_row, max_col)
74
+
75
+
76
def _extract_print_areas_openpyxl(  # noqa: C901
    file_path: Path,
) -> dict[str, list[PrintArea]]:
    """
    Extract print areas per sheet using openpyxl defined names.

    The workbook-level ``_xlnm.Print_Area`` defined name is read first. When
    it yields nothing, each worksheet's ``_print_area`` attribute is tried.
    A workbook that cannot be loaded yields an empty dict.

    Returns {sheet_name: [PrintArea, ...]}.
    """
    try:
        wb = load_workbook(file_path, data_only=True, read_only=True)
    except Exception:
        return {}

    areas: dict[str, list[PrintArea]] = {}

    def _collect(owner: str, raw_ranges: object) -> None:
        # One value can hold several comma-separated ranges; unparsable
        # pieces are skipped.
        for piece in str(raw_ranges).split(","):
            bounds = _parse_print_area_range(piece)
            if not bounds:
                continue
            top, left, bottom, right = bounds
            areas.setdefault(owner, []).append(
                PrintArea(r1=top, c1=left, r2=bottom, c2=right)
            )

    try:
        defined = wb.defined_names.get("_xlnm.Print_Area")
        if defined:
            for sheet_name, range_str in defined.destinations:
                if sheet_name in wb.sheetnames:
                    _collect(sheet_name, range_str)
        # Fallback: some files carry sheet-level print_area without defined name.
        if not areas:
            for ws in wb.worksheets:
                sheet_area = getattr(ws, "_print_area", None)
                if sheet_area:
                    _collect(ws.title, sheet_area)
        return areas
    finally:
        try:
            wb.close()
        except Exception:
            pass
125
+
126
+
127
def _extract_print_areas_com(workbook: xw.Book) -> dict[str, list[PrintArea]]:
    """
    Extract print areas per sheet via xlwings/COM.

    Reads Sheet.PageSetup.PrintArea, which may contain comma-separated
    ranges. Sheets whose print area cannot be read or is empty are skipped,
    as are individual ranges that fail to parse.
    """
    collected: dict[str, list[PrintArea]] = {}
    for sheet in workbook.sheets:
        try:
            raw = sheet.api.PageSetup.PrintArea or ""
        except Exception:
            continue
        if not raw:
            continue
        for piece in str(raw).split(","):
            bounds = _parse_print_area_range(piece, zero_based=True)
            if not bounds:
                continue
            top, left, bottom, right = bounds
            collected.setdefault(sheet.name, []).append(
                PrintArea(r1=top, c1=left, r2=bottom, c2=right)
            )
    return collected
151
+
152
+
153
+ def _normalize_area_for_sheet(part: str, ws_name: str) -> str | None:
154
+ """
155
+ Strip sheet name from a range part when it matches the target sheet; otherwise None.
156
+ """
157
+ s = part.strip()
158
+ if "!" not in s:
159
+ return s
160
+ sheet, rng = s.rsplit("!", 1)
161
+ sheet = sheet.strip()
162
+ if sheet.startswith("'") and sheet.endswith("'"):
163
+ sheet = sheet[1:-1].replace("''", "'")
164
+ return rng if sheet == ws_name else None
165
+
166
+
167
+ def _split_csv_respecting_quotes(raw: str) -> list[str]:
168
+ """
169
+ Split a CSV-like string while keeping commas inside single quotes intact.
170
+ """
171
+ parts: list[str] = []
172
+ buf: list[str] = []
173
+ in_quote = False
174
+ i = 0
175
+ while i < len(raw):
176
+ ch = raw[i]
177
+ if ch == "'":
178
+ if in_quote and i + 1 < len(raw) and raw[i + 1] == "'":
179
+ buf.append("''")
180
+ i += 2
181
+ continue
182
+ in_quote = not in_quote
183
+ buf.append(ch)
184
+ i += 1
185
+ continue
186
+ if ch == "," and not in_quote:
187
+ parts.append("".join(buf).strip())
188
+ buf = []
189
+ i += 1
190
+ continue
191
+ buf.append(ch)
192
+ i += 1
193
+ if buf:
194
+ parts.append("".join(buf).strip())
195
+ return [p for p in parts if p]
196
+
197
+
198
def _compute_auto_page_break_areas(workbook: xw.Book) -> dict[str, list[PrintArea]]:
    """
    Compute auto page-break rectangles per sheet using Excel COM.

    For each sheet the print area (or the used range when no print area is
    set) is cut along the sheet's horizontal and vertical page breaks, and
    every resulting rectangle is recorded. DisplayPageBreaks is switched on
    while reading and put back to its previous value afterwards.

    A sheet that fails part-way keeps the rectangles recorded before the
    failure; a sheet that fails immediately contributes nothing.
    """
    results: dict[str, list[PrintArea]] = {}
    for sheet in workbook.sheets:
        # Reset per sheet so the restore step below can never act on the
        # previous sheet's handle or value.
        ws_api: Any = None
        original_display: bool | None = None
        try:
            ws_api = cast(Any, sheet.api)  # xlwings COM API; treated as Any
            original_display = ws_api.DisplayPageBreaks
            ws_api.DisplayPageBreaks = True
            print_area = ws_api.PageSetup.PrintArea or ws_api.UsedRange.Address
            area_parts: list[str] = []
            for part in _split_csv_respecting_quotes(str(print_area)):
                rng = _normalize_area_for_sheet(part, sheet.name)
                if rng:
                    area_parts.append(rng)
            hpb = cast(Any, ws_api.HPageBreaks)
            vpb = cast(Any, ws_api.VPageBreaks)
            # COM collections are 1-indexed.
            h_break_rows = [
                hpb.Item(i).Location.Row for i in range(1, int(hpb.Count) + 1)
            ]
            v_break_cols = [
                vpb.Item(i).Location.Column for i in range(1, int(vpb.Count) + 1)
            ]
            for addr in area_parts:
                range_obj = cast(Any, ws_api.Range(addr))
                min_row = int(range_obj.Row)
                max_row = min_row + int(range_obj.Rows.Count) - 1
                min_col = int(range_obj.Column)
                max_col = min_col + int(range_obj.Columns.Count) - 1
                # Boundaries: area start, each break strictly inside the
                # area, and one past the area end.
                rows = (
                    [min_row]
                    + [r for r in h_break_rows if min_row < r <= max_row]
                    + [max_row + 1]
                )
                cols = (
                    [min_col]
                    + [c for c in v_break_cols if min_col < c <= max_col]
                    + [max_col + 1]
                )
                for r1, next_row in zip(rows, rows[1:]):
                    r2 = next_row - 1
                    for c1, next_col in zip(cols, cols[1:]):
                        c2 = next_col - 1
                        # NOTE(review): rows are passed through as returned
                        # by COM while columns are shifted down by one;
                        # _parse_print_area_range(zero_based=True) shifts
                        # both. Confirm which convention PrintArea expects.
                        results.setdefault(sheet.name, []).append(
                            PrintArea(r1=r1, c1=c1 - 1, r2=r2, c2=c2 - 1)
                        )
        except Exception as e:
            logger.debug("Auto page-break computation failed for a sheet. (%r)", e)
        finally:
            if ws_api is not None and original_display is not None:
                try:
                    ws_api.DisplayPageBreaks = original_display
                except Exception:
                    pass
    return results
259
+
260
+
261
def integrate_sheet_content(
    cell_data: dict[str, list[CellRow]],
    shape_data: dict[str, list[Shape]],
    workbook: xw.Book,
    mode: Literal["light", "standard", "verbose"] = "standard",
    print_area_data: dict[str, list[PrintArea]] | None = None,
    auto_page_break_data: dict[str, list[PrintArea]] | None = None,
) -> dict[str, SheetData]:
    """Build one SheetData per sheet in `cell_data`.

    Each entry combines the given rows and shapes with charts read from the
    workbook (skipped in "light" mode), detected table candidates, and the
    sheet's print areas / auto page-break areas when those mappings are
    supplied. Sheets missing from a mapping get an empty list.
    """
    print_lookup = print_area_data or {}
    auto_lookup = auto_page_break_data or {}
    merged: dict[str, SheetData] = {}
    for sheet_name, rows in cell_data.items():
        shapes = shape_data.get(sheet_name, [])
        sheet = workbook.sheets[sheet_name]
        charts = get_charts(sheet, mode=mode) if mode != "light" else []
        merged[sheet_name] = SheetData(
            rows=rows,
            shapes=shapes,
            charts=charts,
            table_candidates=detect_tables(sheet),
            print_areas=print_lookup.get(sheet_name, []),
            auto_print_areas=auto_lookup.get(sheet_name, []),
        )
    return merged
288
+
289
+
290
def extract_workbook(  # noqa: C901
    file_path: str | Path,
    mode: Literal["light", "standard", "verbose"] = "standard",
    *,
    include_cell_links: bool = False,
    include_print_areas: bool = True,
    include_auto_page_breaks: bool = False,
) -> WorkbookData:
    """Extract workbook and return WorkbookData.

    Cells (and, if requested, print areas) are read from the file first.
    Shapes, charts and auto page breaks need Excel through xlwings/COM; when
    that is unavailable, skipped, or fails, the result falls back to cells
    and tables only.

    Args:
        file_path: Path of the workbook to read.
        mode: "light" skips Excel entirely; other modes use it.
        include_cell_links: Use the cell extractor that also returns links.
        include_print_areas: Include print areas per sheet.
        include_auto_page_breaks: Include rectangles derived from Excel's
            automatic page breaks (Excel path only).

    Raises:
        ValueError: If `mode` is not one of the supported modes.
    """
    if mode not in _ALLOWED_MODES:
        raise ValueError(f"Unsupported mode: {mode}")

    normalized_file_path = file_path if isinstance(file_path, Path) else Path(file_path)

    cell_data = (
        extract_sheet_cells_with_links(normalized_file_path)
        if include_cell_links
        else extract_sheet_cells(normalized_file_path)
    )
    print_area_data: dict[str, list[PrintArea]] = {}
    if include_print_areas:
        print_area_data = _extract_print_areas_openpyxl(normalized_file_path)
    auto_page_break_data: dict[str, list[PrintArea]] = {}

    def _cells_and_tables_only(reason: str) -> WorkbookData:
        # Fallback result built without Excel: no shapes, charts or auto
        # page breaks. Reads print_area_data as it stands when called.
        sheets: dict[str, SheetData] = {}
        for sheet_name, rows in cell_data.items():
            try:
                tables = detect_tables_openpyxl(normalized_file_path, sheet_name)
            except Exception:
                tables = []
            sheets[sheet_name] = SheetData(
                rows=rows,
                shapes=[],
                charts=[],
                table_candidates=tables,
                print_areas=print_area_data.get(sheet_name, [])
                if include_print_areas
                else [],
                auto_print_areas=[],
            )
        logger.warning(
            "%s Falling back to cells+tables only; shapes and charts will be empty.",
            reason,
        )
        return WorkbookData(book_name=normalized_file_path.name, sheets=sheets)

    if mode == "light":
        return _cells_and_tables_only("Light mode selected.")

    if os.getenv("SKIP_COM_TESTS"):
        return _cells_and_tables_only(
            "SKIP_COM_TESTS is set; skipping COM/xlwings access."
        )

    try:
        wb, close_app = _open_workbook(normalized_file_path)
    except Exception as e:
        return _cells_and_tables_only(f"xlwings/Excel COM is unavailable. ({e!r})")

    try:
        try:
            shape_data = get_shapes_with_position(wb, mode=mode)
            if include_print_areas and not print_area_data:
                # openpyxl couldn't read (e.g., .xls). Try COM as a fallback.
                try:
                    print_area_data = _extract_print_areas_com(wb)
                except Exception:
                    print_area_data = {}
            if include_auto_page_breaks:
                try:
                    auto_page_break_data = _compute_auto_page_break_areas(wb)
                except Exception:
                    auto_page_break_data = {}
            merged = integrate_sheet_content(
                cell_data,
                shape_data,
                wb,
                mode=mode,
                print_area_data=print_area_data if include_print_areas else None,
                auto_page_break_data=auto_page_break_data
                if include_auto_page_breaks
                else None,
            )
            return WorkbookData(book_name=normalized_file_path.name, sheets=merged)
        except Exception as e:
            logger.warning(
                "Shape extraction failed; falling back to cells+tables. (%r)", e
            )
            return _cells_and_tables_only(f"Shape extraction failed ({e!r}).")
    finally:
        # Close only if we created the app to avoid shutting user sessions.
        if close_app:
            try:
                app = wb.app
            except Exception:
                app = None
            # Close and quit are guarded separately so a failing close
            # cannot leave the hidden Excel process running.
            try:
                wb.close()
            except Exception as close_error:
                logger.debug("Failed to close workbook. (%r)", close_error)
            if app is not None:
                try:
                    app.quit()
                except Exception as quit_error:
                    logger.debug("Failed to quit Excel. (%r)", quit_error)