exstruct 0.2.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,555 @@
1
+ from __future__ import annotations
2
+
3
+ import importlib
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+ import re
8
+ from types import ModuleType
9
+ from typing import Literal, cast
10
+
11
+ from openpyxl.utils import range_boundaries
12
+
13
+ from ..errors import MissingDependencyError, OutputError, SerializationError
14
+ from ..models import CellRow, Chart, PrintArea, PrintAreaView, Shape, WorkbookData
15
+ from ..models.types import JsonStructure
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def dict_without_empty_values(obj: object) -> JsonStructure:
    """Strip empty entries out of a nested dict/list/model structure.

    An entry is skipped when it compares equal to ``None``, ``""``, ``[]`` or
    ``{}``; falsy scalars such as ``0`` or ``False`` are kept.  The emptiness
    check runs *before* recursing into an entry, so a container that only
    becomes empty through pruning is kept (as ``{}`` or ``[]``).

    Supported model instances are first converted with
    ``model_dump(exclude_none=True)`` and then pruned like a dict.  Any other
    object is returned unchanged.
    """

    def _keep(candidate: object) -> bool:
        # Equality-based membership test against the four "empty" markers.
        return candidate not in [None, "", [], {}]

    if isinstance(obj, dict):
        pruned_map = {}
        for key, value in obj.items():
            if _keep(value):
                pruned_map[key] = dict_without_empty_values(value)
        return pruned_map

    if isinstance(obj, list):
        pruned_items = []
        for item in obj:
            if _keep(item):
                pruned_items.append(dict_without_empty_values(item))
        return pruned_items

    model_types = (WorkbookData, CellRow, Chart, PrintArea, PrintAreaView, Shape)
    if isinstance(obj, model_types):
        dumped = obj.model_dump(exclude_none=True)
        return dict_without_empty_values(dumped)

    return cast(JsonStructure, obj)
38
+
39
+
40
+ def _write_text(path: Path, text: str) -> None:
41
+ """Write UTF-8 text to disk, wrapping IO errors."""
42
+ try:
43
+ path.write_text(text, encoding="utf-8")
44
+ except Exception as exc:
45
+ raise OutputError(f"Failed to write output to '{path}'.") from exc
46
+
47
+
48
def save_as_json(
    model: WorkbookData, path: Path, *, pretty: bool = False, indent: int | None = None
) -> None:
    """Serialize *model* as JSON and write the result to *path*.

    ``pretty`` and ``indent`` are passed through unchanged to
    ``serialize_workbook``.
    """
    _write_text(
        path,
        serialize_workbook(model, fmt="json", pretty=pretty, indent=indent),
    )
53
+
54
+
55
def save_as_yaml(model: WorkbookData, path: Path) -> None:
    """Serialize *model* as YAML via ``serialize_workbook`` and write it to *path*."""
    _write_text(path, serialize_workbook(model, fmt="yaml"))
58
+
59
+
60
def save_as_toon(model: WorkbookData, path: Path) -> None:
    """Serialize *model* as TOON via ``serialize_workbook`` and write it to *path*."""
    _write_text(path, serialize_workbook(model, fmt="toon"))
63
+
64
+
65
+ def _sanitize_sheet_filename(name: str) -> str:
66
+ """Make a sheet name safe for filesystem usage."""
67
+ safe = re.sub(r"[\\/:*?\"<>|]", "_", name)
68
+ return safe or "sheet"
69
+
70
+
71
+ def _parse_range_zero_based(range_str: str) -> tuple[int, int, int, int] | None:
72
+ """
73
+ Parse an Excel range string into zero-based (r1, c1, r2, c2) bounds.
74
+ Returns None on failure.
75
+ """
76
+ cleaned = range_str.strip()
77
+ if not cleaned:
78
+ return None
79
+ if "!" in cleaned:
80
+ cleaned = cleaned.split("!", 1)[1]
81
+ try:
82
+ min_col, min_row, max_col, max_row = range_boundaries(cleaned)
83
+ except Exception:
84
+ return None
85
+ return (min_row - 1, min_col - 1, max_row - 1, max_col - 1)
86
+
87
+
88
+ def _row_in_area(row: CellRow, area: PrintArea) -> bool:
89
+ return area.r1 <= row.r <= area.r2
90
+
91
+
92
def _filter_row_to_area(
    row: CellRow, area: PrintArea, *, normalize: bool = False
) -> CellRow | None:
    """Clip a row's cells and links to the column span of *area*.

    Returns ``None`` when the row lies outside the area's row span, or when
    no cell and no link falls inside the area's column span.  Column keys
    that cannot be converted with ``int()`` are skipped.

    With ``normalize=True`` the row index and the column keys are rebased so
    that ``(area.r1, area.c1)`` becomes ``(0, 0)``; otherwise the original
    index and key strings are kept.
    """
    if not _row_in_area(row, area):
        return None

    def _clip(by_column):
        # Keep only entries whose integer column index is inside the area.
        kept = {}
        for column_key, item in by_column.items():
            try:
                column = int(column_key)
            except Exception:
                continue
            if column < area.c1 or column > area.c2:
                continue
            out_key = str(column - area.c1) if normalize else column_key
            kept[out_key] = item
        return kept

    cells: dict[str, int | float | str] = _clip(row.c)
    links: dict[str, str] = _clip(row.links) if row.links else {}

    if not (cells or links):
        return None

    if normalize:
        row_index = row.r - area.r1
    else:
        row_index = row.r
    return CellRow(r=row_index, c=cells, links=links or None)
125
+
126
+
127
+ def _filter_table_candidates_to_area(
128
+ table_candidates: list[str], area: PrintArea
129
+ ) -> list[str]:
130
+ filtered: list[str] = []
131
+ for candidate in table_candidates:
132
+ bounds = _parse_range_zero_based(candidate)
133
+ if not bounds:
134
+ continue
135
+ r1, c1, r2, c2 = bounds
136
+ if r1 >= area.r1 and r2 <= area.r2 and c1 >= area.c1 and c2 <= area.c2:
137
+ filtered.append(candidate)
138
+ return filtered
139
+
140
+
141
+ def _area_to_px_rect(
142
+ area: PrintArea, *, col_px: int = 64, row_px: int = 20
143
+ ) -> tuple[int, int, int, int]:
144
+ """
145
+ Convert a cell-based print area to an approximate pixel rectangle (l, t, r, b).
146
+ Uses default Excel-like cell sizes; accuracy is highest when shapes/charts are COM-extracted.
147
+ """
148
+ left = area.c1 * col_px
149
+ top = area.r1 * row_px
150
+ right = (area.c2 + 1) * col_px
151
+ bottom = (area.r2 + 1) * row_px
152
+ return left, top, right, bottom
153
+
154
+
155
+ def _rects_overlap(a: tuple[int, int, int, int], b: tuple[int, int, int, int]) -> bool:
156
+ """Return True if rectangles (l, t, r, b) overlap."""
157
+ return not (a[2] <= b[0] or a[0] >= b[2] or a[3] <= b[1] or a[1] >= b[3])
158
+
159
+
160
def _filter_shapes_to_area(shapes: list[Shape], area: PrintArea) -> list[Shape]:
    """Select the shapes that fall inside the pixel rectangle of *area*.

    A shape with a known size is kept when its rectangle overlaps the area
    rectangle.  A shape whose ``w`` or ``h`` is ``None`` is treated as the
    point ``(l, t)`` and kept when that point lies within the area rectangle,
    edges included.  Input order is preserved.
    """
    area_rect = _area_to_px_rect(area)
    left, top, right, bottom = area_rect

    def _belongs(shape: Shape) -> bool:
        if shape.w is None or shape.h is None:
            # Size unknown: fall back to a point-in-rectangle test.
            return left <= shape.l <= right and top <= shape.t <= bottom
        extent = (shape.l, shape.t, shape.l + shape.w, shape.t + shape.h)
        return _rects_overlap(area_rect, extent)

    return [shape for shape in shapes if _belongs(shape)]
176
+
177
+
178
def _filter_charts_to_area(charts: list[Chart], area: PrintArea) -> list[Chart]:
    """Select the charts whose rectangle overlaps the pixel rectangle of *area*.

    Charts whose ``w`` or ``h`` is ``None`` are always excluded.  Input order
    is preserved.
    """
    area_rect = _area_to_px_rect(area)
    return [
        chart
        for chart in charts
        if chart.w is not None
        and chart.h is not None
        and _rects_overlap(
            area_rect,
            (chart.l, chart.t, chart.l + chart.w, chart.t + chart.h),
        )
    ]
188
+
189
+
190
+ def _iter_area_views(
191
+ workbook: WorkbookData,
192
+ *,
193
+ area_attr: Literal["print_areas", "auto_print_areas"],
194
+ normalize: bool,
195
+ include_shapes: bool,
196
+ include_charts: bool,
197
+ include_shape_size: bool,
198
+ include_chart_size: bool,
199
+ ) -> dict[str, list[PrintAreaView]]:
200
+ views: dict[str, list[PrintAreaView]] = {}
201
+ for sheet_name, sheet in workbook.sheets.items():
202
+ areas: list[PrintArea] = getattr(sheet, area_attr)
203
+ if not areas:
204
+ continue
205
+ sheet_views: list[PrintAreaView] = []
206
+ for area in areas:
207
+ rows_in_area: list[CellRow] = []
208
+ for row in sheet.rows:
209
+ filtered_row = _filter_row_to_area(row, area, normalize=normalize)
210
+ if filtered_row:
211
+ rows_in_area.append(filtered_row)
212
+ area_tables = _filter_table_candidates_to_area(sheet.table_candidates, area)
213
+ area_shapes = (
214
+ _filter_shapes_to_area(sheet.shapes, area) if include_shapes else []
215
+ )
216
+ if not include_shape_size:
217
+ area_shapes = [
218
+ s.model_copy(update={"w": None, "h": None}) for s in area_shapes
219
+ ]
220
+ area_charts = (
221
+ _filter_charts_to_area(sheet.charts, area) if include_charts else []
222
+ )
223
+ if not include_chart_size:
224
+ area_charts = [
225
+ c.model_copy(update={"w": None, "h": None}) for c in area_charts
226
+ ]
227
+ sheet_views.append(
228
+ PrintAreaView(
229
+ book_name=workbook.book_name,
230
+ sheet_name=sheet_name,
231
+ area=area,
232
+ shapes=area_shapes,
233
+ charts=area_charts,
234
+ rows=rows_in_area,
235
+ table_candidates=area_tables,
236
+ )
237
+ )
238
+ if sheet_views:
239
+ views[sheet_name] = sheet_views
240
+ return views
241
+
242
+
243
def build_print_area_views(
    workbook: WorkbookData,
    *,
    normalize: bool = False,
    include_shapes: bool = True,
    include_charts: bool = True,
    include_shape_size: bool = True,
    include_chart_size: bool = True,
) -> dict[str, list[PrintAreaView]]:
    """Create ``PrintAreaView`` objects for the ``print_areas`` of every sheet.

    All keyword options are forwarded unchanged to ``_iter_area_views``.

    Returns:
        Mapping of sheet name to the ordered list of views for that sheet.
    """
    views = _iter_area_views(
        workbook,
        area_attr="print_areas",
        normalize=normalize,
        include_shapes=include_shapes,
        include_shape_size=include_shape_size,
        include_charts=include_charts,
        include_chart_size=include_chart_size,
    )
    return views
265
+
266
+
267
def save_print_area_views(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
    include_shapes: bool = True,
    include_charts: bool = True,
    include_shape_size: bool = True,
    include_chart_size: bool = True,
) -> dict[str, Path]:
    """Write every print area of the workbook to its own file.

    Files are named
    ``<sanitized sheet>_area<N>_r<r1>-<r2>_c<c1>-<c2><suffix>`` inside
    *output_dir*, which is created on demand.  When the workbook has no print
    areas nothing is written and an empty mapping is returned.

    Args:
        fmt: Output format, case-insensitive; ``yml`` is an alias for ``yaml``.
        pretty: JSON only; with ``indent`` unset it selects an indent of 2.
        indent: JSON only; explicit indent passed to the view's ``to_json``.
        normalize, include_shapes, include_charts, include_shape_size,
        include_chart_size: Forwarded to ``build_print_area_views``.

    Returns:
        Mapping of area key (``"<sheet>#<N>"``, N starting at 1) to the path
        that was written.

    Raises:
        SerializationError: If *fmt* is not a supported format.
    """
    chosen_format = fmt.lower()
    if chosen_format == "yml":
        chosen_format = "yaml"
    extensions = {"json": ".json", "yaml": ".yaml", "toon": ".toon"}
    if chosen_format not in extensions:
        raise SerializationError(
            f"Unsupported print-area export format '{fmt}'. Allowed: json, yaml, yml, toon."
        )

    views = build_print_area_views(
        workbook,
        normalize=normalize,
        include_shapes=include_shapes,
        include_charts=include_charts,
        include_shape_size=include_shape_size,
        include_chart_size=include_chart_size,
    )
    if not views:
        logger.info("No print areas found; skipping export to %s", output_dir)
        return {}

    output_dir.mkdir(parents=True, exist_ok=True)
    extension = extensions[chosen_format]
    written: dict[str, Path] = {}

    for sheet_name, sheet_views in views.items():
        stem = _sanitize_sheet_filename(sheet_name)
        for number, view in enumerate(sheet_views, start=1):
            bounds = view.area
            target = output_dir / (
                f"{stem}"
                f"_area{number}_r{bounds.r1}-{bounds.r2}_c{bounds.c1}-{bounds.c2}{extension}"
            )
            if chosen_format == "json":
                json_indent = 2 if pretty and indent is None else indent
                text = view.to_json(pretty=pretty, indent=json_indent)
            elif chosen_format == "yaml":
                text = view.to_yaml()
            elif chosen_format == "toon":
                text = view.to_toon()
            else:
                # Defensive: the format was already validated above.
                raise SerializationError(
                    f"Unsupported print-area export format '{fmt}'. Allowed: json, yaml, yml, toon."
                )
            _write_text(target, text)
            written[f"{sheet_name}#{number}"] = target
    return written
332
+
333
+
334
def save_auto_page_break_views(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
    include_shapes: bool = True,
    include_charts: bool = True,
    include_shape_size: bool = True,
    include_chart_size: bool = True,
) -> dict[str, Path]:
    """Write every auto page-break area of the workbook to its own file.

    Areas are read from each sheet's ``auto_print_areas``.  Files are named
    ``<sanitized sheet>_auto_page<N>_r<r1>-<r2>_c<c1>-<c2><suffix>`` inside
    *output_dir*, which is created on demand.  When no such areas exist
    nothing is written and an empty mapping is returned.

    Args:
        fmt: Output format, case-insensitive; ``yml`` is an alias for ``yaml``.
        pretty: JSON only; with ``indent`` unset it selects an indent of 2.
        indent: JSON only; explicit indent passed to the view's ``to_json``.
        normalize, include_shapes, include_charts, include_shape_size,
        include_chart_size: Forwarded to ``_iter_area_views``.

    Returns:
        Mapping of area key (``"<sheet>#auto#<N>"``, N starting at 1) to the
        path that was written.

    Raises:
        SerializationError: If *fmt* is not a supported format.
    """
    chosen_format = fmt.lower()
    if chosen_format == "yml":
        chosen_format = "yaml"
    extensions = {"json": ".json", "yaml": ".yaml", "toon": ".toon"}
    if chosen_format not in extensions:
        raise SerializationError(
            f"Unsupported auto page-break export format '{fmt}'. Allowed: json, yaml, yml, toon."
        )

    views = _iter_area_views(
        workbook,
        area_attr="auto_print_areas",
        normalize=normalize,
        include_shapes=include_shapes,
        include_charts=include_charts,
        include_shape_size=include_shape_size,
        include_chart_size=include_chart_size,
    )
    if not views:
        logger.info("No auto page-break areas found; skipping export to %s", output_dir)
        return {}

    output_dir.mkdir(parents=True, exist_ok=True)
    extension = extensions[chosen_format]
    written: dict[str, Path] = {}

    for sheet_name, sheet_views in views.items():
        stem = _sanitize_sheet_filename(sheet_name)
        for number, view in enumerate(sheet_views, start=1):
            bounds = view.area
            target = output_dir / (
                f"{stem}"
                f"_auto_page{number}_r{bounds.r1}-{bounds.r2}_c{bounds.c1}-{bounds.c2}{extension}"
            )
            if chosen_format == "json":
                json_indent = 2 if pretty and indent is None else indent
                text = view.to_json(pretty=pretty, indent=json_indent)
            elif chosen_format == "yaml":
                text = view.to_yaml()
            elif chosen_format == "toon":
                text = view.to_toon()
            else:
                # Defensive: the format was already validated above.
                raise SerializationError(
                    f"Unsupported auto page-break export format '{fmt}'. Allowed: json, yaml, yml, toon."
                )
            _write_text(target, text)
            written[f"{sheet_name}#auto#{number}"] = target
    return written
400
+
401
+
402
def serialize_workbook(
    model: WorkbookData,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
) -> str:
    """Render *model* as a string in the requested format; nothing is written to disk.

    The model is dumped with ``exclude_none=True`` and passed through
    ``dict_without_empty_values`` before encoding.

    Args:
        fmt: Output format, case-insensitive; ``yml`` is an alias for ``yaml``.
        pretty: JSON only; with ``indent`` unset it selects an indent of 2.
        indent: JSON only; explicit indent passed to ``json.dumps``.

    Raises:
        SerializationError: If *fmt* is not a supported format.
    """
    chosen_format = fmt.lower()
    if chosen_format == "yml":
        chosen_format = "yaml"
    payload = dict_without_empty_values(model.model_dump(exclude_none=True))

    if chosen_format == "json":
        json_indent = indent
        if pretty and indent is None:
            json_indent = 2
        return json.dumps(payload, ensure_ascii=False, indent=json_indent)

    if chosen_format == "yaml":
        yaml = _require_yaml()
        dumped = yaml.safe_dump(
            payload,
            allow_unicode=True,
            sort_keys=False,
            indent=2,
        )
        return str(dumped)

    if chosen_format == "toon":
        toon = _require_toon()
        return str(toon.encode(payload))

    raise SerializationError(
        f"Unsupported export format '{fmt}'. Allowed: json, yaml, yml, toon."
    )
438
+
439
+
440
def save_sheets_as_json(
    workbook: WorkbookData,
    output_dir: Path,
    *,
    pretty: bool = False,
    indent: int | None = None,
) -> dict[str, Path]:
    """Write one JSON file per sheet into *output_dir*.

    Each file holds ``book_name``, ``sheet_name`` and the dumped sheet under
    ``sheet``, with empty values pruned.  The directory is created on demand
    and files are named ``<sanitized sheet name>.json``.

    Args:
        pretty: With ``indent`` unset it selects an indent of 2.
        indent: Explicit indent passed to ``json.dumps``.

    Returns:
        Mapping of sheet name to the path that was written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    json_indent = 2 if pretty and indent is None else indent

    written: dict[str, Path] = {}
    for sheet_name, sheet_data in workbook.sheets.items():
        document = {
            "book_name": workbook.book_name,
            "sheet_name": sheet_name,
            "sheet": sheet_data.model_dump(exclude_none=True),
        }
        payload = dict_without_empty_values(document)
        target = output_dir / f"{_sanitize_sheet_filename(sheet_name)}.json"
        encoded = json.dumps(payload, ensure_ascii=False, indent=json_indent)
        _write_text(target, encoded)
        written[sheet_name] = target
    return written
468
+
469
+
470
def save_sheets(
    workbook: WorkbookData,
    output_dir: Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
) -> dict[str, Path]:
    """Write one file per sheet into *output_dir* as JSON, YAML or TOON.

    Each file holds ``book_name``, ``sheet_name`` and the dumped sheet under
    ``sheet``, with empty values pruned.  The directory is created on demand
    and files are named ``<sanitized sheet name><suffix>``.

    Args:
        fmt: Output format, case-insensitive; ``yml`` is an alias for ``yaml``.
        pretty: JSON only; with ``indent`` unset it selects an indent of 2.
        indent: JSON only; explicit indent passed to ``json.dumps``.

    Returns:
        Mapping of sheet name to the path that was written.

    Raises:
        ValueError: If *fmt* is not a supported format.
    """
    chosen_format = fmt.lower()
    if chosen_format == "yml":
        chosen_format = "yaml"
    extensions = {"json": ".json", "yaml": ".yaml", "toon": ".toon"}
    if chosen_format not in extensions:
        raise ValueError(f"Unsupported sheet export format: {fmt}")

    output_dir.mkdir(parents=True, exist_ok=True)
    extension = extensions[chosen_format]

    written: dict[str, Path] = {}
    for sheet_name, sheet_data in workbook.sheets.items():
        document = {
            "book_name": workbook.book_name,
            "sheet_name": sheet_name,
            "sheet": sheet_data.model_dump(exclude_none=True),
        }
        payload = dict_without_empty_values(document)
        target = output_dir / f"{_sanitize_sheet_filename(sheet_name)}{extension}"

        if chosen_format == "json":
            json_indent = 2 if pretty and indent is None else indent
            text = json.dumps(payload, ensure_ascii=False, indent=json_indent)
        elif chosen_format == "yaml":
            yaml = _require_yaml()
            text = str(
                yaml.safe_dump(payload, allow_unicode=True, sort_keys=False, indent=2)
            )
        elif chosen_format == "toon":
            toon = _require_toon()
            text = str(toon.encode(payload))
        else:
            # Defensive: the format was already validated above.
            raise SerializationError(
                f"Unsupported sheet export format '{chosen_format}'. Allowed: json, yaml, yml, toon."
            )

        _write_text(target, text)
        written[sheet_name] = target
    return written
522
+
523
+
524
def _require_yaml() -> ModuleType:
    """Import and return the ``yaml`` module.

    Raises:
        MissingDependencyError: If the import fails with ``ImportError``.
    """
    try:
        return importlib.import_module("yaml")
    except ImportError as import_error:
        hint = "YAML export requires pyyaml. Install it via `pip install pyyaml` or add the 'yaml' extra."
        raise MissingDependencyError(hint) from import_error
532
+
533
+
534
def _require_toon() -> ModuleType:
    """Import and return the ``toon`` module.

    Raises:
        MissingDependencyError: If the import fails with ``ImportError``.
    """
    try:
        return importlib.import_module("toon")
    except ImportError as import_error:
        hint = "TOON export requires python-toon. Install it via `pip install python-toon` or add the 'toon' extra."
        raise MissingDependencyError(hint) from import_error
542
+
543
+
544
# Names exported by ``from <module> import *``.
__all__ = [
    # Pruning helper
    "dict_without_empty_values",
    # Whole-workbook writers
    "save_as_json",
    "save_as_yaml",
    "save_as_toon",
    # Per-sheet writers
    "save_sheets",
    "save_sheets_as_json",
    # Print-area / auto page-break views
    "build_print_area_views",
    "save_print_area_views",
    "save_auto_page_break_views",
    # In-memory serialization
    "serialize_workbook",
]