exstruct 0.2.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +387 -0
- exstruct/cli/availability.py +49 -0
- exstruct/cli/main.py +134 -0
- exstruct/core/__init__.py +0 -0
- exstruct/core/cells.py +1039 -0
- exstruct/core/charts.py +241 -0
- exstruct/core/integrate.py +388 -0
- exstruct/core/shapes.py +275 -0
- exstruct/engine.py +643 -0
- exstruct/errors.py +35 -0
- exstruct/io/__init__.py +555 -0
- exstruct/models/__init__.py +335 -0
- exstruct/models/maps.py +335 -0
- exstruct/models/types.py +8 -0
- exstruct/py.typed +0 -0
- exstruct/render/__init__.py +118 -0
- exstruct-0.2.80.dist-info/METADATA +435 -0
- exstruct-0.2.80.dist-info/RECORD +20 -0
- exstruct-0.2.80.dist-info/WHEEL +4 -0
- exstruct-0.2.80.dist-info/entry_points.txt +3 -0
exstruct/engine.py
ADDED
|
@@ -0,0 +1,643 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from contextlib import contextmanager
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Literal, TextIO, TypedDict, cast
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
10
|
+
|
|
11
|
+
from .core import cells as _cells
|
|
12
|
+
from .core.cells import set_table_detection_params
|
|
13
|
+
from .core.integrate import extract_workbook
|
|
14
|
+
from .io import (
|
|
15
|
+
save_auto_page_break_views,
|
|
16
|
+
save_print_area_views,
|
|
17
|
+
save_sheets,
|
|
18
|
+
serialize_workbook,
|
|
19
|
+
)
|
|
20
|
+
from .models import SheetData, WorkbookData
|
|
21
|
+
from .render import export_pdf, export_sheet_images
|
|
22
|
+
|
|
23
|
+
# Closed set of extraction modes. ExStructEngine.extract() rejects any other
# value with ValueError.
ExtractionMode = Literal["light", "standard", "verbose"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TableParams(TypedDict, total=False):
    """Optional overrides for the table-detection heuristics.

    Every key is optional (``total=False``). ExStructEngine unpacks this mapping
    into ``set_table_detection_params(**table_params)``, so the key names must
    match that function's keyword arguments.
    """

    # NOTE(review): the per-key meanings below are inferred from the names only;
    # confirm against exstruct.core.cells before relying on them.
    table_score_threshold: float  # presumably the minimum score to accept a candidate
    density_min: float  # presumably the minimum non-empty cell density
    coverage_min: float  # presumably the minimum row/column coverage
    min_nonempty_cells: int  # presumably the minimum count of non-empty cells
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class StructOptions:
    """
    Extraction-time options for ExStructEngine.

    Attributes:
        mode: Extraction mode. One of "light", "standard", "verbose".
            - light: cells + table candidates only (no COM, shapes/charts empty)
            - standard: texted shapes + arrows + charts (if COM available)
            - verbose: all shapes (width/height), charts, table candidates
        table_params: Optional dict passed to `set_table_detection_params(**table_params)`
            before extraction. ExStructEngine.extract() applies it to the
            module-level detection config only for the duration of the call and
            restores the previous values afterwards, so the overrides do not
            leak into later extractions.
        include_cell_links: Forwarded to `extract_workbook(include_cell_links=...)`.
            None means auto: True for "verbose", False for the other modes.
    """

    mode: ExtractionMode = "standard"
    table_params: TableParams | None = (
        None  # forwarded to set_table_detection_params if provided
    )
    include_cell_links: bool | None = None  # None -> auto: verbose=True, others=False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class FormatOptions(BaseModel):
    """Formatting options for serialization.

    ExStructEngine uses these values as the defaults for serialize()/export();
    each of them can be overridden per call.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    # "yaml" and "yml" are both accepted; ExStructEngine.process() treats them alike.
    fmt: Literal["json", "yaml", "yml", "toon"] = Field(
        default="json", description="Serialization format."
    )
    pretty: bool = Field(default=False, description="Pretty-print JSON output.")
    # None is passed through to the serializer unchanged; the "defaults to 2"
    # rule from the description is not applied in this class.
    indent: int | None = Field(
        default=None,
        description="Indent width for JSON (defaults to 2 when pretty is True).",
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class FilterOptions(BaseModel):
    """Include/exclude filters for output.

    The tri-state fields (``bool | None``) use None to mean "decide from the
    extraction mode"; ExStructEngine resolves them when filtering a workbook.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    include_rows: bool = Field(default=True, description="Include cell rows.")
    include_shapes: bool = Field(default=True, description="Include shapes.")
    # None -> resolved by ExStructEngine: True only in "verbose" mode.
    include_shape_size: bool | None = Field(
        default=None,
        description="Include shape size; None -> auto (verbose=True, others=False).",
    )
    include_charts: bool = Field(default=True, description="Include charts.")
    # None -> resolved by ExStructEngine: True only in "verbose" mode.
    include_chart_size: bool | None = Field(
        default=None,
        description="Include chart size; None -> auto (verbose=True, others=False).",
    )
    include_tables: bool = Field(
        default=True, description="Include table candidate ranges."
    )
    # None -> resolved by ExStructEngine: False only in "light" mode.
    include_print_areas: bool | None = Field(
        default=None,
        description="Include print areas; None -> auto (light=False, others=True).",
    )
    # Plain bool (no auto mode): when True, ExStructEngine.extract() also
    # requests auto page-break extraction.
    include_auto_print_areas: bool = Field(
        default=False, description="Include COM-computed auto page-break areas."
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class DestinationOptions(BaseModel):
    """Destinations for optional side outputs.

    Directory fields accept either ``str`` or ``Path``; consumers normalize them
    to ``Path`` before use. None disables the corresponding output.
    """

    # NOTE(review): arbitrary_types_allowed is presumably needed for the TextIO
    # field below, which is not a pydantic-native type - confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    sheets_dir: str | Path | None = Field(
        default=None, description="Directory to write per-sheet files."
    )
    print_areas_dir: str | Path | None = Field(
        default=None, description="Directory to write per-print-area files."
    )
    # Setting this also makes ExStructEngine.extract() request auto page breaks.
    auto_page_breaks_dir: str | Path | None = Field(
        default=None, description="Directory to write auto page-break files."
    )
    # Used by ExStructEngine.export() only when no output path and no side-output
    # directory is in effect.
    stream: TextIO | None = Field(
        default=None, description="Stream override for primary output (stdout/file)."
    )
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
class OutputOptions(BaseModel):
    """
    Output-time options for ExStructEngine.

    - format: serialization format/indent.
    - filters: include/exclude flags (rows/shapes/charts/tables/print_areas, size flags).
    - destinations: side outputs (per-sheet, per-print-area, auto page-break,
      stream override).

    Legacy flat fields (fmt, pretty, indent, include_*, sheets_dir, print_areas_dir,
    auto_page_breaks_dir, stream) are still accepted and normalized into the nested
    structures. When a nested group is passed explicitly it takes precedence and
    the legacy flat fields belonging to that group are discarded.
    """

    format: FormatOptions = Field(
        default_factory=FormatOptions, description="Formatting options."
    )
    filters: FilterOptions = Field(
        default_factory=FilterOptions, description="Include/exclude flags."
    )
    destinations: DestinationOptions = Field(
        default_factory=DestinationOptions, description="Side output destinations."
    )

    @model_validator(mode="before")
    @classmethod
    def _coerce_legacy(cls, values: object) -> object:
        """Fold legacy flat keyword arguments into the nested option groups.

        Args:
            values: Raw constructor input. Anything that is not a dict is
                returned untouched so pydantic can validate it as usual.

        Returns:
            A new dict in which legacy flat keys have been removed and, where a
            nested group was not given explicitly, collected under that group.
            The caller's dict is never modified.
        """
        if not isinstance(values, dict):
            return values

        # Work on a copy: popping from the caller's mapping would strip keys
        # from a dict the caller may reuse.
        merged: dict[str, object] = dict(values)

        legacy_groups: tuple[tuple[str, tuple[str, ...]], ...] = (
            ("format", ("fmt", "pretty", "indent")),
            (
                "filters",
                (
                    "include_rows",
                    "include_shapes",
                    "include_shape_size",
                    "include_charts",
                    "include_chart_size",
                    "include_tables",
                    "include_print_areas",
                    # Previously missing: the flat flag was silently dropped.
                    "include_auto_print_areas",
                ),
            ),
            (
                "destinations",
                ("sheets_dir", "print_areas_dir", "auto_page_breaks_dir", "stream"),
            ),
        )

        for group, legacy_keys in legacy_groups:
            supplied: dict[str, object] = {}
            for key in legacy_keys:
                # Always pop so legacy keys never reach the model as unknown
                # fields; None means "not given" and lets the default apply.
                value = merged.pop(key, None)
                if value is not None:
                    supplied[key] = value
            # An explicitly passed nested group wins over legacy flat fields.
            if supplied and group not in merged:
                merged[group] = supplied
        return merged

    @staticmethod
    def _to_path(value: str | Path | None) -> Path | None:
        """Return *value* as a Path, passing None through unchanged."""
        if value is None or isinstance(value, Path):
            return value
        return Path(value)

    # Legacy compatibility properties: read-only views onto the nested groups.
    @property
    def fmt(self) -> Literal["json", "yaml", "yml", "toon"]:
        return self.format.fmt

    @property
    def pretty(self) -> bool:
        return self.format.pretty

    @property
    def indent(self) -> int | None:
        return self.format.indent

    @property
    def include_rows(self) -> bool:
        return self.filters.include_rows

    @property
    def include_shapes(self) -> bool:
        return self.filters.include_shapes

    @property
    def include_shape_size(self) -> bool | None:
        return self.filters.include_shape_size

    @property
    def include_charts(self) -> bool:
        return self.filters.include_charts

    @property
    def include_chart_size(self) -> bool | None:
        return self.filters.include_chart_size

    @property
    def include_tables(self) -> bool:
        return self.filters.include_tables

    @property
    def include_print_areas(self) -> bool | None:
        return self.filters.include_print_areas

    @property
    def include_auto_print_areas(self) -> bool:
        return self.filters.include_auto_print_areas

    @property
    def sheets_dir(self) -> Path | None:
        return self._to_path(self.destinations.sheets_dir)

    @property
    def print_areas_dir(self) -> Path | None:
        return self._to_path(self.destinations.print_areas_dir)

    @property
    def stream(self) -> TextIO | None:
        return self.destinations.stream

    @property
    def auto_page_breaks_dir(self) -> Path | None:
        return self._to_path(self.destinations.auto_page_breaks_dir)
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class ExStructEngine:
    """
    Configurable engine for ExStruct extraction and export.

    Instances are meant to be treated as immutable (a convention; nothing
    enforces it); override options per call if needed.

    Key behaviors:
    - StructOptions: extraction mode and optional table detection params.
    - OutputOptions: serialization format/pretty-print, include/exclude filters,
      per-sheet/per-print-area output dirs, etc.
    - Main methods:
        extract(path, mode=None) -> WorkbookData
            - Modes: light/standard/verbose
            - light: COM-free; cells + tables + print areas only (shapes/charts empty)
        serialize(workbook, ...) -> str
            - Applies include_* filters, then serializes
        export(workbook, ...)
            - Writes to file/stdout; optionally per-sheet and per-print-area files
        process(file_path, ...)
            - One-shot extract->export (CLI equivalent), with optional PDF/PNG

    The "auto" output filters (shape/chart size, print areas) depend on the
    extraction mode. serialize() and export() accept an optional ``mode`` so a
    per-call mode override is honored; process() forwards its own ``mode``.
    """

    def __init__(
        self,
        options: StructOptions | None = None,
        output: OutputOptions | None = None,
    ) -> None:
        """Create an engine; omitted option groups fall back to their defaults."""
        self.options = options if options is not None else StructOptions()
        self.output = output if output is not None else OutputOptions()

    @staticmethod
    def from_defaults() -> ExStructEngine:
        """Factory to create an engine with default options."""
        return ExStructEngine()

    def _apply_table_params(self) -> None:
        """Apply table_params to the detection config without restoring it.

        extract() uses _table_params_scope() instead, which restores the
        previous values afterwards.
        """
        if self.options.table_params:
            set_table_detection_params(**self.options.table_params)

    @contextmanager
    def _table_params_scope(self) -> Iterator[None]:
        """
        Temporarily apply table_params and restore previous global config afterward.

        The detection config is module-level state in ``exstruct.core.cells``.
        NOTE(review): extractions running concurrently in other threads would
        presumably observe the temporary values - confirm before parallelizing.
        """
        if not self.options.table_params:
            yield
            return
        # Snapshot first so the finally-block can undo the override even when
        # extraction raises.
        prev = cast(TableParams, dict(_cells._DETECTION_CONFIG))
        set_table_detection_params(**self.options.table_params)
        try:
            yield
        finally:
            set_table_detection_params(**prev)

    def _effective_mode(self, mode: ExtractionMode | None = None) -> ExtractionMode:
        """Return the per-call mode override, or the engine's configured mode."""
        return mode or self.options.mode

    def _resolve_size_flags(
        self, mode: ExtractionMode | None = None
    ) -> tuple[bool, bool]:
        """
        Determine whether to include Shape/Chart size fields in output.

        Auto: verbose -> include, others -> exclude.

        Args:
            mode: Mode the data was extracted with; defaults to StructOptions.mode.

        Returns:
            ``(include_shape_size, include_chart_size)``.
        """
        auto = self._effective_mode(mode) == "verbose"
        filters = self.output.filters
        include_shape_size = (
            filters.include_shape_size
            if filters.include_shape_size is not None
            else auto
        )
        include_chart_size = (
            filters.include_chart_size
            if filters.include_chart_size is not None
            else auto
        )
        return include_shape_size, include_chart_size

    def _include_print_areas(self, mode: ExtractionMode | None = None) -> bool:
        """
        Decide whether to include print areas in output.

        Auto: light -> False, others -> True.

        Args:
            mode: Mode the data was extracted with; defaults to StructOptions.mode.
        """
        configured = self.output.filters.include_print_areas
        if configured is None:
            return self._effective_mode(mode) != "light"
        return configured

    def _include_auto_print_areas(self) -> bool:
        """
        Decide whether to include auto page-break areas in output.

        Defaults to False unless explicitly enabled.
        """
        return self.output.filters.include_auto_print_areas

    def _filter_sheet(
        self,
        sheet: SheetData,
        include_auto_override: bool | None = None,
        *,
        mode: ExtractionMode | None = None,
    ) -> SheetData:
        """Return a copy of *sheet* with the configured include_* filters applied.

        Args:
            sheet: Sheet to filter; it is not modified.
            include_auto_override: When not None, overrides the
                include_auto_print_areas filter for this call.
            mode: Mode used to resolve the "auto" filters; defaults to
                StructOptions.mode.
        """
        filters = self.output.filters
        include_shape_size, include_chart_size = self._resolve_size_flags(mode)
        include_print_areas = self._include_print_areas(mode)
        include_auto_print_areas = (
            include_auto_override
            if include_auto_override is not None
            else self._include_auto_print_areas()
        )
        # Size fields are hidden by blanking w/h on a copy.
        size_blank = {"w": None, "h": None}
        return SheetData(
            rows=sheet.rows if filters.include_rows else [],
            shapes=[
                s if include_shape_size else s.model_copy(update=size_blank)
                for s in sheet.shapes
            ]
            if filters.include_shapes
            else [],
            charts=[
                c if include_chart_size else c.model_copy(update=size_blank)
                for c in sheet.charts
            ]
            if filters.include_charts
            else [],
            table_candidates=sheet.table_candidates if filters.include_tables else [],
            print_areas=sheet.print_areas if include_print_areas else [],
            auto_print_areas=sheet.auto_print_areas if include_auto_print_areas else [],
        )

    def _filter_workbook(
        self,
        wb: WorkbookData,
        *,
        include_auto_override: bool | None = None,
        mode: ExtractionMode | None = None,
    ) -> WorkbookData:
        """Apply _filter_sheet to every sheet and return a new WorkbookData."""
        filtered = {
            name: self._filter_sheet(
                sheet, include_auto_override=include_auto_override, mode=mode
            )
            for name, sheet in wb.sheets.items()
        }
        return WorkbookData(book_name=wb.book_name, sheets=filtered)

    @staticmethod
    def _ensure_path(path: str | Path) -> Path:
        """Normalize a string or Path input to a Path instance.

        Args:
            path: Path-like input value.

        Returns:
            Path constructed from the given value.
        """
        return path if isinstance(path, Path) else Path(path)

    @classmethod
    def _ensure_optional_path(cls, path: str | Path | None) -> Path | None:
        """Normalize an optional path-like value to Path when provided.

        Args:
            path: Optional path-like input value.

        Returns:
            Normalized Path when provided, otherwise None.
        """
        if path is None:
            return None
        return cls._ensure_path(path)

    def extract(
        self,
        file_path: str | Path,
        *,
        mode: ExtractionMode | None = None,
        include_auto_page_breaks: bool | None = None,
    ) -> WorkbookData:
        """
        Extract a workbook and return normalized workbook data.

        Args:
            file_path: Path to the .xlsx/.xlsm/.xls file to extract.
            mode: Extraction mode; defaults to the engine's StructOptions.mode.
                - light: COM-free; cells, table candidates, and print areas only.
                - standard: Shapes with text/arrows plus charts; print areas included;
                  size fields retained but hidden from default output.
                - verbose: All shapes (with size) and charts (with size).
            include_auto_page_breaks: Whether to request auto page-break
                extraction. None (default) derives it from OutputOptions: enabled
                when include_auto_print_areas is set or an auto_page_breaks_dir
                destination is configured.

        Raises:
            ValueError: If the chosen mode is not light/standard/verbose.
        """
        chosen_mode = self._effective_mode(mode)
        if chosen_mode not in ("light", "standard", "verbose"):
            raise ValueError(f"Unsupported mode: {chosen_mode}")
        include_links = (
            self.options.include_cell_links
            if self.options.include_cell_links is not None
            else chosen_mode == "verbose"
        )
        include_print_areas = True  # Extract print areas even in light mode
        if include_auto_page_breaks is None:
            include_auto_page_breaks = (
                self.output.filters.include_auto_print_areas
                or self.output.destinations.auto_page_breaks_dir is not None
            )
        normalized_file_path = self._ensure_path(file_path)
        with self._table_params_scope():
            return extract_workbook(
                normalized_file_path,
                mode=chosen_mode,
                include_cell_links=include_links,
                include_print_areas=include_print_areas,
                include_auto_page_breaks=include_auto_page_breaks,
            )

    def serialize(
        self,
        data: WorkbookData,
        *,
        fmt: Literal["json", "yaml", "yml", "toon"] | None = None,
        pretty: bool | None = None,
        indent: int | None = None,
        mode: ExtractionMode | None = None,
    ) -> str:
        """
        Serialize a workbook after applying include/exclude filters.

        Args:
            data: Workbook to serialize after filtering.
            fmt: Serialization format; defaults to OutputOptions.fmt.
            pretty: Whether to pretty-print JSON output.
            indent: Indentation to use when pretty-printing JSON.
            mode: Mode *data* was extracted with, used to resolve the "auto"
                filters; defaults to StructOptions.mode.
        """
        formats = self.output.format
        return serialize_workbook(
            self._filter_workbook(data, mode=mode),
            fmt=fmt or formats.fmt,
            pretty=formats.pretty if pretty is None else pretty,
            indent=formats.indent if indent is None else indent,
        )

    def export(
        self,
        data: WorkbookData,
        output_path: str | Path | None = None,
        *,
        fmt: Literal["json", "yaml", "yml", "toon"] | None = None,
        pretty: bool | None = None,
        indent: int | None = None,
        sheets_dir: str | Path | None = None,
        print_areas_dir: str | Path | None = None,
        auto_page_breaks_dir: str | Path | None = None,
        stream: TextIO | None = None,
        mode: ExtractionMode | None = None,
    ) -> None:
        """
        Write filtered workbook data to a file or stream.

        Includes optional per-sheet and per-print-area outputs when destinations are
        provided. The stream/stdout is written only when neither an output path
        nor any side-output directory is in effect.

        Args:
            data: Workbook to serialize and write.
            output_path: Target file path (str or Path); writes to stdout when None.
            fmt: Serialization format; defaults to OutputOptions.fmt.
            pretty: Whether to pretty-print JSON output.
            indent: Indentation to use when pretty-printing JSON.
            sheets_dir: Directory for per-sheet outputs when provided (str or Path).
            print_areas_dir: Directory for per-print-area outputs when provided (str or Path).
            auto_page_breaks_dir: Directory for auto page-break outputs (str or Path; COM
                environments only).
            stream: Stream override when output_path is None.
            mode: Mode *data* was extracted with, used to resolve the "auto"
                filters; defaults to StructOptions.mode.
        """
        formats = self.output.format
        filters = self.output.filters
        dests = self.output.destinations

        text = self.serialize(data, fmt=fmt, pretty=pretty, indent=indent, mode=mode)
        chosen_fmt = fmt or formats.fmt
        use_pretty = formats.pretty if pretty is None else pretty
        use_indent = formats.indent if indent is None else indent
        target_stream = stream if stream is not None else dests.stream

        # Per-call arguments win over the engine-level destinations.
        out_path = self._ensure_optional_path(output_path)
        sheets_path = self._ensure_optional_path(
            sheets_dir if sheets_dir is not None else dests.sheets_dir
        )
        print_areas_path = self._ensure_optional_path(
            print_areas_dir if print_areas_dir is not None else dests.print_areas_dir
        )
        auto_breaks_path = self._ensure_optional_path(
            auto_page_breaks_dir
            if auto_page_breaks_dir is not None
            else dests.auto_page_breaks_dir
        )

        if out_path is not None:
            out_path.write_text(text, encoding="utf-8")
        elif sheets_path is None and print_areas_path is None and auto_breaks_path is None:
            import sys

            sink = target_stream if target_stream is not None else sys.stdout
            sink.write(text)
            if not text.endswith("\n"):
                sink.write("\n")

        if sheets_path is not None:
            save_sheets(
                self._filter_workbook(data, mode=mode),
                sheets_path,
                fmt=chosen_fmt,
                pretty=use_pretty,
                indent=use_indent,
            )

        if print_areas_path is not None and self._include_print_areas(mode):
            include_shape_size, include_chart_size = self._resolve_size_flags(mode)
            save_print_area_views(
                self._filter_workbook(data, mode=mode),
                print_areas_path,
                fmt=chosen_fmt,
                pretty=use_pretty,
                indent=use_indent,
                include_shapes=filters.include_shapes,
                include_charts=filters.include_charts,
                include_shape_size=include_shape_size,
                include_chart_size=include_chart_size,
            )

        if auto_breaks_path is not None:
            include_shape_size, include_chart_size = self._resolve_size_flags(mode)
            # Force auto areas on: this output is built from them even when the
            # include_auto_print_areas filter is off for the main output.
            save_auto_page_break_views(
                self._filter_workbook(data, include_auto_override=True, mode=mode),
                auto_breaks_path,
                fmt=chosen_fmt,
                pretty=use_pretty,
                indent=use_indent,
                include_shapes=filters.include_shapes,
                include_charts=filters.include_charts,
                include_shape_size=include_shape_size,
                include_chart_size=include_chart_size,
            )

    def process(
        self,
        file_path: str | Path,
        output_path: str | Path | None = None,
        *,
        out_fmt: str | None = None,
        image: bool = False,
        pdf: bool = False,
        dpi: int = 72,
        mode: ExtractionMode | None = None,
        pretty: bool | None = None,
        indent: int | None = None,
        sheets_dir: str | Path | None = None,
        print_areas_dir: str | Path | None = None,
        auto_page_breaks_dir: str | Path | None = None,
        stream: TextIO | None = None,
    ) -> None:
        """
        One-shot extract->export wrapper (CLI equivalent) with optional PDF/PNG output.

        Args:
            file_path: Input Excel workbook path (str or Path).
            output_path: Target file path (str or Path); writes to stdout when None.
            out_fmt: Serialization format for structured output.
            image: Whether to export PNGs alongside structured output. A PDF is
                written as well whenever this is True.
            pdf: Whether to export a PDF snapshot alongside structured output.
            dpi: DPI to use when rendering images.
            mode: Extraction mode; defaults to the engine's StructOptions.mode.
                It is also used to resolve the "auto" output filters.
            pretty: Whether to pretty-print JSON output.
            indent: Indentation to use when pretty-printing JSON.
            sheets_dir: Directory for per-sheet structured outputs (str or Path).
            print_areas_dir: Directory for per-print-area structured outputs (str or Path).
            auto_page_breaks_dir: Directory for auto page-break outputs (str or Path).
            stream: Stream override when writing to stdout.
        """
        normalized_file_path = self._ensure_path(file_path)
        normalized_output_path = self._ensure_optional_path(output_path)
        normalized_sheets_dir = self._ensure_optional_path(sheets_dir)
        normalized_print_areas_dir = self._ensure_optional_path(print_areas_dir)
        normalized_auto_page_breaks_dir = self._ensure_optional_path(
            auto_page_breaks_dir
        )

        wb = self.extract(
            normalized_file_path,
            mode=mode,
            # A per-call auto page-break directory needs the data extracted too;
            # None keeps the engine-level default.
            include_auto_page_breaks=(
                True if normalized_auto_page_breaks_dir is not None else None
            ),
        )
        chosen_fmt = out_fmt or self.output.format.fmt
        self.export(
            wb,
            output_path=normalized_output_path,
            fmt=chosen_fmt,  # type: ignore[arg-type]
            pretty=pretty,
            indent=indent,
            sheets_dir=normalized_sheets_dir,
            print_areas_dir=normalized_print_areas_dir,
            auto_page_breaks_dir=normalized_auto_page_breaks_dir,
            stream=stream,
            mode=mode,
        )

        if pdf or image:
            # The PDF sits next to the structured output (or the input workbook
            # when no output path is given); only the stem matters because the
            # suffix is replaced.
            base_target = (
                normalized_output_path
                if normalized_output_path is not None
                else normalized_file_path
            )
            pdf_path = base_target.with_suffix(".pdf")
            export_pdf(normalized_file_path, pdf_path)
            if image:
                images_dir = pdf_path.parent / f"{pdf_path.stem}_images"
                export_sheet_images(normalized_file_path, images_dir, dpi=dpi)
|
exstruct/errors.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Project-specific exception hierarchy for ExStruct."""

# The docstring must be the first statement to become the module's __doc__;
# a `from __future__` import is still allowed directly after it.
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ExstructError(Exception):
    """Root of the ExStruct exception hierarchy.

    Catch this type to handle any ExStruct-specific failure.
    """
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfigError(ExstructError):
    """Signals invalid configuration or parameters supplied by the user."""
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ExtractionError(ExstructError):
    """Signals that extracting data from a workbook failed."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class SerializationError(ExstructError):
    """Signals a serialization failure, including a request for an unsupported format."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class MissingDependencyError(ExstructError):
    """Signals that the requested operation needs an optional dependency that is not installed."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RenderError(ExstructError):
    """Signals that rendering to PDF or PNG failed."""
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class OutputError(ExstructError):
    """Signals that writing an output to disk or to a stream failed."""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PrintAreaError(ExstructError, ValueError):
    """Signals a failure in print-area specific processing.

    Also derives from ValueError so existing ``except ValueError`` handlers
    keep catching it.
    """
|