exstruct 0.2.80__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exstruct/__init__.py ADDED
@@ -0,0 +1,387 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Literal, TextIO
6
+
7
+ from .core.cells import set_table_detection_params
8
+ from .core.integrate import extract_workbook
9
+ from .engine import (
10
+ DestinationOptions,
11
+ ExStructEngine,
12
+ FilterOptions,
13
+ OutputOptions,
14
+ StructOptions,
15
+ )
16
+ from .errors import (
17
+ ConfigError,
18
+ ExstructError,
19
+ MissingDependencyError,
20
+ PrintAreaError,
21
+ RenderError,
22
+ SerializationError,
23
+ )
24
+ from .io import (
25
+ save_as_json,
26
+ save_as_toon,
27
+ save_as_yaml,
28
+ save_auto_page_break_views,
29
+ save_print_area_views,
30
+ save_sheets,
31
+ serialize_workbook,
32
+ )
33
+ from .models import (
34
+ CellRow,
35
+ Chart,
36
+ ChartSeries,
37
+ PrintArea,
38
+ PrintAreaView,
39
+ Shape,
40
+ SheetData,
41
+ WorkbookData,
42
+ )
43
+ from .render import export_pdf, export_sheet_images
44
+
45
+ logger = logging.getLogger(__name__)
46
+
47
# Public API of the package. Each name is listed exactly once; the original
# list repeated "export_auto_page_breaks" at the end.
__all__ = [
    "extract",
    "export",
    "export_sheets",
    "export_sheets_as",
    "export_print_areas_as",
    "export_auto_page_breaks",
    "export_pdf",
    "export_sheet_images",
    "ExstructError",
    "ConfigError",
    "MissingDependencyError",
    "RenderError",
    "SerializationError",
    "PrintAreaError",
    "process_excel",
    "ExtractionMode",
    "CellRow",
    "Shape",
    "ChartSeries",
    "Chart",
    "SheetData",
    "WorkbookData",
    "PrintArea",
    "PrintAreaView",
    "set_table_detection_params",
    "extract_workbook",
    "ExStructEngine",
    "StructOptions",
    "OutputOptions",
    "FilterOptions",
    "DestinationOptions",
    "serialize_workbook",
]
82
+
83
+
84
+ ExtractionMode = Literal["light", "standard", "verbose"]
85
+
86
+
87
def extract(file_path: str | Path, mode: ExtractionMode = "standard") -> WorkbookData:
    """
    Read an Excel workbook and return it as WorkbookData.

    Args:
        file_path: Path to .xlsx/.xlsm/.xls.
        mode: "light" / "standard" / "verbose"
            - light: cells + table detection only (no COM, shapes/charts empty). Print areas via openpyxl.
            - standard: texted shapes + arrows + charts (COM if available), print areas included. Shape/chart size is kept but hidden by default in output.
            - verbose: all shapes (including textless) with size, charts with size.

    Returns:
        WorkbookData containing sheets, rows, shapes, charts, and print areas.

    Raises:
        ValueError: Documented for an invalid mode. No check is performed in
            this function itself, so it is presumably raised by the options
            or engine layer (confirm there).

    Examples:
        Extract with hyperlinks (verbose) and inspect table candidates:

        >>> from exstruct import extract
        >>> wb = extract("input.xlsx", mode="verbose")
        >>> wb.sheets["Sheet1"].table_candidates
        ['A1:B5']
    """
    struct_options = StructOptions(
        mode=mode,
        # Cell hyperlinks are requested only for the most detailed mode.
        include_cell_links=(mode == "verbose"),
    )
    return ExStructEngine(options=struct_options).extract(file_path, mode=mode)
117
+
118
+
119
def export(
    data: WorkbookData,
    path: str | Path,
    fmt: Literal["json", "yaml", "yml", "toon"] | None = None,
    *,
    pretty: bool = False,
    indent: int | None = None,
) -> None:
    """
    Write WorkbookData to a single file, choosing the writer by format.

    The format is resolved in this order: the explicit ``fmt`` argument, then
    the file extension of ``path`` (without the leading dot), then "json".
    The resolved name is lower-cased before it is matched.

    Args:
        data: WorkbookData from `extract` or similar.
        path: Destination path; its extension is used when ``fmt`` is not given.
        fmt: Explicit format (json/yaml/yml/toon).
        pretty: Pretty-print JSON (forwarded to the JSON writer only).
        indent: JSON indent width (forwarded to the JSON writer only).

    Raises:
        ValueError: If the resolved format is not json, yaml, yml or toon.

    Examples:
        Write pretty JSON and YAML (requires pyyaml):

        >>> from exstruct import export, extract
        >>> wb = extract("input.xlsx")
        >>> export(wb, "out.json", pretty=True)
        >>> export(wb, "out.yaml", fmt="yaml")  # doctest: +SKIP
    """
    dest = Path(path)
    chosen = fmt or dest.suffix.lstrip(".") or "json"
    chosen = chosen.lower()

    if chosen == "json":
        save_as_json(data, dest, pretty=pretty, indent=indent)
    elif chosen in ("yaml", "yml"):
        save_as_yaml(data, dest)
    elif chosen == "toon":
        save_as_toon(data, dest)
    else:
        raise ValueError(f"Unsupported export format: {chosen}")
159
+
160
+
161
def export_sheets(data: WorkbookData, dir_path: str | Path) -> dict[str, Path]:
    """
    Write one JSON file per sheet into a directory.

    - Payload: {book_name, sheet_name, sheet: SheetData}
    - Returns: {sheet_name: Path}

    Args:
        data: WorkbookData to split by sheet.
        dir_path: Output directory.

    Returns:
        Mapping from sheet name to written JSON path.

    Examples:
        >>> from exstruct import export_sheets, extract
        >>> wb = extract("input.xlsx")
        >>> paths = export_sheets(wb, "out_sheets")
        >>> "Sheet1" in paths
        True
    """
    out_dir = Path(dir_path)
    # The format is fixed to JSON here.
    return save_sheets(data, out_dir, fmt="json")
183
+
184
+
185
def export_sheets_as(
    data: WorkbookData,
    dir_path: str | Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
) -> dict[str, Path]:
    """
    Write one file per sheet in the requested format (json/yaml/toon).

    Args:
        data: WorkbookData to split by sheet.
        dir_path: Output directory.
        fmt: Output format; defaults to json.
        pretty: Pretty-print JSON.
        indent: JSON indent width (defaults to 2 when pretty=True and indent is None).

    Returns:
        Mapping from sheet name to written file path.

    Raises:
        ValueError: If an unsupported format is passed.

    Examples:
        Export per sheet as YAML (requires pyyaml):

        >>> from exstruct import export_sheets_as, extract
        >>> wb = extract("input.xlsx")
        >>> _ = export_sheets_as(wb, "out_yaml", fmt="yaml")  # doctest: +SKIP
    """
    out_dir = Path(dir_path)
    return save_sheets(data, out_dir, fmt=fmt, pretty=pretty, indent=indent)
217
+
218
+
219
def export_print_areas_as(
    data: WorkbookData,
    dir_path: str | Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
) -> dict[str, Path]:
    """
    Write each print area of the workbook as a PrintAreaView file.

    Args:
        data: WorkbookData that contains print areas.
        dir_path: Output directory.
        fmt: json/yaml/yml/toon.
        pretty: Pretty-print JSON output.
        indent: JSON indent width (defaults to 2 when pretty is True and indent is None).
        normalize: Rebase row/col indices to the print-area origin when True.

    Returns:
        dict mapping area key to path (e.g., "Sheet1#1": /.../Sheet1_area1_...json)

    Examples:
        Export print areas when present:

        >>> from exstruct import export_print_areas_as, extract
        >>> wb = extract("input.xlsx", mode="standard")
        >>> paths = export_print_areas_as(wb, "areas")
        >>> isinstance(paths, dict)
        True
    """
    out_dir = Path(dir_path)
    written = save_print_area_views(
        data, out_dir, fmt=fmt, pretty=pretty, indent=indent, normalize=normalize
    )
    return written
259
+
260
+
261
def export_auto_page_breaks(
    data: WorkbookData,
    dir_path: str | Path,
    fmt: Literal["json", "yaml", "yml", "toon"] = "json",
    *,
    pretty: bool = False,
    indent: int | None = None,
    normalize: bool = False,
) -> dict[str, Path]:
    """
    Write auto page-break areas (COM-computed) as PrintAreaView files.

    The export only runs when at least one sheet has a non-empty
    ``auto_print_areas``; otherwise a warning is logged and an error raised.

    Args:
        data: WorkbookData containing auto_print_areas (COM extraction with auto breaks enabled).
        dir_path: Output directory.
        fmt: json/yaml/yml/toon.
        pretty: Pretty-print JSON output.
        indent: JSON indent width (defaults to 2 when pretty is True and indent is None).
        normalize: Rebase row/col indices to the area origin when True.

    Returns:
        dict mapping area key to path (e.g., "Sheet1#1": /.../Sheet1_auto_page1_...json)

    Raises:
        PrintAreaError: If no auto page-break areas are present.

    Examples:
        >>> from exstruct import export_auto_page_breaks, extract
        >>> wb = extract("input.xlsx", mode="standard")
        >>> try:
        ...     export_auto_page_breaks(wb, "auto_areas")
        ... except PrintAreaError:
        ...     pass
    """
    has_auto_areas = False
    for sheet in data.sheets.values():
        if sheet.auto_print_areas:
            has_auto_areas = True
            break

    if has_auto_areas:
        return save_auto_page_break_views(
            data,
            Path(dir_path),
            fmt=fmt,
            pretty=pretty,
            indent=indent,
            normalize=normalize,
        )

    # Nothing to export: log the problem before raising it to the caller.
    message = "No auto page-break areas found. Enable COM-based auto page breaks before exporting."
    logger.warning(message)
    raise PrintAreaError(message)
307
+
308
+
309
def process_excel(
    file_path: str | Path,
    output_path: str | Path | None = None,
    out_fmt: str = "json",
    image: bool = False,
    pdf: bool = False,
    dpi: int = 72,
    mode: ExtractionMode = "standard",
    pretty: bool = False,
    indent: int | None = None,
    sheets_dir: str | Path | None = None,
    print_areas_dir: str | Path | None = None,
    auto_page_breaks_dir: str | Path | None = None,
    stream: TextIO | None = None,
) -> None:
    """
    Convenience wrapper: extract -> serialize (file or stdout) -> optional PDF/PNG.

    Builds an ExStructEngine whose output options mirror the arguments, then
    hands the same arguments to ``engine.process``.

    Args:
        file_path: Input Excel workbook (path string or Path).
        output_path: None for stdout; otherwise, write to file (string or Path).
        out_fmt: json/yaml/yml/toon.
        image: True to also output PNGs (requires Excel + COM + pypdfium2).
        pdf: True to also output PDF (requires Excel + COM + pypdfium2).
        dpi: DPI for image output.
        mode: light/standard/verbose (same meaning as `extract`).
        pretty: Pretty-print JSON.
        indent: JSON indent width.
        sheets_dir: Directory to write per-sheet files (string or Path).
        print_areas_dir: Directory to write per-print-area files (string or Path).
        auto_page_breaks_dir: Directory to write per-auto-page-break files (COM only).
        stream: IO override when output_path is None.

    Raises:
        ValueError: If an unsupported format or mode is given.
        PrintAreaError: When exporting auto page breaks without available data.
        RenderError: When rendering fails (Excel/COM/pypdfium2 issues).

    Examples:
        Extract and write JSON to stdout, plus per-sheet files:

        >>> from pathlib import Path
        >>> from exstruct import process_excel
        >>> process_excel(Path("input.xlsx"), output_path=None, sheets_dir=Path("sheets"))

        Also render a PDF next to the serialized output (COM + Excel required):

        >>> process_excel(Path("input.xlsx"), output_path=Path("out.json"), pdf=True)  # doctest: +SKIP
    """
    struct_options = StructOptions(mode=mode)

    # Shape and chart sizes are emitted only in verbose mode.
    is_verbose = mode == "verbose"
    output_options = OutputOptions(
        fmt=out_fmt,
        pretty=pretty,
        indent=indent,
        sheets_dir=sheets_dir,
        print_areas_dir=print_areas_dir,
        auto_page_breaks_dir=auto_page_breaks_dir,
        # Light mode passes None instead of forcing print areas on.
        include_print_areas=None if mode == "light" else True,
        include_shape_size=is_verbose,
        include_chart_size=is_verbose,
        stream=stream,
    )

    engine = ExStructEngine(options=struct_options, output=output_options)
    engine.process(
        file_path=file_path,
        output_path=output_path,
        out_fmt=out_fmt,
        image=image,
        pdf=pdf,
        dpi=dpi,
        mode=mode,
        pretty=pretty,
        indent=indent,
        sheets_dir=sheets_dir,
        print_areas_dir=print_areas_dir,
        auto_page_breaks_dir=auto_page_breaks_dir,
        stream=stream,
    )
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import os
5
+ import sys
6
+
7
+ from pydantic import BaseModel, Field
8
+ import xlwings as xw
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class ComAvailability(BaseModel):
    """Result of checking whether Excel COM-dependent features can be used."""

    # Required flag; callers must always state the outcome explicitly.
    available: bool = Field(
        ...,
        description="True when Excel COM can be used from this environment.",
    )
    # Optional explanation; defaults to None when no reason is supplied.
    reason: str | None = Field(
        default=None,
        description="Reason COM features are unavailable.",
    )
22
+
23
+
24
def get_com_availability() -> ComAvailability:
    """Detect whether Excel COM is available for CLI features.

    Checks run in order: the ``SKIP_COM_TESTS`` environment variable (any
    non-empty value disables COM), the platform (must be ``win32``), and
    finally a trial launch of a hidden Excel instance through xlwings.

    Returns:
        ComAvailability describing whether COM features can be used.
    """
    # Cheap checks first, so Excel is only launched when it could work.
    if os.getenv("SKIP_COM_TESTS"):
        blocker = "SKIP_COM_TESTS is set."
    elif sys.platform != "win32":
        blocker = "Non-Windows platform."
    else:
        blocker = None
    if blocker is not None:
        return ComAvailability(available=False, reason=blocker)

    try:
        excel = xw.App(add_book=False, visible=False)
    except Exception as exc:
        failure = f"Excel COM is unavailable ({exc.__class__.__name__})."
        return ComAvailability(available=False, reason=failure)

    # The launch succeeded, so COM is usable; a failed shutdown of the probe
    # instance is only logged and does not change the result.
    try:
        excel.quit()
    except Exception:
        logger.warning("Failed to quit Excel during COM availability check.")

    return ComAvailability(available=True, reason=None)
exstruct/cli/main.py ADDED
@@ -0,0 +1,134 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from exstruct import process_excel
7
+ from exstruct.cli.availability import ComAvailability, get_com_availability
8
+
9
+
10
+ def _add_auto_page_breaks_argument(
11
+ parser: argparse.ArgumentParser, availability: ComAvailability
12
+ ) -> None:
13
+ """Add auto page-break export option when COM is available."""
14
+ if not availability.available:
15
+ return
16
+ parser.add_argument(
17
+ "--auto-page-breaks-dir",
18
+ type=Path,
19
+ help="Optional directory to write one file per auto page-break area (COM only).",
20
+ )
21
+
22
+
23
def build_parser(
    availability: ComAvailability | None = None,
) -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Args:
        availability: Optional COM availability for tests or overrides. When
            omitted, it is detected with ``get_com_availability()`` after the
            common options have been registered.

    Returns:
        Configured argument parser.
    """
    parser = argparse.ArgumentParser(
        description="Dev-only CLI stub for ExStruct extraction."
    )
    add = parser.add_argument

    add("input", type=Path, help="Excel file (.xlsx/.xlsm/.xls)")
    add("-o", "--output", type=Path, help="Output path. If omitted, writes to stdout.")
    add(
        "-f",
        "--format",
        default="json",
        choices=["json", "yaml", "yml", "toon"],
        help="Export format",
    )

    # Boolean rendering switches.
    for flag, flag_help in (
        ("--image", "(placeholder) Render PNG alongside JSON"),
        ("--pdf", "(placeholder) Render PDF alongside JSON"),
    ):
        add(flag, action="store_true", help=flag_help)

    add("--dpi", type=int, default=144, help="DPI for image rendering (placeholder)")
    add(
        "-m",
        "--mode",
        default="standard",
        choices=["light", "standard", "verbose"],
        help="Extraction detail level",
    )
    add(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output (indent=2). Default is compact JSON.",
    )

    # Optional per-item output directories.
    for flag, flag_help in (
        (
            "--sheets-dir",
            "Optional directory to write one file per sheet (format follows --format).",
        ),
        (
            "--print-areas-dir",
            "Optional directory to write one file per print area (format follows --format).",
        ),
    ):
        add(flag, type=Path, help=flag_help)

    # The COM-only option is added last, and only if COM is usable.
    if availability is None:
        availability = get_com_availability()
    _add_auto_page_breaks_argument(parser, availability)
    return parser
94
+
95
+
96
def main(argv: list[str] | None = None) -> int:
    """Run the CLI entrypoint.

    Parses the arguments, checks that the input workbook exists, and hands
    the parsed options to ``process_excel``.

    Args:
        argv: Optional argument list for testing.

    Returns:
        Exit code (0 for success, 1 for failure). A missing input file and
        any exception raised while processing both count as failures.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    input_path: Path = args.input
    if not input_path.exists():
        print(f"File not found: {input_path}", flush=True)
        # A missing input is a failure: report it through the exit code so
        # shells and CI pipelines do not treat the run as successful.
        return 1

    try:
        process_excel(
            file_path=input_path,
            output_path=args.output,
            out_fmt=args.format,
            image=args.image,
            pdf=args.pdf,
            dpi=args.dpi,
            mode=args.mode,
            pretty=args.pretty,
            sheets_dir=args.sheets_dir,
            print_areas_dir=args.print_areas_dir,
            # The option exists only when COM was detected at parser build time.
            auto_page_breaks_dir=getattr(args, "auto_page_breaks_dir", None),
        )
    except Exception as e:
        # Top-level CLI boundary: turn any failure into a message + exit code.
        print(f"Error: {e}", flush=True)
        return 1
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
File without changes