exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +23 -12
- exstruct/cli/main.py +20 -0
- exstruct/core/backends/__init__.py +7 -0
- exstruct/core/backends/base.py +42 -0
- exstruct/core/backends/com_backend.py +230 -0
- exstruct/core/backends/openpyxl_backend.py +191 -0
- exstruct/core/cells.py +999 -483
- exstruct/core/charts.py +243 -241
- exstruct/core/integrate.py +42 -375
- exstruct/core/logging_utils.py +16 -0
- exstruct/core/modeling.py +87 -0
- exstruct/core/pipeline.py +749 -0
- exstruct/core/ranges.py +48 -0
- exstruct/core/shapes.py +282 -36
- exstruct/core/workbook.py +114 -0
- exstruct/engine.py +51 -123
- exstruct/errors.py +12 -1
- exstruct/io/__init__.py +130 -138
- exstruct/io/serialize.py +112 -0
- exstruct/models/__init__.py +58 -8
- exstruct/render/__init__.py +3 -7
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/METADATA +133 -18
- exstruct-0.3.2.dist-info/RECORD +30 -0
- exstruct-0.2.80.dist-info/RECORD +0 -20
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/WHEEL +0 -0
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,749 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections.abc import Callable, Sequence
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Literal
|
|
9
|
+
|
|
10
|
+
import xlwings as xw
|
|
11
|
+
|
|
12
|
+
from ..errors import FallbackReason
|
|
13
|
+
from ..models import (
|
|
14
|
+
Arrow,
|
|
15
|
+
CellRow,
|
|
16
|
+
Chart,
|
|
17
|
+
MergedCell,
|
|
18
|
+
PrintArea,
|
|
19
|
+
Shape,
|
|
20
|
+
SmartArt,
|
|
21
|
+
WorkbookData,
|
|
22
|
+
)
|
|
23
|
+
from .backends.com_backend import ComBackend
|
|
24
|
+
from .backends.openpyxl_backend import OpenpyxlBackend
|
|
25
|
+
from .cells import WorkbookColorsMap, detect_tables
|
|
26
|
+
from .charts import get_charts
|
|
27
|
+
from .logging_utils import log_fallback
|
|
28
|
+
from .modeling import SheetRawData, WorkbookRawData, build_workbook_data
|
|
29
|
+
from .shapes import get_shapes_with_position
|
|
30
|
+
from .workbook import xlwings_workbook
|
|
31
|
+
|
|
32
|
+
# Closed set of extraction modes accepted by the pipeline; "light" is the one
# mode for which no COM steps are planned (see build_pipeline_plan).
ExtractionMode = Literal["light", "standard", "verbose"]
|
|
33
|
+
# Cell rows grouped by sheet name.
CellData = dict[str, list[CellRow]]
|
|
34
|
+
# Print areas grouped by sheet name (also used for auto page-break areas).
PrintAreaData = dict[str, list[PrintArea]]
|
|
35
|
+
# Merged cell ranges grouped by sheet name.
MergedCellData = dict[str, list[MergedCell]]
|
|
36
|
+
# Shapes, arrows and SmartArt objects grouped by sheet name.
ShapeData = dict[str, list[Shape | Arrow | SmartArt]]
|
|
37
|
+
# Charts grouped by sheet name.
ChartData = dict[str, list[Chart]]
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class ExtractionInputs:
    """Frozen bundle of options that every pipeline step receives.

    Instances are normally produced by ``resolve_extraction_inputs``, which
    turns optional (``None``) flags into concrete booleans.

    Attributes:
        file_path: Location of the Excel workbook to read.
        mode: Extraction mode: "light", "standard" or "verbose".
        include_cell_links: True when cell hyperlinks should be extracted.
        include_print_areas: True when print areas should be extracted.
        include_auto_page_breaks: True when auto page breaks should be
            extracted.
        include_colors_map: True when the background colors map should be
            extracted.
        include_default_background: True when the default background color
            should be part of the colors map.
        ignore_colors: Color keys to leave out of the colors map, or None.
        include_merged_cells: True when merged cell ranges should be
            extracted.
    """

    # Workbook location and extraction depth.
    file_path: Path
    mode: ExtractionMode

    # Feature switches consulted when the step lists are built.
    include_cell_links: bool
    include_print_areas: bool
    include_auto_page_breaks: bool
    include_colors_map: bool
    include_default_background: bool
    ignore_colors: set[str] | None
    include_merged_cells: bool
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class ExtractionArtifacts:
    """Mutable container that pipeline steps fill in as they run.

    Every per-sheet mapping starts out empty and is keyed by sheet name; the
    colors map starts out as None until a step provides one.

    Attributes:
        cell_data: Cell rows per sheet.
        print_area_data: Print areas per sheet.
        auto_page_break_data: Auto page-break areas per sheet.
        colors_map_data: Colors map for the workbook sheets, if extracted.
        shape_data: Shapes per sheet.
        chart_data: Charts per sheet.
        merged_cell_data: Merged cell ranges per sheet.
    """

    # default_factory gives each instance its own dict instead of a shared one.
    cell_data: CellData = field(default_factory=dict)
    print_area_data: PrintAreaData = field(default_factory=dict)
    auto_page_break_data: PrintAreaData = field(default_factory=dict)
    colors_map_data: WorkbookColorsMap | None = None
    shape_data: ShapeData = field(default_factory=dict)
    chart_data: ChartData = field(default_factory=dict)
    merged_cell_data: MergedCellData = field(default_factory=dict)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# A pre-COM step: receives the inputs and the artifacts container it updates.
ExtractionStep = Callable[[ExtractionInputs, ExtractionArtifacts], None]
|
|
93
|
+
# A COM step: same as ExtractionStep plus the open xlwings workbook.
ComExtractionStep = Callable[[ExtractionInputs, ExtractionArtifacts, xw.Book], None]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(frozen=True)
class PipelinePlan:
    """Frozen description of what a single extraction run will execute.

    Attributes:
        pre_com_steps: Steps executed, in order, before any COM access.
        com_steps: Steps executed, in order, against the COM workbook.
        use_com: True when COM-based extraction should be attempted.
    """

    pre_com_steps: list[ExtractionStep]
    com_steps: list[ComExtractionStep]
    use_com: bool
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@dataclass(frozen=True)
class StepConfig:
    """Frozen table entry describing one pre-COM pipeline step.

    Attributes:
        name: Label identifying the step, for debugging.
        step: Callable that performs the extraction.
        enabled: Predicate over the inputs; the step is included in the
            pipeline only when it returns a truthy value.
    """

    name: str
    step: ExtractionStep
    enabled: Callable[[ExtractionInputs], bool]
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass(frozen=True)
class ComStepConfig:
    """Frozen table entry describing one COM pipeline step.

    Attributes:
        name: Label identifying the step, for debugging.
        step: Callable that performs the extraction against the COM workbook.
        enabled: Predicate over the inputs; the step is included in the
            pipeline only when it returns a truthy value.
    """

    name: str
    step: ComExtractionStep
    enabled: Callable[[ExtractionInputs], bool]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
@dataclass
class PipelineState:
    """Mutable record of how a pipeline run progressed.

    Attributes:
        com_attempted: True once COM access has been attempted.
        com_succeeded: True once the COM steps completed successfully.
        fallback_reason: Reason code when a fallback was taken, else None.
    """

    # A fresh state describes a run in which nothing has happened yet.
    com_attempted: bool = False
    com_succeeded: bool = False
    fallback_reason: FallbackReason | None = None
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@dataclass(frozen=True)
class PipelineResult:
    """Frozen outcome of one pipeline run.

    Attributes:
        workbook: The extracted workbook data.
        artifacts: The artifacts collected while extracting.
        state: The execution state recorded for the run.
    """

    workbook: WorkbookData
    artifacts: ExtractionArtifacts
    state: PipelineState
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def resolve_extraction_inputs(
    file_path: str | Path,
    *,
    mode: ExtractionMode,
    include_cell_links: bool | None,
    include_print_areas: bool | None,
    include_auto_page_breaks: bool,
    include_colors_map: bool | None,
    include_default_background: bool,
    ignore_colors: set[str] | None,
    include_merged_cells: bool | None,
) -> ExtractionInputs:
    """Turn caller-supplied options into fully resolved pipeline inputs.

    Flags passed as None are replaced by their defaults, and the color
    options are neutralised when the colors map is disabled.

    Args:
        file_path: Workbook path, as a string or a Path.
        mode: Extraction mode.
        include_cell_links: Hyperlink switch; None enables it only in
            "verbose" mode.
        include_print_areas: Print-area switch; None means True.
        include_auto_page_breaks: Auto page-break switch, passed through.
        include_colors_map: Colors-map switch; None enables it only in
            "verbose" mode.
        include_default_background: Default-background switch; forced to
            False when the colors map is disabled.
        ignore_colors: Colors to ignore; forced to None when the colors map
            is disabled, and to an empty set when it is enabled but None was
            given.
        include_merged_cells: Merged-cell switch; None enables it in every
            mode except "light".

    Returns:
        The resolved ExtractionInputs.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    if mode not in {"light", "standard", "verbose"}:
        raise ValueError(f"Unsupported mode: {mode}")

    # Reuse an existing Path untouched; only convert other inputs.
    workbook_path = file_path
    if not isinstance(workbook_path, Path):
        workbook_path = Path(workbook_path)

    verbose = mode == "verbose"
    cell_links = verbose if include_cell_links is None else include_cell_links
    print_areas = True if include_print_areas is None else include_print_areas
    colors_map = verbose if include_colors_map is None else include_colors_map
    merged_cells = (
        mode != "light" if include_merged_cells is None else include_merged_cells
    )

    # The color options only carry meaning while the colors map is enabled.
    if colors_map:
        default_background = include_default_background
        colors_to_ignore = set() if ignore_colors is None else ignore_colors
    else:
        default_background = False
        colors_to_ignore = None

    return ExtractionInputs(
        file_path=workbook_path,
        mode=mode,
        include_cell_links=cell_links,
        include_print_areas=print_areas,
        include_auto_page_breaks=include_auto_page_breaks,
        include_colors_map=colors_map,
        include_default_background=default_background,
        ignore_colors=colors_to_ignore,
        include_merged_cells=merged_cells,
    )
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan:
    """Assemble the execution plan for the given resolved inputs.

    Args:
        inputs: Resolved pipeline inputs.

    Returns:
        A PipelinePlan holding the pre-COM steps, the COM steps, and a flag
        that is True for every mode except "light".
    """
    pre_com_steps = build_pre_com_pipeline(inputs)
    com_steps = build_com_pipeline(inputs)
    wants_com = inputs.mode != "light"
    return PipelinePlan(
        pre_com_steps=pre_com_steps,
        com_steps=com_steps,
        use_com=wants_com,
    )
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def build_pre_com_pipeline(inputs: ExtractionInputs) -> list[ExtractionStep]:
    """Select the steps that run before any COM/Excel access.

    The "standard" and "verbose" modes share one step table. It differs from
    the "light" table only in the colors-map entry, which is enabled just
    when the SKIP_COM_TESTS environment variable is set to a non-empty value.

    Args:
        inputs: Pipeline inputs describing the extraction flags.

    Returns:
        The enabled extraction steps, in execution order.
    """

    def _always(opts: ExtractionInputs) -> bool:
        return True

    def _wants_print_areas(opts: ExtractionInputs) -> bool:
        return opts.include_print_areas

    def _wants_colors_map(opts: ExtractionInputs) -> bool:
        return opts.include_colors_map

    def _wants_colors_map_without_com(opts: ExtractionInputs) -> bool:
        return opts.include_colors_map and bool(os.getenv("SKIP_COM_TESTS"))

    def _wants_merged_cells(opts: ExtractionInputs) -> bool:
        return opts.include_merged_cells

    # Entries common to every mode.
    cells = StepConfig(name="cells", step=step_extract_cells, enabled=_always)
    print_areas = StepConfig(
        name="print_areas_openpyxl",
        step=step_extract_print_areas_openpyxl,
        enabled=_wants_print_areas,
    )
    merged_cells = StepConfig(
        name="merged_cells_openpyxl",
        step=step_extract_merged_cells_openpyxl,
        enabled=_wants_merged_cells,
    )

    light_table: Sequence[StepConfig] = (
        cells,
        print_areas,
        StepConfig(
            name="colors_map_openpyxl",
            step=step_extract_colors_map_openpyxl,
            enabled=_wants_colors_map,
        ),
        merged_cells,
    )
    com_mode_table: Sequence[StepConfig] = (
        cells,
        print_areas,
        StepConfig(
            name="colors_map_openpyxl_if_skip_com",
            step=step_extract_colors_map_openpyxl,
            enabled=_wants_colors_map_without_com,
        ),
        merged_cells,
    )
    tables: dict[ExtractionMode, Sequence[StepConfig]] = {
        "light": light_table,
        "standard": com_mode_table,
        "verbose": com_mode_table,
    }
    return [entry.step for entry in tables[inputs.mode] if entry.enabled(inputs)]
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def build_com_pipeline(inputs: ExtractionInputs) -> list[ComExtractionStep]:
    """Select the steps that need COM/Excel access.

    Args:
        inputs: Pipeline inputs describing the extraction flags.

    Returns:
        The enabled COM extraction steps, in execution order; always empty
        in "light" mode.
    """
    if inputs.mode == "light":
        return []

    candidates: Sequence[ComStepConfig] = (
        ComStepConfig(
            name="shapes_com",
            step=step_extract_shapes_com,
            enabled=lambda opts: True,
        ),
        ComStepConfig(
            name="charts_com",
            step=step_extract_charts_com,
            enabled=lambda opts: True,
        ),
        ComStepConfig(
            name="print_areas_com",
            step=step_extract_print_areas_com,
            enabled=lambda opts: opts.include_print_areas,
        ),
        ComStepConfig(
            name="auto_page_breaks_com",
            step=step_extract_auto_page_breaks_com,
            enabled=lambda opts: opts.include_auto_page_breaks,
        ),
        ComStepConfig(
            name="colors_map_com",
            step=step_extract_colors_map_com,
            enabled=lambda opts: opts.include_colors_map,
        ),
    )
    return [entry.step for entry in candidates if entry.enabled(inputs)]
|
|
384
|
+
|
|
385
|
+
|
|
386
|
+
def run_pipeline(
    steps: Sequence[ExtractionStep],
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
) -> ExtractionArtifacts:
    """Execute each step in sequence against one artifacts container.

    Args:
        steps: Extraction steps, in the order they should run.
        inputs: Pipeline inputs passed to every step.
        artifacts: Container that the steps update in place.

    Returns:
        The same ``artifacts`` object, after all steps have run.
    """
    for run_step in steps:
        run_step(inputs, artifacts)
    return artifacts
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def run_com_pipeline(
    steps: Sequence[ComExtractionStep],
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
    workbook: xw.Book,
) -> ExtractionArtifacts:
    """Execute each COM step in sequence against one artifacts container.

    Args:
        steps: COM extraction steps, in the order they should run.
        inputs: Pipeline inputs passed to every step.
        artifacts: Container that the steps update in place.
        workbook: xlwings workbook instance passed to every step.

    Returns:
        The same ``artifacts`` object, after all COM steps have run.
    """
    for run_step in steps:
        run_step(inputs, artifacts, workbook)
    return artifacts
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
def step_extract_cells(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read cell rows with the openpyxl backend and store them.

    Hyperlinks are requested from the backend according to
    ``inputs.include_cell_links``.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``cell_data`` is replaced.
    """
    artifacts.cell_data = OpenpyxlBackend(inputs.file_path).extract_cells(
        include_links=inputs.include_cell_links
    )
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
def step_extract_print_areas_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read print areas with the openpyxl backend and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``print_area_data`` is replaced.
    """
    artifacts.print_area_data = OpenpyxlBackend(
        inputs.file_path
    ).extract_print_areas()
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def step_extract_colors_map_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read the colors map with the openpyxl backend and store it.

    NOTE(review): the previous docstring said failures are logged and
    skipped. Nothing in this function catches exceptions, so any such
    handling would have to live in ``OpenpyxlBackend.extract_colors_map``;
    confirm against that backend.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``colors_map_data`` is replaced.
    """
    openpyxl_backend = OpenpyxlBackend(inputs.file_path)
    colors_map = openpyxl_backend.extract_colors_map(
        include_default_background=inputs.include_default_background,
        ignore_colors=inputs.ignore_colors,
    )
    artifacts.colors_map_data = colors_map
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def step_extract_merged_cells_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read merged cell ranges with the openpyxl backend and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``merged_cell_data`` is replaced.
    """
    artifacts.merged_cell_data = OpenpyxlBackend(
        inputs.file_path
    ).extract_merged_cells()
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
def step_extract_shapes_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read shapes from the COM workbook and store them.

    Args:
        inputs: Pipeline inputs; the mode is forwarded to the shape reader.
        artifacts: Container whose ``shape_data`` is replaced.
        workbook: xlwings workbook instance.
    """
    shapes_by_sheet = get_shapes_with_position(workbook, mode=inputs.mode)
    artifacts.shape_data = shapes_by_sheet
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def step_extract_charts_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read charts from every sheet of the COM workbook and store them.

    Args:
        inputs: Pipeline inputs; the mode is forwarded to the chart reader.
        artifacts: Container whose ``chart_data`` is replaced once all
            sheets have been read.
        workbook: xlwings workbook instance.
    """
    artifacts.chart_data = {
        sheet.name: get_charts(sheet, mode=inputs.mode) for sheet in workbook.sheets
    }
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def step_extract_print_areas_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read print areas through COM unless some were already collected.

    Print areas already present on the artifacts are left untouched.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``print_area_data`` is filled when empty.
        workbook: xlwings workbook instance.
    """
    if not artifacts.print_area_data:
        artifacts.print_area_data = ComBackend(workbook).extract_print_areas()
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
def step_extract_auto_page_breaks_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read auto page breaks through COM and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``auto_page_break_data`` is replaced.
        workbook: xlwings workbook instance.
    """
    com_backend = ComBackend(workbook)
    artifacts.auto_page_break_data = com_backend.extract_auto_page_breaks()
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def step_extract_colors_map_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read the colors map through COM, using openpyxl as a backstop.

    A non-None COM result always replaces the stored colors map. When COM
    returns None, openpyxl is consulted only if no colors map has been
    stored yet.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``colors_map_data`` is updated.
        workbook: xlwings workbook instance.
    """
    color_options = {
        "include_default_background": inputs.include_default_background,
        "ignore_colors": inputs.ignore_colors,
    }
    from_com = ComBackend(workbook).extract_colors_map(**color_options)
    if from_com is not None:
        artifacts.colors_map_data = from_com
    elif artifacts.colors_map_data is None:
        artifacts.colors_map_data = OpenpyxlBackend(
            inputs.file_path
        ).extract_colors_map(**color_options)
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def _resolve_sheet_colors_map(
|
|
567
|
+
colors_map_data: WorkbookColorsMap | None, sheet_name: str
|
|
568
|
+
) -> dict[str, list[tuple[int, int]]]:
|
|
569
|
+
"""Resolve colors_map for a single sheet.
|
|
570
|
+
|
|
571
|
+
Args:
|
|
572
|
+
colors_map_data: Optional workbook colors map container.
|
|
573
|
+
sheet_name: Target sheet name.
|
|
574
|
+
|
|
575
|
+
Returns:
|
|
576
|
+
colors_map dictionary for the sheet, or empty dict if unavailable.
|
|
577
|
+
"""
|
|
578
|
+
if not colors_map_data:
|
|
579
|
+
return {}
|
|
580
|
+
sheet_colors = colors_map_data.get_sheet(sheet_name)
|
|
581
|
+
if sheet_colors is None:
|
|
582
|
+
return {}
|
|
583
|
+
return sheet_colors.colors_map
|
|
584
|
+
|
|
585
|
+
|
|
586
|
+
def collect_sheet_raw_data(
    *,
    cell_data: CellData,
    shape_data: ShapeData,
    chart_data: ChartData,
    merged_cell_data: MergedCellData,
    workbook: xw.Book,
    mode: ExtractionMode = "standard",
    print_area_data: PrintAreaData | None = None,
    auto_page_break_data: PrintAreaData | None = None,
    colors_map_data: WorkbookColorsMap | None = None,
) -> dict[str, SheetRawData]:
    """Merge the per-sheet extraction artifacts into raw sheet records.

    One record is produced for each sheet present in ``cell_data``. Sheets
    missing from the other mappings get empty lists (or an empty colors
    map).

    Args:
        cell_data: Cell rows per sheet; drives which sheets are emitted.
        shape_data: Shapes per sheet.
        chart_data: Charts per sheet; ignored in "light" mode.
        merged_cell_data: Merged cells per sheet.
        workbook: xlwings workbook instance used for table detection.
        mode: Extraction mode.
        print_area_data: Print areas per sheet, if any.
        auto_page_break_data: Auto page-break areas per sheet, if any.
        colors_map_data: Colors map data, if any.

    Returns:
        Mapping from sheet name to its raw sheet data.
    """
    charts_enabled = mode != "light"
    # Treat a missing or empty optional mapping as "no entries".
    print_areas_by_sheet = print_area_data or {}
    auto_areas_by_sheet = auto_page_break_data or {}

    collected: dict[str, SheetRawData] = {}
    for sheet_name, rows in cell_data.items():
        com_sheet = workbook.sheets[sheet_name]
        collected[sheet_name] = SheetRawData(
            rows=rows,
            shapes=shape_data.get(sheet_name, []),
            charts=chart_data.get(sheet_name, []) if charts_enabled else [],
            table_candidates=detect_tables(com_sheet),
            print_areas=print_areas_by_sheet.get(sheet_name, []),
            auto_print_areas=auto_areas_by_sheet.get(sheet_name, []),
            colors_map=_resolve_sheet_colors_map(colors_map_data, sheet_name),
            merged_cells=merged_cell_data.get(sheet_name, []),
        )
    return collected
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
def run_extraction_pipeline(inputs: ExtractionInputs) -> PipelineResult:
    """Run the full extraction pipeline and return the result.

    The pre-COM steps always run first. COM extraction is then attempted
    unless the plan disables it ("light" mode) or the SKIP_COM_TESTS
    environment variable is set. Whenever COM is skipped or fails, a
    fallback workbook is built by ``build_cells_tables_workbook`` and the
    reason is recorded on the returned state.

    Args:
        inputs: Resolved pipeline inputs.

    Returns:
        PipelineResult with workbook data, artifacts, and execution state.
        ``state.com_succeeded`` is True only when the returned workbook was
        built from the COM pipeline.
    """
    plan = build_pipeline_plan(inputs)
    artifacts = run_pipeline(plan.pre_com_steps, inputs, ExtractionArtifacts())
    state = PipelineState()

    def _fallback(message: str, reason: FallbackReason) -> PipelineResult:
        # A fallback result never carries a COM-built workbook, so the
        # success flag must not survive a late failure (for example an
        # error raised while the workbook context manager exits).
        state.com_succeeded = False
        state.fallback_reason = reason
        log_fallback(logger, reason, message)
        fallback_workbook = build_cells_tables_workbook(
            inputs=inputs,
            artifacts=artifacts,
            reason=message,
        )
        return PipelineResult(
            workbook=fallback_workbook, artifacts=artifacts, state=state
        )

    if not plan.use_com:
        return _fallback("Light mode selected.", FallbackReason.LIGHT_MODE)

    if os.getenv("SKIP_COM_TESTS"):
        return _fallback(
            "SKIP_COM_TESTS is set; skipping COM/xlwings access.",
            FallbackReason.SKIP_COM_TESTS,
        )

    try:
        with xlwings_workbook(inputs.file_path) as workbook:
            # Set only once the workbook has been opened successfully.
            state.com_attempted = True
            try:
                run_com_pipeline(plan.com_steps, inputs, artifacts, workbook)
                raw_sheets = collect_sheet_raw_data(
                    cell_data=artifacts.cell_data,
                    shape_data=artifacts.shape_data,
                    chart_data=artifacts.chart_data,
                    merged_cell_data=artifacts.merged_cell_data,
                    workbook=workbook,
                    mode=inputs.mode,
                    print_area_data=artifacts.print_area_data
                    if inputs.include_print_areas
                    else None,
                    auto_page_break_data=artifacts.auto_page_break_data
                    if inputs.include_auto_page_breaks
                    else None,
                    colors_map_data=artifacts.colors_map_data,
                )
                raw_workbook = WorkbookRawData(
                    book_name=inputs.file_path.name, sheets=raw_sheets
                )
                # Build the model before reporting success, so a failure
                # here is handled as a pipeline failure with the flag unset.
                workbook_data = build_workbook_data(raw_workbook)
                state.com_succeeded = True
                return PipelineResult(
                    workbook=workbook_data,
                    artifacts=artifacts,
                    state=state,
                )
            except Exception as exc:
                return _fallback(
                    f"COM pipeline failed ({exc!r}).",
                    FallbackReason.COM_PIPELINE_FAILED,
                )
    except Exception as exc:
        # Reached when the workbook cannot be opened, and also when an error
        # escapes the block above (including errors raised on exit).
        return _fallback(
            f"xlwings/Excel COM is unavailable. ({exc!r})",
            FallbackReason.COM_UNAVAILABLE,
        )
|
|
704
|
+
|
|
705
|
+
|
|
706
|
+
def build_cells_tables_workbook(
    *,
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
    reason: str,
) -> WorkbookData:
    """Build the fallback WorkbookData from cells and detected tables.

    Shapes, charts and auto page-break areas are always empty in the
    result. The colors map is taken from the artifacts, or extracted with
    openpyxl when it was requested but not collected yet.

    Args:
        inputs: Pipeline inputs.
        artifacts: Artifacts collected by the extraction steps.
        reason: Fallback reason, written to the debug log.

    Returns:
        WorkbookData built from the cell rows and table candidates.
    """
    logger.debug("Building fallback workbook: %s", reason)
    openpyxl_backend = OpenpyxlBackend(inputs.file_path)

    workbook_colors = artifacts.colors_map_data
    if workbook_colors is None and inputs.include_colors_map:
        workbook_colors = openpyxl_backend.extract_colors_map(
            include_default_background=inputs.include_default_background,
            ignore_colors=inputs.ignore_colors,
        )

    raw_sheets: dict[str, SheetRawData] = {}
    for sheet_name, rows in artifacts.cell_data.items():
        if workbook_colors:
            sheet_colors = workbook_colors.get_sheet(sheet_name)
        else:
            sheet_colors = None
        table_candidates = openpyxl_backend.detect_tables(sheet_name)

        if inputs.include_print_areas:
            print_areas = artifacts.print_area_data.get(sheet_name, [])
        else:
            print_areas = []

        raw_sheets[sheet_name] = SheetRawData(
            rows=rows,
            shapes=[],
            charts=[],
            table_candidates=table_candidates,
            print_areas=print_areas,
            auto_print_areas=[],
            colors_map=sheet_colors.colors_map if sheet_colors else {},
            merged_cells=artifacts.merged_cell_data.get(sheet_name, []),
        )

    return build_workbook_data(
        WorkbookRawData(book_name=inputs.file_path.name, sheets=raw_sheets)
    )
|