exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,749 @@
1
+ from __future__ import annotations
2
+
3
+ from collections.abc import Callable, Sequence
4
+ from dataclasses import dataclass, field
5
+ import logging
6
+ import os
7
+ from pathlib import Path
8
+ from typing import Literal
9
+
10
+ import xlwings as xw
11
+
12
+ from ..errors import FallbackReason
13
+ from ..models import (
14
+ Arrow,
15
+ CellRow,
16
+ Chart,
17
+ MergedCell,
18
+ PrintArea,
19
+ Shape,
20
+ SmartArt,
21
+ WorkbookData,
22
+ )
23
+ from .backends.com_backend import ComBackend
24
+ from .backends.openpyxl_backend import OpenpyxlBackend
25
+ from .cells import WorkbookColorsMap, detect_tables
26
+ from .charts import get_charts
27
+ from .logging_utils import log_fallback
28
+ from .modeling import SheetRawData, WorkbookRawData, build_workbook_data
29
+ from .shapes import get_shapes_with_position
30
+ from .workbook import xlwings_workbook
31
+
32
# Extraction depth selector accepted throughout the pipeline.
ExtractionMode = Literal["light", "standard", "verbose"]
# Per-sheet artifact containers: each maps a sheet name to the items
# extracted from that sheet.
CellData = dict[str, list[CellRow]]
PrintAreaData = dict[str, list[PrintArea]]
MergedCellData = dict[str, list[MergedCell]]
ShapeData = dict[str, list[Shape | Arrow | SmartArt]]
ChartData = dict[str, list[Chart]]

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
40
+
41
+
42
@dataclass(frozen=True)
class ExtractionInputs:
    """Frozen bundle of resolved options shared by all pipeline steps.

    Attributes:
        file_path: Location of the Excel workbook to read.
        mode: Extraction mode; one of light, standard or verbose.
        include_cell_links: True when cell hyperlinks should be extracted.
        include_print_areas: True when print areas should be extracted.
        include_auto_page_breaks: True when auto page breaks should be extracted.
        include_colors_map: True when the background colors map should be extracted.
        include_default_background: True when the default background color
            should be part of the colors map.
        ignore_colors: Color keys to leave out of the colors map, or None.
        include_merged_cells: True when merged cell ranges should be extracted.
    """

    # Field order is part of the positional constructor signature.
    file_path: Path
    mode: ExtractionMode
    include_cell_links: bool
    include_print_areas: bool
    include_auto_page_breaks: bool
    include_colors_map: bool
    include_default_background: bool
    ignore_colors: set[str] | None
    include_merged_cells: bool
67
+
68
+
69
@dataclass
class ExtractionArtifacts:
    """Mutable container that pipeline steps fill in as they run.

    Attributes:
        cell_data: Cell rows extracted for each sheet.
        print_area_data: Print areas extracted for each sheet.
        auto_page_break_data: Auto page-break areas extracted for each sheet.
        colors_map_data: Workbook colors map, or None when not extracted.
        shape_data: Shapes extracted for each sheet.
        chart_data: Charts extracted for each sheet.
        merged_cell_data: Merged cell ranges extracted for each sheet.
    """

    # Each mapping uses a default factory so instances never share a dict.
    cell_data: CellData = field(default_factory=dict)
    print_area_data: PrintAreaData = field(default_factory=dict)
    auto_page_break_data: PrintAreaData = field(default_factory=dict)
    colors_map_data: WorkbookColorsMap | None = None
    shape_data: ShapeData = field(default_factory=dict)
    chart_data: ChartData = field(default_factory=dict)
    merged_cell_data: MergedCellData = field(default_factory=dict)
90
+
91
+
92
# Signature of a step that runs without a COM workbook: it receives the
# resolved inputs and the artifacts container, and returns nothing.
ExtractionStep = Callable[[ExtractionInputs, ExtractionArtifacts], None]
# Signature of a COM step: same as above plus the xlwings workbook.
ComExtractionStep = Callable[[ExtractionInputs, ExtractionArtifacts, xw.Book], None]
94
+
95
+
96
@dataclass(frozen=True)
class PipelinePlan:
    """Frozen description of what one extraction run will execute.

    Attributes:
        pre_com_steps: Steps executed, in order, before any COM access.
        com_steps: Steps executed, in order, with a COM workbook.
        use_com: True when COM-based extraction should be attempted.
    """

    pre_com_steps: list[ExtractionStep]
    com_steps: list[ComExtractionStep]
    use_com: bool
109
+
110
+
111
@dataclass(frozen=True)
class StepConfig:
    """Frozen table entry describing one pre-COM pipeline step.

    Attributes:
        name: Human-readable step name, intended for debugging.
        step: The callable that performs the extraction.
        enabled: Predicate over the inputs deciding whether the step is included.
    """

    name: str
    step: ExtractionStep
    enabled: Callable[[ExtractionInputs], bool]
124
+
125
+
126
@dataclass(frozen=True)
class ComStepConfig:
    """Frozen table entry describing one COM pipeline step.

    Attributes:
        name: Human-readable step name, intended for debugging.
        step: The callable that performs the extraction with a COM workbook.
        enabled: Predicate over the inputs deciding whether the step is included.
    """

    name: str
    step: ComExtractionStep
    enabled: Callable[[ExtractionInputs], bool]
139
+
140
+
141
@dataclass
class PipelineState:
    """Mutable record of how a pipeline run progressed.

    Attributes:
        com_attempted: True once COM access has been attempted.
        com_succeeded: True once the COM steps completed successfully.
        fallback_reason: Reason code for a fallback, or None if none occurred.
    """

    # A fresh state starts with nothing attempted and no fallback recorded.
    com_attempted: bool = False
    com_succeeded: bool = False
    fallback_reason: FallbackReason | None = None
154
+
155
+
156
@dataclass(frozen=True)
class PipelineResult:
    """Frozen outcome of one pipeline run.

    Attributes:
        workbook: The extracted workbook data.
        artifacts: The artifacts gathered while extracting.
        state: The execution state recorded for the run.
    """

    workbook: WorkbookData
    artifacts: ExtractionArtifacts
    state: PipelineState
169
+
170
+
171
def resolve_extraction_inputs(
    file_path: str | Path,
    *,
    mode: ExtractionMode,
    include_cell_links: bool | None,
    include_print_areas: bool | None,
    include_auto_page_breaks: bool,
    include_colors_map: bool | None,
    include_default_background: bool,
    ignore_colors: set[str] | None,
    include_merged_cells: bool | None,
) -> ExtractionInputs:
    """Turn caller-supplied options into fully resolved pipeline inputs.

    Flags passed as None are replaced by mode-dependent defaults, and the
    color-related options are neutralized when the colors map is disabled.

    Args:
        file_path: Workbook path, as a string or a Path.
        mode: Extraction mode.
        include_cell_links: Hyperlink flag; None means "only in verbose mode".
        include_print_areas: Print-area flag; None means True.
        include_auto_page_breaks: Auto page-break flag, passed through as is.
        include_colors_map: Colors-map flag; None means "only in verbose mode".
        include_default_background: Default-background flag; forced to False
            when the colors map is disabled.
        ignore_colors: Colors to ignore; forced to None when the colors map is
            disabled, and replaced by an empty set when enabled but None.
        include_merged_cells: Merged-cell flag; None means "all modes but light".

    Returns:
        The resolved ExtractionInputs.

    Raises:
        ValueError: If ``mode`` is not light, standard or verbose.
    """
    if mode not in {"light", "standard", "verbose"}:
        raise ValueError(f"Unsupported mode: {mode}")

    verbose = mode == "verbose"
    workbook_path = file_path if isinstance(file_path, Path) else Path(file_path)

    # None means "use the default for this mode"; explicit values always win.
    cell_links = verbose if include_cell_links is None else include_cell_links
    print_areas = True if include_print_areas is None else include_print_areas
    colors_map = verbose if include_colors_map is None else include_colors_map

    # Color options only carry meaning while the colors map is enabled.
    if colors_map:
        default_background = include_default_background
        colors_to_ignore = set() if ignore_colors is None else ignore_colors
    else:
        default_background = False
        colors_to_ignore = None

    if include_merged_cells is None:
        merged_cells = mode != "light"
    else:
        merged_cells = include_merged_cells

    return ExtractionInputs(
        file_path=workbook_path,
        mode=mode,
        include_cell_links=cell_links,
        include_print_areas=print_areas,
        include_auto_page_breaks=include_auto_page_breaks,
        include_colors_map=colors_map,
        include_default_background=default_background,
        ignore_colors=colors_to_ignore,
        include_merged_cells=merged_cells,
    )
237
+
238
+
239
def build_pipeline_plan(inputs: ExtractionInputs) -> PipelinePlan:
    """Assemble the plan for one extraction run from resolved inputs.

    Args:
        inputs: Resolved pipeline inputs.

    Returns:
        A PipelinePlan holding the pre-COM steps, the COM steps and whether
        COM should be used at all (every mode except light).
    """
    steps_before_com = build_pre_com_pipeline(inputs)
    steps_with_com = build_com_pipeline(inputs)
    wants_com = inputs.mode != "light"
    return PipelinePlan(
        pre_com_steps=steps_before_com,
        com_steps=steps_with_com,
        use_com=wants_com,
    )
253
+
254
+
255
def build_pre_com_pipeline(inputs: ExtractionInputs) -> list[ExtractionStep]:
    """Select the steps that run before any COM/Excel access.

    Args:
        inputs: Pipeline inputs carrying the mode and the include flags.

    Returns:
        The enabled pre-COM extraction steps, in execution order.
    """

    def _always(_inputs: ExtractionInputs) -> bool:
        return True

    def _print_areas_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_print_areas

    def _colors_map_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_colors_map

    def _colors_map_wanted_and_com_skipped(_inputs: ExtractionInputs) -> bool:
        # Outside light mode the openpyxl colors map is only taken up front
        # when the SKIP_COM_TESTS environment variable is set.
        return _inputs.include_colors_map and bool(os.getenv("SKIP_COM_TESTS"))

    def _merged_cells_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_merged_cells

    cells = StepConfig(name="cells", step=step_extract_cells, enabled=_always)
    print_areas = StepConfig(
        name="print_areas_openpyxl",
        step=step_extract_print_areas_openpyxl,
        enabled=_print_areas_wanted,
    )
    colors_map = StepConfig(
        name="colors_map_openpyxl",
        step=step_extract_colors_map_openpyxl,
        enabled=_colors_map_wanted,
    )
    colors_map_if_skip_com = StepConfig(
        name="colors_map_openpyxl_if_skip_com",
        step=step_extract_colors_map_openpyxl,
        enabled=_colors_map_wanted_and_com_skipped,
    )
    merged_cells = StepConfig(
        name="merged_cells_openpyxl",
        step=step_extract_merged_cells_openpyxl,
        enabled=_merged_cells_wanted,
    )

    # "standard" and "verbose" share one step list; only "light" differs,
    # and only in how the colors-map step is gated.
    com_capable_steps = (cells, print_areas, colors_map_if_skip_com, merged_cells)
    steps_by_mode: dict[ExtractionMode, Sequence[StepConfig]] = {
        "light": (cells, print_areas, colors_map, merged_cells),
        "standard": com_capable_steps,
        "verbose": com_capable_steps,
    }
    return [
        entry.step for entry in steps_by_mode[inputs.mode] if entry.enabled(inputs)
    ]
339
+
340
+
341
def build_com_pipeline(inputs: ExtractionInputs) -> list[ComExtractionStep]:
    """Select the steps that need COM/Excel access.

    Args:
        inputs: Pipeline inputs carrying the mode and the include flags.

    Returns:
        The enabled COM extraction steps, in execution order; always empty
        in light mode.
    """
    if inputs.mode == "light":
        return []

    def _always(_inputs: ExtractionInputs) -> bool:
        return True

    def _print_areas_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_print_areas

    def _auto_page_breaks_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_auto_page_breaks

    def _colors_map_wanted(_inputs: ExtractionInputs) -> bool:
        return _inputs.include_colors_map

    candidates: Sequence[ComStepConfig] = (
        ComStepConfig(name="shapes_com", step=step_extract_shapes_com, enabled=_always),
        ComStepConfig(name="charts_com", step=step_extract_charts_com, enabled=_always),
        ComStepConfig(
            name="print_areas_com",
            step=step_extract_print_areas_com,
            enabled=_print_areas_wanted,
        ),
        ComStepConfig(
            name="auto_page_breaks_com",
            step=step_extract_auto_page_breaks_com,
            enabled=_auto_page_breaks_wanted,
        ),
        ComStepConfig(
            name="colors_map_com",
            step=step_extract_colors_map_com,
            enabled=_colors_map_wanted,
        ),
    )
    return [entry.step for entry in candidates if entry.enabled(inputs)]
384
+
385
+
386
def run_pipeline(
    steps: Sequence[ExtractionStep],
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
) -> ExtractionArtifacts:
    """Execute each step in sequence against one shared artifacts container.

    Args:
        steps: Extraction steps, in the order they must run.
        inputs: Pipeline inputs handed to every step.
        artifacts: Container the steps update in place.

    Returns:
        The same ``artifacts`` object, after all steps have run.
    """
    for run_step in steps:
        run_step(inputs, artifacts)
    return artifacts
404
+
405
+
406
def run_com_pipeline(
    steps: Sequence[ComExtractionStep],
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
    workbook: xw.Book,
) -> ExtractionArtifacts:
    """Execute each COM step in sequence against one shared artifacts container.

    Args:
        steps: COM extraction steps, in the order they must run.
        inputs: Pipeline inputs handed to every step.
        artifacts: Container the steps update in place.
        workbook: xlwings workbook handed to every step.

    Returns:
        The same ``artifacts`` object, after all COM steps have run.
    """
    for run_step in steps:
        run_step(inputs, artifacts, workbook)
    return artifacts
426
+
427
+
428
def step_extract_cells(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read cell rows through the openpyxl backend and store them.

    Hyperlinks are requested from the backend according to
    ``inputs.include_cell_links``.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``cell_data`` is replaced.
    """
    artifacts.cell_data = OpenpyxlBackend(inputs.file_path).extract_cells(
        include_links=inputs.include_cell_links
    )
439
+
440
+
441
def step_extract_print_areas_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read print areas through the openpyxl backend and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``print_area_data`` is replaced.
    """
    artifacts.print_area_data = OpenpyxlBackend(inputs.file_path).extract_print_areas()
452
+
453
+
454
def step_extract_colors_map_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read the colors map through the openpyxl backend and store it.

    Whatever the backend returns is stored as is; any failure handling
    happens inside the backend, not in this step.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``colors_map_data`` is replaced.
    """
    openpyxl_backend = OpenpyxlBackend(inputs.file_path)
    colors = openpyxl_backend.extract_colors_map(
        include_default_background=inputs.include_default_background,
        ignore_colors=inputs.ignore_colors,
    )
    artifacts.colors_map_data = colors
468
+
469
+
470
def step_extract_merged_cells_openpyxl(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts
) -> None:
    """Read merged cell ranges through the openpyxl backend and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``merged_cell_data`` is replaced.
    """
    artifacts.merged_cell_data = OpenpyxlBackend(
        inputs.file_path
    ).extract_merged_cells()
481
+
482
+
483
def step_extract_shapes_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read shapes from the COM workbook and store them.

    Args:
        inputs: Pipeline inputs; the mode is forwarded to the shape reader.
        artifacts: Container whose ``shape_data`` is replaced.
        workbook: xlwings workbook instance.
    """
    shapes_by_sheet = get_shapes_with_position(workbook, mode=inputs.mode)
    artifacts.shape_data = shapes_by_sheet
494
+
495
+
496
def step_extract_charts_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read the charts of every sheet from the COM workbook and store them.

    Args:
        inputs: Pipeline inputs; the mode is forwarded to the chart reader.
        artifacts: Container whose ``chart_data`` is replaced.
        workbook: xlwings workbook instance.
    """
    charts_by_sheet: ChartData = {}
    for worksheet in workbook.sheets:
        sheet_charts = get_charts(worksheet, mode=inputs.mode)
        charts_by_sheet[worksheet.name] = sheet_charts
    # Assigned only once every sheet has been read.
    artifacts.chart_data = charts_by_sheet
510
+
511
+
512
def step_extract_print_areas_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read print areas from COM, but only if none were collected earlier.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``print_area_data`` is filled when empty.
        workbook: xlwings workbook instance.
    """
    # Print areas that are already present take precedence over COM.
    if not artifacts.print_area_data:
        com_backend = ComBackend(workbook)
        artifacts.print_area_data = com_backend.extract_print_areas()
525
+
526
+
527
def step_extract_auto_page_breaks_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read auto page breaks from the COM workbook and store them.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``auto_page_break_data`` is replaced.
        workbook: xlwings workbook instance.
    """
    com_backend = ComBackend(workbook)
    artifacts.auto_page_break_data = com_backend.extract_auto_page_breaks()
538
+
539
+
540
def step_extract_colors_map_com(
    inputs: ExtractionInputs, artifacts: ExtractionArtifacts, workbook: xw.Book
) -> None:
    """Read the colors map from COM, using openpyxl as a last resort.

    A non-None COM result always replaces the stored colors map. If COM
    yields None, a previously stored map is kept; only when there is none
    is the openpyxl backend asked instead.

    Args:
        inputs: Pipeline inputs.
        artifacts: Container whose ``colors_map_data`` is updated.
        workbook: xlwings workbook instance.
    """
    colors_from_com = ComBackend(workbook).extract_colors_map(
        include_default_background=inputs.include_default_background,
        ignore_colors=inputs.ignore_colors,
    )
    if colors_from_com is None:
        if artifacts.colors_map_data is None:
            openpyxl_backend = OpenpyxlBackend(inputs.file_path)
            artifacts.colors_map_data = openpyxl_backend.extract_colors_map(
                include_default_background=inputs.include_default_background,
                ignore_colors=inputs.ignore_colors,
            )
    else:
        artifacts.colors_map_data = colors_from_com
564
+
565
+
566
+ def _resolve_sheet_colors_map(
567
+ colors_map_data: WorkbookColorsMap | None, sheet_name: str
568
+ ) -> dict[str, list[tuple[int, int]]]:
569
+ """Resolve colors_map for a single sheet.
570
+
571
+ Args:
572
+ colors_map_data: Optional workbook colors map container.
573
+ sheet_name: Target sheet name.
574
+
575
+ Returns:
576
+ colors_map dictionary for the sheet, or empty dict if unavailable.
577
+ """
578
+ if not colors_map_data:
579
+ return {}
580
+ sheet_colors = colors_map_data.get_sheet(sheet_name)
581
+ if sheet_colors is None:
582
+ return {}
583
+ return sheet_colors.colors_map
584
+
585
+
586
def collect_sheet_raw_data(
    *,
    cell_data: CellData,
    shape_data: ShapeData,
    chart_data: ChartData,
    merged_cell_data: MergedCellData,
    workbook: xw.Book,
    mode: ExtractionMode = "standard",
    print_area_data: PrintAreaData | None = None,
    auto_page_break_data: PrintAreaData | None = None,
    colors_map_data: WorkbookColorsMap | None = None,
) -> dict[str, SheetRawData]:
    """Merge the per-sheet artifacts into one SheetRawData per sheet.

    Only sheets present in ``cell_data`` are emitted; artifacts missing for
    a sheet default to empty collections.

    Args:
        cell_data: Cell rows extracted for each sheet.
        shape_data: Shapes extracted for each sheet.
        chart_data: Charts extracted for each sheet (ignored in light mode).
        merged_cell_data: Merged cells extracted for each sheet.
        workbook: xlwings workbook used to run table detection per sheet.
        mode: Extraction mode.
        print_area_data: Print areas per sheet, if any.
        auto_page_break_data: Auto page-break areas per sheet, if any.
        colors_map_data: Colors map container, if any.

    Returns:
        Mapping of sheet name to the raw data gathered for that sheet.
    """
    # A missing or empty mapping behaves like "no entry for any sheet".
    print_areas_by_sheet = print_area_data or {}
    auto_areas_by_sheet = auto_page_break_data or {}
    charts_enabled = mode != "light"

    collected: dict[str, SheetRawData] = {}
    for sheet_name, rows in cell_data.items():
        sheet = workbook.sheets[sheet_name]
        shapes = shape_data.get(sheet_name, [])
        charts = chart_data.get(sheet_name, []) if charts_enabled else []
        tables = detect_tables(sheet)
        collected[sheet_name] = SheetRawData(
            rows=rows,
            shapes=shapes,
            charts=charts,
            table_candidates=tables,
            print_areas=print_areas_by_sheet.get(sheet_name, []),
            auto_print_areas=auto_areas_by_sheet.get(sheet_name, []),
            colors_map=_resolve_sheet_colors_map(colors_map_data, sheet_name),
            merged_cells=merged_cell_data.get(sheet_name, []),
        )
    return collected
631
+
632
+
633
def run_extraction_pipeline(inputs: ExtractionInputs) -> PipelineResult:
    """Run the full extraction pipeline and return the result.

    The pre-COM steps always run first. COM extraction is then attempted
    unless the plan disables it (light mode) or the SKIP_COM_TESTS
    environment variable is set. Whenever COM is skipped, unavailable, or
    fails, the result is rebuilt from the artifacts collected so far via
    build_cells_tables_workbook, and the reason is recorded in the state.

    Args:
        inputs: Resolved pipeline inputs.

    Returns:
        PipelineResult with workbook data, artifacts, and execution state.
        ``state.com_succeeded`` is True only when the returned workbook was
        actually produced by the COM path.
    """
    plan = build_pipeline_plan(inputs)
    artifacts = run_pipeline(plan.pre_com_steps, inputs, ExtractionArtifacts())
    state = PipelineState()

    def _fallback(message: str, reason: FallbackReason) -> PipelineResult:
        # A fallback result must never report COM success, even when the
        # failure happened late (for example while leaving the workbook
        # context after the COM result had already been built).
        state.com_succeeded = False
        state.fallback_reason = reason
        log_fallback(logger, reason, message)
        fallback_workbook = build_cells_tables_workbook(
            inputs=inputs,
            artifacts=artifacts,
            reason=message,
        )
        return PipelineResult(
            workbook=fallback_workbook, artifacts=artifacts, state=state
        )

    if not plan.use_com:
        return _fallback("Light mode selected.", FallbackReason.LIGHT_MODE)

    if os.getenv("SKIP_COM_TESTS"):
        return _fallback(
            "SKIP_COM_TESTS is set; skipping COM/xlwings access.",
            FallbackReason.SKIP_COM_TESTS,
        )

    try:
        with xlwings_workbook(inputs.file_path) as workbook:
            state.com_attempted = True
            try:
                run_com_pipeline(plan.com_steps, inputs, artifacts, workbook)
                raw_sheets = collect_sheet_raw_data(
                    cell_data=artifacts.cell_data,
                    shape_data=artifacts.shape_data,
                    chart_data=artifacts.chart_data,
                    merged_cell_data=artifacts.merged_cell_data,
                    workbook=workbook,
                    mode=inputs.mode,
                    print_area_data=artifacts.print_area_data
                    if inputs.include_print_areas
                    else None,
                    auto_page_break_data=artifacts.auto_page_break_data
                    if inputs.include_auto_page_breaks
                    else None,
                    colors_map_data=artifacts.colors_map_data,
                )
                raw_workbook = WorkbookRawData(
                    book_name=inputs.file_path.name, sheets=raw_sheets
                )
                # Build the final model before marking success, so a failure
                # here cannot leave com_succeeded set on a fallback result.
                workbook_data = build_workbook_data(raw_workbook)
            except Exception as exc:
                # Any failure while the workbook is open degrades to the
                # fallback workbook instead of propagating.
                return _fallback(
                    f"COM pipeline failed ({exc!r}).",
                    FallbackReason.COM_PIPELINE_FAILED,
                )
            state.com_succeeded = True
            return PipelineResult(
                workbook=workbook_data,
                artifacts=artifacts,
                state=state,
            )
    except Exception as exc:
        # Reached when the workbook context cannot be entered or left
        # cleanly, or when building the inner fallback itself raised.
        return _fallback(
            f"xlwings/Excel COM is unavailable. ({exc!r})",
            FallbackReason.COM_UNAVAILABLE,
        )
704
+
705
+
706
def build_cells_tables_workbook(
    *,
    inputs: ExtractionInputs,
    artifacts: ExtractionArtifacts,
    reason: str,
) -> WorkbookData:
    """Build the fallback WorkbookData from cells and detected tables.

    Shapes, charts and auto page-break areas are always empty here. The
    colors map is taken from the artifacts, or extracted through openpyxl
    when it is wanted but not yet available.

    Args:
        inputs: Pipeline inputs.
        artifacts: Artifacts collected by the extraction steps so far.
        reason: Why the fallback is being built; only written to the debug log.

    Returns:
        WorkbookData assembled from the cell rows and table candidates.
    """
    logger.debug("Building fallback workbook: %s", reason)
    backend = OpenpyxlBackend(inputs.file_path)

    colors_map_data = artifacts.colors_map_data
    if inputs.include_colors_map and colors_map_data is None:
        colors_map_data = backend.extract_colors_map(
            include_default_background=inputs.include_default_background,
            ignore_colors=inputs.ignore_colors,
        )

    fallback_sheets: dict[str, SheetRawData] = {}
    for sheet_name, rows in artifacts.cell_data.items():
        if colors_map_data:
            sheet_colors = colors_map_data.get_sheet(sheet_name)
        else:
            sheet_colors = None
        table_candidates = backend.detect_tables(sheet_name)
        if inputs.include_print_areas:
            print_areas = artifacts.print_area_data.get(sheet_name, [])
        else:
            print_areas = []
        colors_map = sheet_colors.colors_map if sheet_colors else {}
        fallback_sheets[sheet_name] = SheetRawData(
            rows=rows,
            shapes=[],
            charts=[],
            table_candidates=table_candidates,
            print_areas=print_areas,
            auto_print_areas=[],
            colors_map=colors_map,
            merged_cells=artifacts.merged_cell_data.get(sheet_name, []),
        )

    raw_workbook = WorkbookRawData(
        book_name=inputs.file_path.name, sheets=fallback_sheets
    )
    return build_workbook_data(raw_workbook)