exstruct 0.2.80__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +387 -0
- exstruct/cli/availability.py +49 -0
- exstruct/cli/main.py +134 -0
- exstruct/core/__init__.py +0 -0
- exstruct/core/cells.py +1039 -0
- exstruct/core/charts.py +241 -0
- exstruct/core/integrate.py +388 -0
- exstruct/core/shapes.py +275 -0
- exstruct/engine.py +643 -0
- exstruct/errors.py +35 -0
- exstruct/io/__init__.py +555 -0
- exstruct/models/__init__.py +335 -0
- exstruct/models/maps.py +335 -0
- exstruct/models/types.py +8 -0
- exstruct/py.typed +0 -0
- exstruct/render/__init__.py +118 -0
- exstruct-0.2.80.dist-info/METADATA +435 -0
- exstruct-0.2.80.dist-info/RECORD +20 -0
- exstruct-0.2.80.dist-info/WHEEL +4 -0
- exstruct-0.2.80.dist-info/entry_points.txt +3 -0
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
import shutil
|
|
6
|
+
import tempfile
|
|
7
|
+
from types import ModuleType
|
|
8
|
+
from typing import Any, cast
|
|
9
|
+
|
|
10
|
+
import xlwings as xw
|
|
11
|
+
|
|
12
|
+
from ..errors import MissingDependencyError, RenderError
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _require_excel_app() -> xw.App:
    """Start an Excel instance through xlwings and hand it back.

    The instance is created invisible and without an initial workbook.

    Returns:
        The newly created ``xw.App``.

    Raises:
        RenderError: If creating the app fails for any reason; the original
            exception is attached as the cause.
    """
    try:
        return xw.App(add_book=False, visible=False)
    except Exception as launch_error:
        message = (
            "Excel (COM) is not available. Rendering (PDF/image) requires a desktop Excel installation."
        )
        raise RenderError(message) from launch_error
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def export_pdf(excel_path: str | Path, output_pdf: str | Path) -> list[str]:
    """Export an Excel workbook to PDF via Excel COM and return sheet names in order.

    The workbook is copied into a temporary directory, the copy is opened in
    Excel and exported to PDF there, and the resulting PDF is copied to
    ``output_pdf``. Parent directories of ``output_pdf`` are created as needed.

    Args:
        excel_path: Workbook to render.
        output_pdf: Destination path of the PDF.

    Returns:
        The names of the workbook's sheets, in workbook order.

    Raises:
        RenderError: If Excel cannot be started, the export fails, or no file
            exists at ``output_pdf`` afterwards.
    """
    normalized_excel_path = Path(excel_path)
    normalized_output_pdf = Path(output_pdf)
    normalized_output_pdf.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        # Keep the source extension on the working copy: Excel decides how to
        # open a file from its extension, so renaming e.g. an .xlsm or .xls
        # workbook to .xlsx can make the open call fail.
        temp_book = temp_dir / f"book{normalized_excel_path.suffix or '.xlsx'}"
        temp_pdf = temp_dir / "book.pdf"
        shutil.copy(normalized_excel_path, temp_book)

        app: xw.App | None = None
        wb: xw.Book | None = None
        try:
            app = _require_excel_app()
            wb = app.books.open(str(temp_book))
            sheet_names = [s.name for s in wb.sheets]
            # 0 is XlFixedFormatType.xlTypePDF.
            wb.api.ExportAsFixedFormat(0, str(temp_pdf))
            shutil.copy(temp_pdf, normalized_output_pdf)
        except RenderError:
            raise
        except Exception as exc:
            raise RenderError(
                (
                    "Failed to export PDF for "
                    f"'{normalized_excel_path}' to '{normalized_output_pdf}'."
                )
            ) from exc
        finally:
            # Quit Excel even if closing the workbook raises; otherwise the
            # hidden Excel process would be left running.
            try:
                if wb is not None:
                    wb.close()
            finally:
                if app is not None:
                    app.quit()

    if not normalized_output_pdf.exists():
        raise RenderError(
            f"Failed to export PDF to '{normalized_output_pdf}'."
        )
    return sheet_names
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _require_pdfium() -> ModuleType:
    """Import pypdfium2 on demand and return the module.

    Returns:
        The imported ``pypdfium2`` module.

    Raises:
        MissingDependencyError: If ``pypdfium2`` cannot be imported; the
            message explains how to install it.
    """
    try:
        import pypdfium2 as pdfium_module
    except ImportError as import_error:
        install_hint = (
            "Image rendering requires pypdfium2. Install it via `pip install pypdfium2 pillow` or add the 'render' extra."
        )
        raise MissingDependencyError(install_hint) from import_error
    else:
        return cast(ModuleType, pdfium_module)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def export_sheet_images(
    excel_path: str | Path, output_dir: str | Path, dpi: int = 144
) -> list[Path]:
    """Render a workbook to one PNG per sheet and return the image paths.

    The workbook is exported to a temporary PDF with ``export_pdf``; PDF page
    ``i`` is then rasterized with pypdfium2 and saved as the image for sheet
    ``i``. Files are named ``NN_<sanitized sheet name>.png`` with a 1-based,
    zero-padded index.

    Args:
        excel_path: Workbook to render.
        output_dir: Directory receiving the PNG files; created if missing.
        dpi: Output resolution. Pages are rendered at ``dpi / 72`` scale and
            the same value is stored as the PNG's DPI.

    Returns:
        The written image paths, in sheet order.

    Raises:
        MissingDependencyError: If pypdfium2 is not installed.
        RenderError: If the PDF export or any rasterization step fails.
    """
    pdfium = cast(Any, _require_pdfium())
    source_path = Path(excel_path)
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    try:
        with tempfile.TemporaryDirectory() as scratch:
            pdf_path = Path(scratch) / "book.pdf"
            names = export_pdf(source_path, pdf_path)

            render_scale = dpi / 72.0
            image_paths: list[Path] = []
            with pdfium.PdfDocument(str(pdf_path)) as document:
                # NOTE(review): assumes the PDF holds exactly one page per
                # sheet, in sheet order -- confirm for multi-page or hidden
                # sheets.
                for index, raw_name in enumerate(names):
                    rendered = document[index].render(scale=render_scale).to_pil()
                    file_name = f"{index + 1:02d}_{_sanitize_sheet_filename(raw_name)}.png"
                    destination = target_dir / file_name
                    rendered.save(destination, format="PNG", dpi=(dpi, dpi))
                    image_paths.append(destination)
            return image_paths
    except RenderError:
        raise
    except Exception as failure:
        raise RenderError(
            f"Failed to export sheet images to '{target_dir}'."
        ) from failure
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _sanitize_sheet_filename(name: str) -> str:
|
|
115
|
+
return "".join("_" if c in '\\/:*?"<>|' else c for c in name).strip() or "sheet"
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
__all__ = ["export_pdf", "export_sheet_images"]
|
|
@@ -0,0 +1,435 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: exstruct
|
|
3
|
+
Version: 0.2.80
|
|
4
|
+
Summary: Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines
|
|
5
|
+
Keywords: excel,structure,data,exstruct
|
|
6
|
+
Author: harumiWeb
|
|
7
|
+
License: BSD 3-Clause License
|
|
8
|
+
|
|
9
|
+
Copyright (c) 2025, ExStruct Contributors
|
|
10
|
+
All rights reserved.
|
|
11
|
+
|
|
12
|
+
Redistribution and use in source and binary forms, with or without
|
|
13
|
+
modification, are permitted provided that the following conditions are met:
|
|
14
|
+
|
|
15
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
|
16
|
+
list of conditions and the following disclaimer.
|
|
17
|
+
|
|
18
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
19
|
+
this list of conditions and the following disclaimer in the documentation
|
|
20
|
+
and/or other materials provided with the distribution.
|
|
21
|
+
|
|
22
|
+
3. Neither the name of the copyright holder nor the names of its
|
|
23
|
+
contributors may be used to endorse or promote products derived from
|
|
24
|
+
this software without specific prior written permission.
|
|
25
|
+
|
|
26
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
27
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
28
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
29
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
30
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
31
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
32
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
33
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
34
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
35
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
36
|
+
Requires-Dist: numpy>=2.3.5
|
|
37
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
38
|
+
Requires-Dist: pandas>=2.3.3
|
|
39
|
+
Requires-Dist: pydantic>=2.12.5
|
|
40
|
+
Requires-Dist: scipy>=1.16.3
|
|
41
|
+
Requires-Dist: xlwings>=0.33.16
|
|
42
|
+
Requires-Dist: pypdfium2>=5.1.0 ; extra == 'render'
|
|
43
|
+
Requires-Dist: pillow>=12.0.0 ; extra == 'render'
|
|
44
|
+
Requires-Dist: python-toon>=0.1.3 ; extra == 'toon'
|
|
45
|
+
Requires-Dist: pyyaml>=6.0.3 ; extra == 'yaml'
|
|
46
|
+
Requires-Python: >=3.11
|
|
47
|
+
Project-URL: Documentation, https://harumiweb.github.io/exstruct/
|
|
48
|
+
Project-URL: Homepage, https://harumiweb.github.io/exstruct/
|
|
49
|
+
Project-URL: Issues, https://github.com/harumiWeb/exstruct/issues
|
|
50
|
+
Project-URL: Repository, https://github.com/harumiWeb/exstruct
|
|
51
|
+
Provides-Extra: render
|
|
52
|
+
Provides-Extra: toon
|
|
53
|
+
Provides-Extra: yaml
|
|
54
|
+
Description-Content-Type: text/markdown
|
|
55
|
+
|
|
56
|
+
# ExStruct — Excel Structured Extraction Engine
|
|
57
|
+
|
|
58
|
+
[PyPI version](https://pypi.org/project/exstruct/) [PyPI downloads](https://pepy.tech/projects/exstruct) [pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [Codacy grade](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
|
|
59
|
+
|
|
60
|
+

|
|
61
|
+
|
|
62
|
+
ExStruct reads Excel workbooks and outputs structured data (cells, table candidates, shapes, charts, print areas/views, auto page-break areas, hyperlinks) as JSON by default, with optional YAML/TOON formats. It targets both COM/Excel environments (rich extraction) and non-COM environments (cells + table candidates + print areas), with tunable detection heuristics and multiple output modes to fit LLM/RAG pipelines.
|
|
63
|
+
|
|
64
|
+
[ζ₯ζ¬ηREADME](README.ja.md)
|
|
65
|
+
|
|
66
|
+
## Features
|
|
67
|
+
|
|
68
|
+
- **Excel → Structured JSON**: cells, shapes, charts, table candidates, print areas/views, and auto page-break areas per sheet.
|
|
69
|
+
- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, print areas), `verbose` (all shapes with width/height, charts with size, print areas). Verbose also emits cell hyperlinks. Size output is flag-controlled.
|
|
70
|
+
- **Auto page-break export (COM only)**: capture Excel-computed auto page breaks and write per-area JSON/YAML/TOON when requested (CLI option appears only when COM is available).
|
|
71
|
+
- **Formats**: JSON (compact by default, `--pretty` available), YAML, TOON (optional dependencies).
|
|
72
|
+
- **Table detection tuning**: adjust heuristics at runtime via API.
|
|
73
|
+
- **CLI rendering** (Excel required): optional PDF and per-sheet PNGs.
|
|
74
|
+
- **Graceful fallback**: if Excel COM is unavailable, extraction falls back to cells + table candidates without crashing.
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install exstruct
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Optional extras:
|
|
83
|
+
|
|
84
|
+
- YAML: `pip install pyyaml`
|
|
85
|
+
- TOON: `pip install python-toon`
|
|
86
|
+
- Rendering (PDF/PNG): Excel + `pip install pypdfium2 pillow`
|
|
87
|
+
- All extras at once: `pip install exstruct[yaml,toon,render]`
|
|
88
|
+
|
|
89
|
+
Platform note:
|
|
90
|
+
|
|
91
|
+
- Full extraction (shapes/charts) targets Windows + Excel (COM via xlwings). On other platforms, use `mode=light` to get cells + `table_candidates`.
|
|
92
|
+
|
|
93
|
+
## Quick Start (CLI)
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
exstruct input.xlsx > output.json # compact JSON to stdout (default)
|
|
97
|
+
exstruct input.xlsx -o out.json --pretty # pretty JSON to a file
|
|
98
|
+
exstruct input.xlsx --format yaml # YAML (needs pyyaml)
|
|
99
|
+
exstruct input.xlsx --format toon # TOON (needs python-toon)
|
|
100
|
+
exstruct input.xlsx --sheets-dir sheets/ # split per sheet in chosen format
|
|
101
|
+
exstruct input.xlsx --print-areas-dir areas/ # split per print area (if any)
|
|
102
|
+
exstruct input.xlsx --auto-page-breaks-dir auto_areas/ # COM only; option appears when available
|
|
103
|
+
exstruct input.xlsx --mode light # cells + table candidates only
|
|
104
|
+
exstruct input.xlsx --pdf --image # PDF and PNGs (Excel required)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Auto page-break exports are available via API and CLI when Excel/COM is available; the CLI exposes `--auto-page-breaks-dir` only in COM-capable environments.
|
|
108
|
+
|
|
109
|
+
## Quick Start (Python)
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from pathlib import Path
|
|
113
|
+
from exstruct import extract, export, set_table_detection_params
|
|
114
|
+
|
|
115
|
+
# Tune table detection (optional)
|
|
116
|
+
set_table_detection_params(table_score_threshold=0.3, density_min=0.04)
|
|
117
|
+
|
|
118
|
+
# Extract with modes: "light", "standard", "verbose"
|
|
119
|
+
wb = extract("input.xlsx", mode="standard")
|
|
120
|
+
export(wb, Path("out.json"), pretty=False) # compact JSON
|
|
121
|
+
|
|
122
|
+
# Model helpers: iterate, index, and serialize directly
|
|
123
|
+
first_sheet = wb["Sheet1"] # __getitem__ access
|
|
124
|
+
for name, sheet in wb: # __iter__ yields (name, SheetData)
|
|
125
|
+
print(name, len(sheet.rows))
|
|
126
|
+
wb.save("out.json", pretty=True) # WorkbookData → file (by extension)
|
|
127
|
+
first_sheet.save("sheet.json") # SheetData → file (by extension)
|
|
128
|
+
print(first_sheet.to_yaml()) # YAML text (requires pyyaml)
|
|
129
|
+
|
|
130
|
+
# ExStructEngine: per-instance options (nested configs)
|
|
131
|
+
from exstruct import (
|
|
132
|
+
DestinationOptions,
|
|
133
|
+
ExStructEngine,
|
|
134
|
+
FilterOptions,
|
|
135
|
+
FormatOptions,
|
|
136
|
+
OutputOptions,
|
|
137
|
+
StructOptions,
|
|
138
|
+
export_auto_page_breaks,
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
engine = ExStructEngine(
|
|
142
|
+
options=StructOptions(mode="verbose"), # verbose includes hyperlinks by default
|
|
143
|
+
output=OutputOptions(
|
|
144
|
+
format=FormatOptions(pretty=True),
|
|
145
|
+
filters=FilterOptions(include_shapes=False), # drop shapes in output
|
|
146
|
+
destinations=DestinationOptions(sheets_dir=Path("out_sheets")), # also write per-sheet files
|
|
147
|
+
),
|
|
148
|
+
)
|
|
149
|
+
wb2 = engine.extract("input.xlsx")
|
|
150
|
+
engine.export(wb2, Path("out_filtered.json")) # drops shapes via filters
|
|
151
|
+
|
|
152
|
+
# Enable hyperlinks in other modes
|
|
153
|
+
engine_links = ExStructEngine(options=StructOptions(mode="standard", include_cell_links=True))
|
|
154
|
+
with_links = engine_links.extract("input.xlsx")
|
|
155
|
+
|
|
156
|
+
# Export per print area (if print areas exist)
|
|
157
|
+
from exstruct import export_print_areas_as
|
|
158
|
+
export_print_areas_as(wb, "areas", fmt="json", pretty=True)
|
|
159
|
+
|
|
160
|
+
# Auto page-break extraction/output (COM only; raises if no auto breaks exist)
|
|
161
|
+
engine_auto = ExStructEngine(
|
|
162
|
+
output=OutputOptions(
|
|
163
|
+
destinations=DestinationOptions(auto_page_breaks_dir=Path("auto_areas"))
|
|
164
|
+
)
|
|
165
|
+
)
|
|
166
|
+
wb_auto = engine_auto.extract("input.xlsx") # includes SheetData.auto_print_areas
|
|
167
|
+
engine_auto.export(wb_auto, Path("out_with_auto.json")) # also writes auto_areas/*
|
|
168
|
+
export_auto_page_breaks(wb_auto, "auto_areas", fmt="json", pretty=True) # manual writer
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Note (non-COM environments):** If Excel COM is unavailable, extraction still runs and returns cells + `table_candidates`; `shapes`/`charts` will be empty.
|
|
172
|
+
|
|
173
|
+
## Table Detection Tuning
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
from exstruct import set_table_detection_params
|
|
177
|
+
|
|
178
|
+
set_table_detection_params(
|
|
179
|
+
table_score_threshold=0.35, # increase to be stricter
|
|
180
|
+
density_min=0.05,
|
|
181
|
+
coverage_min=0.2,
|
|
182
|
+
min_nonempty_cells=3,
|
|
183
|
+
)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Use higher thresholds to reduce false positives; lower them if true tables are missed.
|
|
187
|
+
|
|
188
|
+
## Output Modes
|
|
189
|
+
|
|
190
|
+
- **light**: cells + table candidates (no COM needed).
|
|
191
|
+
- **standard**: texted shapes + arrows, charts (COM if available), table candidates. Hyperlinks are off unless `include_cell_links=True`.
|
|
192
|
+
- **verbose**: all shapes (with width/height), charts, table candidates, and cell hyperlinks.
|
|
193
|
+
|
|
194
|
+
## Error Handling / Fallbacks
|
|
195
|
+
|
|
196
|
+
- Excel COM unavailable → falls back to cells + table candidates; shapes/charts empty.
|
|
197
|
+
- Shape extraction failure → logs warning, still returns cells + table candidates.
|
|
198
|
+
- CLI prints errors to stdout/stderr and returns non-zero on failures.
|
|
199
|
+
|
|
200
|
+
## Optional Rendering
|
|
201
|
+
|
|
202
|
+
Requires Excel and `pypdfium2`.
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
exstruct input.xlsx --pdf --image --dpi 144
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Creates `<output>.pdf` and `<output>_images/` PNGs per sheet.
|
|
209
|
+
|
|
210
|
+
## Benchmark: Excel Structuring Demo
|
|
211
|
+
|
|
212
|
+
To show how well exstruct can structure Excel, we parse a workbook that combines three elements on one sheet and share an AI reasoning benchmark that uses the JSON output.
|
|
213
|
+
|
|
214
|
+
- Table (sales data)
|
|
215
|
+
- Line chart
|
|
216
|
+
- Flowchart built only with shapes
|
|
217
|
+
|
|
218
|
+
(Screenshot below is the actual sample Excel sheet)
|
|
219
|
+

|
|
220
|
+
Sample workbook: `sample/sample.xlsx`
|
|
221
|
+
Sample workbook: `sample/sample.xlsx`
|
|
222
|
+
|
|
223
|
+
### 1. Input: Excel Sheet Overview
|
|
224
|
+
|
|
225
|
+
This sample Excel contains:
|
|
226
|
+
|
|
227
|
+
### ① Table (Sales Data)
|
|
228
|
+
|
|
229
|
+
| Month | Product A | Product B | Product C |
|
|
230
|
+
| ------ | --------- | --------- | --------- |
|
|
231
|
+
| Jan-25 | 120 | 80 | 60 |
|
|
232
|
+
| Feb-25 | 135 | 90 | 64 |
|
|
233
|
+
| Mar-25 | 150 | 100 | 70 |
|
|
234
|
+
| Apr-25 | 170 | 110 | 72 |
|
|
235
|
+
| May-25 | 160 | 120 | 75 |
|
|
236
|
+
| Jun-25 | 180 | 130 | 80 |
|
|
237
|
+
|
|
238
|
+
### ② Chart (Line Chart)
|
|
239
|
+
|
|
240
|
+
- Title: Sales Data
|
|
241
|
+
- Series: Product A / Product B / Product C (six months)
|
|
242
|
+
- Y axis: 0–200
|
|
243
|
+
|
|
244
|
+
### ③ Flowchart built with shapes
|
|
245
|
+
|
|
246
|
+
The sheet includes this flow:
|
|
247
|
+
|
|
248
|
+
- Start / End
|
|
249
|
+
- Format check
|
|
250
|
+
- Loop (items remaining?)
|
|
251
|
+
- Error handling
|
|
252
|
+
- Yes/No decision for sending email
|
|
253
|
+
|
|
254
|
+
### 2. Output: Structured JSON produced by exstruct (excerpt)
|
|
255
|
+
|
|
256
|
+
Below is a **shortened JSON output example** from parsing this Excel workbook.
|
|
257
|
+
|
|
258
|
+
```json
|
|
259
|
+
{
|
|
260
|
+
"book_name": "sample.xlsx",
|
|
261
|
+
"sheets": {
|
|
262
|
+
"Sheet1": {
|
|
263
|
+
"rows": [
|
|
264
|
+
{
|
|
265
|
+
"r": 3,
|
|
266
|
+
"c": {
|
|
267
|
+
"1": "月",
|
|
268
|
+
"2": "製品A",
|
|
269
|
+
"3": "製品B",
|
|
270
|
+
"4": "製品C"
|
|
271
|
+
}
|
|
272
|
+
},
|
|
273
|
+
...
|
|
274
|
+
],
|
|
275
|
+
"shapes": [
|
|
276
|
+
{
|
|
277
|
+
"text": "開始",
|
|
278
|
+
"l": 148,
|
|
279
|
+
"t": 220,
|
|
280
|
+
"type": "AutoShape-FlowchartProcess"
|
|
281
|
+
},
|
|
282
|
+
{
|
|
283
|
+
"text": "入力データ読み込み",
|
|
284
|
+
"l": 132,
|
|
285
|
+
"t": 282,
|
|
286
|
+
"type": "AutoShape-FlowchartProcess"
|
|
287
|
+
},
|
|
288
|
+
{
|
|
289
|
+
"l": 193,
|
|
290
|
+
"t": 246,
|
|
291
|
+
"type": "AutoShape-Mixed",
|
|
292
|
+
"begin_arrow_style": 1,
|
|
293
|
+
"end_arrow_style": 2,
|
|
294
|
+
"direction": "N"
|
|
295
|
+
},
|
|
296
|
+
...
|
|
297
|
+
],
|
|
298
|
+
"charts": [
|
|
299
|
+
{
|
|
300
|
+
"name": "Chart 1",
|
|
301
|
+
"chart_type": "Line",
|
|
302
|
+
"title": "売上データ",
|
|
303
|
+
"y_axis_range": [
|
|
304
|
+
0.0,
|
|
305
|
+
200.0
|
|
306
|
+
],
|
|
307
|
+
"series": [
|
|
308
|
+
{
|
|
309
|
+
"name": "製品A",
|
|
310
|
+
"name_range": "Sheet1!$C$3",
|
|
311
|
+
"x_range": "Sheet1!$B$4:$B$9",
|
|
312
|
+
"y_range": "Sheet1!$C$4:$C$9"
|
|
313
|
+
},
|
|
314
|
+
...
|
|
315
|
+
],
|
|
316
|
+
"l": 377,
|
|
317
|
+
"t": 25
|
|
318
|
+
}
|
|
319
|
+
],
|
|
320
|
+
"table_candidates": [
|
|
321
|
+
"B3:E9"
|
|
322
|
+
]
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
}
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### 3. How AI (Copilot / LLM) interprets the JSON
|
|
329
|
+
|
|
330
|
+
````md
|
|
331
|
+
Below is the Markdown reconstruction of the Excel workbook. The table, chart, and flowchart are all represented.
|
|
332
|
+
|
|
333
|
+
---
|
|
334
|
+
|
|
335
|
+
## Sales Data Table
|
|
336
|
+
|
|
337
|
+
| Month | Product A | Product B | Product C |
|
|
338
|
+
| ---------- | --------- | --------- | --------- |
|
|
339
|
+
| 2025-01-01 | 120 | 80 | 60 |
|
|
340
|
+
| 2025-02-01 | 135 | 90 | 64 |
|
|
341
|
+
| 2025-03-01 | 150 | 100 | 70 |
|
|
342
|
+
| 2025-04-01 | 170 | 110 | 72 |
|
|
343
|
+
| 2025-05-01 | 160 | 120 | 75 |
|
|
344
|
+
| 2025-06-01 | 180 | 130 | 80 |
|
|
345
|
+
|
|
346
|
+
---
|
|
347
|
+
|
|
348
|
+
## Sales Data (Line Chart)
|
|
349
|
+
|
|
350
|
+
- Chart title: **売上データ (Sales Data)**
|
|
351
|
+
- Chart type: Line
|
|
352
|
+
- Y-axis range: 0 to 200
|
|
353
|
+
- Data series:
|
|
354
|
+
- Product A: 120 → 135 → 150 → 170 → 160 → 180
|
|
355
|
+
- Product B: 80 → 90 → 100 → 110 → 120 → 130
|
|
356
|
+
- Product C: 60 → 64 → 70 → 72 → 75 → 80
|
|
357
|
+
|
|
358
|
+
---
|
|
359
|
+
|
|
360
|
+
## Process Flow (Mermaid Flowchart)
|
|
361
|
+
|
|
362
|
+
```mermaid
|
|
363
|
+
flowchart TD
|
|
364
|
+
A[Start]
|
|
365
|
+
B[Load input data]
|
|
366
|
+
C{Is format valid?}
|
|
367
|
+
D[Show error]
|
|
368
|
+
E[Process one item]
|
|
369
|
+
F{Items remaining?}
|
|
370
|
+
G[Generate output]
|
|
371
|
+
H{Send email?}
|
|
372
|
+
I[Send email]
|
|
373
|
+
J[Finish]
|
|
374
|
+
|
|
375
|
+
A --> B
|
|
376
|
+
B --> C
|
|
377
|
+
C -- no --> D
|
|
378
|
+
C -- yes --> E
|
|
379
|
+
E --> F
|
|
380
|
+
F -- yes --> E
|
|
381
|
+
F -- no --> G
|
|
382
|
+
G --> H
|
|
383
|
+
H -- yes --> I
|
|
384
|
+
H -- no --> J
|
|
385
|
+
I --> J
|
|
386
|
+
```
|
|
387
|
+
````
|
|
388
|
+
|
|
389
|
+
From this we can see:
|
|
390
|
+
|
|
391
|
+
**exstruct's JSON is already in a format that AI can read and reason over directly.**
|
|
392
|
+
|
|
393
|
+
### 4. Summary
|
|
394
|
+
|
|
395
|
+
This benchmark confirms exstruct can:
|
|
396
|
+
|
|
397
|
+
- **Parse tables, charts, and shapes (flowcharts) simultaneously**
|
|
398
|
+
- Convert the semantic structure of Excel into JSON
|
|
399
|
+
- Let AI/LLMs read that JSON directly and reconstruct the workbook contents
|
|
400
|
+
|
|
401
|
+
In short, **exstruct = “an engine that converts Excel into a format AI can understand.”**
|
|
402
|
+
|
|
403
|
+
## Notes
|
|
404
|
+
|
|
405
|
+
- Default JSON is compact to reduce tokens; use `--pretty` or `pretty=True` when readability matters.
|
|
406
|
+
- Field `table_candidates` replaces `tables`; adjust downstream consumers accordingly.
|
|
407
|
+
|
|
408
|
+
## Enterprise Use
|
|
409
|
+
|
|
410
|
+
ExStruct is used primarily as a **library**, not a service.
|
|
411
|
+
|
|
412
|
+
- No official support or SLA is provided
|
|
413
|
+
- Long-term stability is prioritized over rapid feature growth
|
|
414
|
+
- Forking and internal modification are expected in enterprise use
|
|
415
|
+
|
|
416
|
+
This project is suitable for teams that:
|
|
417
|
+
- need transparency over black-box tools
|
|
418
|
+
- are comfortable maintaining internal forks if necessary
|
|
419
|
+
|
|
420
|
+
## Print Areas and Auto Page Breaks (PrintArea / PrintAreaView)
|
|
421
|
+
|
|
422
|
+
- `SheetData.print_areas` holds print areas (cell coordinates) in light/standard/verbose.
|
|
423
|
+
- `SheetData.auto_print_areas` holds Excel COM-computed auto page-break areas when auto page-break extraction is enabled (COM only).
|
|
424
|
+
- Use `export_print_areas_as(...)` or CLI `--print-areas-dir` to write one file per print area (nothing is written if none exist).
|
|
425
|
+
- Use CLI `--auto-page-breaks-dir` (COM only), `DestinationOptions.auto_page_breaks_dir` (preferred), or `export_auto_page_breaks(...)` to write per-auto-page-break files; the API raises `ValueError` if no auto page breaks exist.
|
|
426
|
+
- `PrintAreaView` includes rows and table candidates inside the area, plus shapes/charts that overlap the area (size-less shapes are treated as points). `normalize=True` rebases row/col indices to the area origin.
|
|
427
|
+
|
|
428
|
+
## License
|
|
429
|
+
|
|
430
|
+
BSD-3-Clause. See `LICENSE` for details.
|
|
431
|
+
|
|
432
|
+
## Documentation
|
|
433
|
+
|
|
434
|
+
- API Reference (GitHub Pages): https://harumiweb.github.io/exstruct/
|
|
435
|
+
- JSON Schemas: see `schemas/` (one file per model); regenerate via `python scripts/gen_json_schema.py`.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
exstruct/__init__.py,sha256=649fea37d359a9c994790c0256de70adace62647809547b3567ab9d1c4ba0e63,12122
|
|
2
|
+
exstruct/cli/availability.py,sha256=29b79cc084e9d4d314626f56e2745c5c1238f51c984f1c97a7115c1c2fbe79ca,1410
|
|
3
|
+
exstruct/cli/main.py,sha256=63d299d9032522ab9b29032aa77d17700815f752f024d49ab5de7a5068e64751,3830
|
|
4
|
+
exstruct/core/__init__.py,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
5
|
+
exstruct/core/cells.py,sha256=e38266674e621ddd815fdea017d4b9ede833147933a80cd56f462f7db882f0a8,35930
|
|
6
|
+
exstruct/core/charts.py,sha256=e7962eebb32dbd58165825639f0b5a32da2bf0a9880d4a57687c8a265f1e76e7,7777
|
|
7
|
+
exstruct/core/integrate.py,sha256=da216a4f864bb0a3111aa9d8b2d7cbd3ef8c26d30a32a3425e0fb90b31eb1c1a,14037
|
|
8
|
+
exstruct/core/shapes.py,sha256=f2913c5134b1c82be066e805a24cc3e21c3e5afef880dbab57987fa18c58c8bf,11069
|
|
9
|
+
exstruct/engine.py,sha256=8027aa9e9bc8d2f23c42c230fdd935b7685f4eb8b51613ed7a2d7fce2236e5a7,25543
|
|
10
|
+
exstruct/errors.py,sha256=9be81f7e93df84642fd2db4591bfbff1d5440d715287b582da113e7b0a5549ac,1002
|
|
11
|
+
exstruct/io/__init__.py,sha256=8cb00dd3e1fed186ab79f80948b241ff72a76e95b79fd9c4d4746829b61bf5ee,19267
|
|
12
|
+
exstruct/models/__init__.py,sha256=c502e877ce9bdbc899de6f6a95583282c95f9ef81d6a69296f2fef827dc3b7e3,11924
|
|
13
|
+
exstruct/models/maps.py,sha256=9ebb0e67e4d80b771b2ec3babba488cb84fa4a56681906990fff733273e73f52,12930
|
|
14
|
+
exstruct/models/types.py,sha256=4226f75035fc144bfaf88fe29bdeaa6a986924f18b3b3a048502187d27339d2a,278
|
|
15
|
+
exstruct/py.typed,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
16
|
+
exstruct/render/__init__.py,sha256=e2c16904003c6fd28f96ff2ff13b2ac677465a48f732e93627544644b4dc37d9,4242
|
|
17
|
+
exstruct-0.2.80.dist-info/WHEEL,sha256=b6dc288e80aa2d1b1518ddb3502fd5b53e8fd6cb507ed2a4f932e9e6088b264a,78
|
|
18
|
+
exstruct-0.2.80.dist-info/entry_points.txt,sha256=3429e73dd9d41bb977b49a34914dddd7ec70352b79882bb937a3999e8e8bce9c,53
|
|
19
|
+
exstruct-0.2.80.dist-info/METADATA,sha256=d162e1003f843a4912e9dbfc82c298614068832b6506a23034fb11948ef790fe,16635
|
|
20
|
+
exstruct-0.2.80.dist-info/RECORD,,
|