exstruct 0.2.80__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exstruct/__init__.py +23 -12
- exstruct/cli/main.py +20 -0
- exstruct/core/backends/__init__.py +7 -0
- exstruct/core/backends/base.py +42 -0
- exstruct/core/backends/com_backend.py +230 -0
- exstruct/core/backends/openpyxl_backend.py +191 -0
- exstruct/core/cells.py +999 -483
- exstruct/core/charts.py +243 -241
- exstruct/core/integrate.py +42 -375
- exstruct/core/logging_utils.py +16 -0
- exstruct/core/modeling.py +87 -0
- exstruct/core/pipeline.py +749 -0
- exstruct/core/ranges.py +48 -0
- exstruct/core/shapes.py +282 -36
- exstruct/core/workbook.py +114 -0
- exstruct/engine.py +51 -123
- exstruct/errors.py +12 -1
- exstruct/io/__init__.py +130 -138
- exstruct/io/serialize.py +112 -0
- exstruct/models/__init__.py +58 -8
- exstruct/render/__init__.py +3 -7
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/METADATA +133 -18
- exstruct-0.3.2.dist-info/RECORD +30 -0
- exstruct-0.2.80.dist-info/RECORD +0 -20
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/WHEEL +0 -0
- {exstruct-0.2.80.dist-info → exstruct-0.3.2.dist-info}/entry_points.txt +0 -0
exstruct/io/serialize.py
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
import json
|
|
5
|
+
from types import ModuleType
|
|
6
|
+
|
|
7
|
+
from ..errors import MissingDependencyError, SerializationError
|
|
8
|
+
from ..models.types import JsonStructure
|
|
9
|
+
|
|
10
|
+
_FORMAT_HINTS: set[str] = {"json", "yaml", "toon"}
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _normalize_format_hint(fmt: str) -> str:
|
|
14
|
+
"""Normalize a format hint string.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
fmt: Format string such as "json", "yaml", or "yml".
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
Normalized format hint.
|
|
21
|
+
"""
|
|
22
|
+
format_hint = fmt.lower()
|
|
23
|
+
if format_hint == "yml":
|
|
24
|
+
return "yaml"
|
|
25
|
+
return format_hint
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _ensure_format_hint(
    fmt: str,
    *,
    allowed: set[str],
    error_type: type[Exception],
    error_message: str,
) -> str:
    """Normalize a format hint and reject it when it is not allowed.

    Args:
        fmt: Raw format string supplied by the caller.
        allowed: Normalized hints that are accepted.
        error_type: Exception class raised for a rejected hint.
        error_message: Message template; ``{fmt}`` is filled with the raw
            (un-normalized) value of ``fmt``.

    Returns:
        The normalized hint, which is a member of ``allowed``.

    Raises:
        Exception: An instance of ``error_type`` when the normalized hint
            is not in ``allowed``.
    """
    normalized = _normalize_format_hint(fmt)
    if normalized in allowed:
        return normalized
    # The message reports the caller's original spelling, not the normalized one.
    raise error_type(error_message.format(fmt=fmt))
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _serialize_payload_from_hint(
|
|
53
|
+
payload: JsonStructure,
|
|
54
|
+
format_hint: str,
|
|
55
|
+
*,
|
|
56
|
+
pretty: bool = False,
|
|
57
|
+
indent: int | None = None,
|
|
58
|
+
) -> str:
|
|
59
|
+
"""Serialize a payload using a normalized format hint.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
payload: JSON-serializable payload.
|
|
63
|
+
format_hint: Normalized format hint ("json", "yaml", "toon").
|
|
64
|
+
pretty: Whether to pretty-print JSON.
|
|
65
|
+
indent: Optional JSON indentation width.
|
|
66
|
+
|
|
67
|
+
Returns:
|
|
68
|
+
Serialized string for the requested format.
|
|
69
|
+
"""
|
|
70
|
+
match format_hint:
|
|
71
|
+
case "json":
|
|
72
|
+
indent_val = 2 if pretty and indent is None else indent
|
|
73
|
+
return json.dumps(payload, ensure_ascii=False, indent=indent_val)
|
|
74
|
+
case "yaml":
|
|
75
|
+
yaml = _require_yaml()
|
|
76
|
+
return str(
|
|
77
|
+
yaml.safe_dump(
|
|
78
|
+
payload,
|
|
79
|
+
allow_unicode=True,
|
|
80
|
+
sort_keys=False,
|
|
81
|
+
indent=2,
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
case "toon":
|
|
85
|
+
toon = _require_toon()
|
|
86
|
+
return str(toon.encode(payload))
|
|
87
|
+
case _:
|
|
88
|
+
raise SerializationError(
|
|
89
|
+
f"Unsupported export format '{format_hint}'. Allowed: json, yaml, yml, toon."
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _require_yaml() -> ModuleType:
    """Import and return the ``yaml`` module.

    Returns:
        The imported ``yaml`` module object.

    Raises:
        MissingDependencyError: If importing ``yaml`` fails; the message
            carries installation guidance and the ImportError is chained.
    """
    try:
        return importlib.import_module("yaml")
    except ImportError as exc:
        raise MissingDependencyError(
            "YAML export requires pyyaml. Install it via `pip install pyyaml` or add the 'yaml' extra."
        ) from exc
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _require_toon() -> ModuleType:
    """Import and return the ``toon`` module.

    Returns:
        The imported ``toon`` module object.

    Raises:
        MissingDependencyError: If importing ``toon`` fails; the message
            carries installation guidance and the ImportError is chained.
    """
    try:
        return importlib.import_module("toon")
    except ImportError as exc:
        raise MissingDependencyError(
            "TOON export requires python-toon. Install it via `pip install python-toon` or add the 'toon' extra."
        ) from exc
|
exstruct/models/__init__.py
CHANGED
|
@@ -8,21 +8,34 @@ from typing import Literal
|
|
|
8
8
|
from pydantic import BaseModel, Field
|
|
9
9
|
|
|
10
10
|
|
|
11
|
-
class
|
|
12
|
-
"""
|
|
11
|
+
class BaseShape(BaseModel):
|
|
12
|
+
"""Common shape metadata (position, size, text, and styling)."""
|
|
13
13
|
|
|
14
14
|
id: int | None = Field(
|
|
15
|
-
default=None,
|
|
15
|
+
default=None,
|
|
16
|
+
description="Sequential shape id within the sheet (if applicable).",
|
|
16
17
|
)
|
|
17
18
|
text: str = Field(description="Visible text content of the shape.")
|
|
18
19
|
l: int = Field(description="Left offset (Excel units).") # noqa: E741
|
|
19
20
|
t: int = Field(description="Top offset (Excel units).")
|
|
20
21
|
w: int | None = Field(default=None, description="Shape width (None if unknown).")
|
|
21
22
|
h: int | None = Field(default=None, description="Shape height (None if unknown).")
|
|
22
|
-
type: str | None = Field(default=None, description="Excel shape type name.")
|
|
23
23
|
rotation: float | None = Field(
|
|
24
24
|
default=None, description="Rotation angle in degrees."
|
|
25
25
|
)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Shape(BaseShape):
|
|
29
|
+
"""Normal shape metadata."""
|
|
30
|
+
|
|
31
|
+
kind: Literal["shape"] = Field(default="shape", description="Shape kind.")
|
|
32
|
+
type: str | None = Field(default=None, description="Excel shape type name.")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Arrow(BaseShape):
|
|
36
|
+
"""Connector shape metadata."""
|
|
37
|
+
|
|
38
|
+
kind: Literal["arrow"] = Field(default="arrow", description="Shape kind.")
|
|
26
39
|
begin_arrow_style: int | None = Field(
|
|
27
40
|
default=None, description="Arrow style enum for the start of a connector."
|
|
28
41
|
)
|
|
@@ -46,6 +59,33 @@ class Shape(BaseModel):
|
|
|
46
59
|
)
|
|
47
60
|
|
|
48
61
|
|
|
62
|
+
class SmartArtNode(BaseModel):
    """Node of SmartArt hierarchy."""

    # Text carried by this node.
    text: str = Field(
        description="Visible text for the node.",
    )
    # Children are SmartArtNode instances too, so the model is recursive;
    # a fresh empty list is created per instance.
    kids: list[SmartArtNode] = Field(
        default_factory=list,
        description="Child nodes.",
    )
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class SmartArt(BaseShape):
    """SmartArt shape metadata with nested nodes."""

    # Fixed tag for this model; Shape and Arrow use "shape" and "arrow"
    # in the same field.
    kind: Literal["smartart"] = Field(
        default="smartart",
        description="Shape kind.",
    )
    # Required: no default is provided.
    layout: str = Field(
        description="SmartArt layout name.",
    )
    # Top-level nodes; each node holds its own children in ``kids``.
    nodes: list[SmartArtNode] = Field(
        default_factory=list,
        description="Root nodes of SmartArt tree.",
    )
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class MergedCell(BaseModel):
    """Metadata for a merged cell range."""

    # Rows are 1-based while columns are 0-based; both ends are inclusive.
    r1: int = Field(description="Start row (1-based).")
    c1: int = Field(description="Start column (0-based).")
    r2: int = Field(description="End row (1-based, inclusive).")
    c2: int = Field(description="End column (0-based, inclusive).")
    # Declared through Field like every other attribute so the generated
    # schema documents it; the default stays "".
    # NOTE(review): presumably the text value shown in the merged range —
    # confirm against the extractor that populates it.
    v: str = Field(
        default="",
        description="Value text of the merged range (empty if none).",
    )
|
|
87
|
+
|
|
88
|
+
|
|
49
89
|
class CellRow(BaseModel):
|
|
50
90
|
"""A single row of cells with optional hyperlinks."""
|
|
51
91
|
|
|
@@ -97,9 +137,9 @@ class PrintArea(BaseModel):
|
|
|
97
137
|
"""Cell coordinate bounds for a print area."""
|
|
98
138
|
|
|
99
139
|
r1: int = Field(description="Start row (1-based).")
|
|
100
|
-
c1: int = Field(description="Start column (
|
|
140
|
+
c1: int = Field(description="Start column (0-based).")
|
|
101
141
|
r2: int = Field(description="End row (1-based, inclusive).")
|
|
102
|
-
c2: int = Field(description="End column (
|
|
142
|
+
c2: int = Field(description="End column (0-based, inclusive).")
|
|
103
143
|
|
|
104
144
|
|
|
105
145
|
class SheetData(BaseModel):
|
|
@@ -108,7 +148,7 @@ class SheetData(BaseModel):
|
|
|
108
148
|
rows: list[CellRow] = Field(
|
|
109
149
|
default_factory=list, description="Extracted rows with cell values and links."
|
|
110
150
|
)
|
|
111
|
-
shapes: list[Shape] = Field(
|
|
151
|
+
shapes: list[Shape | Arrow | SmartArt] = Field(
|
|
112
152
|
default_factory=list, description="Shapes detected on the sheet."
|
|
113
153
|
)
|
|
114
154
|
charts: list[Chart] = Field(
|
|
@@ -123,6 +163,16 @@ class SheetData(BaseModel):
|
|
|
123
163
|
auto_print_areas: list[PrintArea] = Field(
|
|
124
164
|
default_factory=list, description="COM-computed auto page-break areas."
|
|
125
165
|
)
|
|
166
|
+
colors_map: dict[str, list[tuple[int, int]]] = Field(
|
|
167
|
+
default_factory=dict,
|
|
168
|
+
description=(
|
|
169
|
+
"Mapping of hex color codes to lists of (row, column) tuples "
|
|
170
|
+
"where row is 1-based and column is 0-based."
|
|
171
|
+
),
|
|
172
|
+
)
|
|
173
|
+
merged_cells: list[MergedCell] = Field(
|
|
174
|
+
default_factory=list, description="Merged cell ranges on the sheet."
|
|
175
|
+
)
|
|
126
176
|
|
|
127
177
|
def _as_payload(self) -> dict[str, object]:
|
|
128
178
|
from ..io import dict_without_empty_values
|
|
@@ -259,7 +309,7 @@ class PrintAreaView(BaseModel):
|
|
|
259
309
|
book_name: str = Field(description="Workbook name owning the area.")
|
|
260
310
|
sheet_name: str = Field(description="Sheet name owning the area.")
|
|
261
311
|
area: PrintArea = Field(description="Print area bounds.")
|
|
262
|
-
shapes: list[Shape] = Field(
|
|
312
|
+
shapes: list[Shape | Arrow | SmartArt] = Field(
|
|
263
313
|
default_factory=list, description="Shapes overlapping the area."
|
|
264
314
|
)
|
|
265
315
|
charts: list[Chart] = Field(
|
exstruct/render/__init__.py
CHANGED
|
@@ -49,10 +49,8 @@ def export_pdf(excel_path: str | Path, output_pdf: str | Path) -> list[str]:
|
|
|
49
49
|
raise
|
|
50
50
|
except Exception as exc:
|
|
51
51
|
raise RenderError(
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
f"'{normalized_excel_path}' to '{normalized_output_pdf}'."
|
|
55
|
-
)
|
|
52
|
+
"Failed to export PDF for "
|
|
53
|
+
f"'{normalized_excel_path}' to '{normalized_output_pdf}'."
|
|
56
54
|
) from exc
|
|
57
55
|
finally:
|
|
58
56
|
if wb is not None:
|
|
@@ -60,9 +58,7 @@ def export_pdf(excel_path: str | Path, output_pdf: str | Path) -> list[str]:
|
|
|
60
58
|
if app is not None:
|
|
61
59
|
app.quit()
|
|
62
60
|
if not normalized_output_pdf.exists():
|
|
63
|
-
raise RenderError(
|
|
64
|
-
f"Failed to export PDF to '{normalized_output_pdf}'."
|
|
65
|
-
)
|
|
61
|
+
raise RenderError(f"Failed to export PDF to '{normalized_output_pdf}'.")
|
|
66
62
|
return sheet_names
|
|
67
63
|
|
|
68
64
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: exstruct
|
|
3
|
-
Version: 0.2
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: Excel to structured JSON (tables, shapes, charts) for LLM/RAG pipelines
|
|
5
5
|
Keywords: excel,structure,data,exstruct
|
|
6
6
|
Author: harumiWeb
|
|
@@ -55,18 +55,18 @@ Description-Content-Type: text/markdown
|
|
|
55
55
|
|
|
56
56
|
# ExStruct — Excel Structured Extraction Engine
|
|
57
57
|
|
|
58
|
-
[](https://pypi.org/project/exstruct/) [](https://pepy.tech/projects/exstruct)  [](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade)
|
|
58
|
+
[PyPI](https://pypi.org/project/exstruct/) [Downloads](https://pepy.tech/projects/exstruct)  [pytest](https://github.com/harumiWeb/exstruct/actions/workflows/pytest.yml) [Codacy grade](https://app.codacy.com/gh/harumiWeb/exstruct/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) [Codecov](https://codecov.io/gh/harumiWeb/exstruct)
|
|
59
59
|
|
|
60
60
|

|
|
61
61
|
|
|
62
|
-
ExStruct reads Excel workbooks and outputs structured data (cells, table candidates, shapes, charts, print areas/views, auto page-break areas, hyperlinks) as JSON by default, with optional YAML/TOON formats. It targets both COM/Excel environments (rich extraction) and non-COM environments (cells + table candidates + print areas), with tunable detection heuristics and multiple output modes to fit LLM/RAG pipelines.
|
|
62
|
+
ExStruct reads Excel workbooks and outputs structured data (cells, table candidates, shapes, charts, smartart, merged cell ranges, print areas/views, auto page-break areas, hyperlinks) as JSON by default, with optional YAML/TOON formats. It targets both COM/Excel environments (rich extraction) and non-COM environments (cells + table candidates + print areas), with tunable detection heuristics and multiple output modes to fit LLM/RAG pipelines.
|
|
63
63
|
|
|
64
|
-
[日本版README](README.ja.md)
|
|
64
|
+
[日本版 README](README.ja.md)
|
|
65
65
|
|
|
66
66
|
## Features
|
|
67
67
|
|
|
68
|
-
- **Excel → Structured JSON**: cells, shapes, charts, table candidates, print areas/views, and auto page-break areas per sheet.
|
|
69
|
-
- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, print areas), `verbose` (all shapes with width/height, charts with size, print areas). Verbose also emits cell hyperlinks
|
|
68
|
+
- **Excel → Structured JSON**: cells, shapes, charts, smartart, table candidates, print areas/views, and auto page-break areas per sheet.
|
|
69
|
+
- **Output modes**: `light` (cells + table candidates + print areas; no COM, shapes/charts empty), `standard` (texted shapes + arrows, charts, smartart, merged cell ranges, print areas), `verbose` (all shapes with width/height, charts with size, merged cell ranges, print areas). Verbose also emits cell hyperlinks and `colors_map`. Size output is flag-controlled.
|
|
70
70
|
- **Auto page-break export (COM only)**: capture Excel-computed auto page breaks and write per-area JSON/YAML/TOON when requested (CLI option appears only when COM is available).
|
|
71
71
|
- **Formats**: JSON (compact by default, `--pretty` available), YAML, TOON (optional dependencies).
|
|
72
72
|
- **Table detection tuning**: adjust heuristics at runtime via API.
|
|
@@ -188,8 +188,8 @@ Use higher thresholds to reduce false positives; lower them if true tables are m
|
|
|
188
188
|
## Output Modes
|
|
189
189
|
|
|
190
190
|
- **light**: cells + table candidates (no COM needed).
|
|
191
|
-
- **standard**: texted shapes + arrows, charts (COM if available), table candidates. Hyperlinks are off unless `include_cell_links=True`.
|
|
192
|
-
- **verbose**: all shapes (with width/height), charts, table candidates,
|
|
191
|
+
- **standard**: texted shapes + arrows, charts (COM if available), merged cell ranges, table candidates. Hyperlinks are off unless `include_cell_links=True`.
|
|
192
|
+
- **verbose**: all shapes (with width/height), charts, merged cell ranges, table candidates, cell hyperlinks, and `colors_map`.
|
|
193
193
|
|
|
194
194
|
## Error Handling / Fallbacks
|
|
195
195
|
|
|
@@ -207,7 +207,7 @@ exstruct input.xlsx --pdf --image --dpi 144
|
|
|
207
207
|
|
|
208
208
|
Creates `<output>.pdf` and `<output>_images/` PNGs per sheet.
|
|
209
209
|
|
|
210
|
-
##
|
|
210
|
+
## Example 1: Excel Structuring Demo
|
|
211
211
|
|
|
212
212
|
To show how well exstruct can structure Excel, we parse a workbook that combines three elements on one sheet and share an AI reasoning benchmark that uses the JSON output.
|
|
213
213
|
|
|
@@ -218,7 +218,6 @@ To show how well exstruct can structure Excel, we parse a workbook that combines
|
|
|
218
218
|
(Screenshot below is the actual sample Excel sheet)
|
|
219
219
|

|
|
220
220
|
Sample workbook: `sample/sample.xlsx`
|
|
221
|
-
Sample workbook: `sample/sample.xlsx`
|
|
222
221
|
|
|
223
222
|
### 1. Input: Excel Sheet Overview
|
|
224
223
|
|
|
@@ -274,23 +273,29 @@ Below is a **shortened JSON output example** from parsing this Excel workbook.
|
|
|
274
273
|
],
|
|
275
274
|
"shapes": [
|
|
276
275
|
{
|
|
276
|
+
"id": 1,
|
|
277
277
|
"text": "開始",
|
|
278
278
|
"l": 148,
|
|
279
279
|
"t": 220,
|
|
280
|
+
"kind": "shape",
|
|
280
281
|
"type": "AutoShape-FlowchartProcess"
|
|
281
282
|
},
|
|
282
283
|
{
|
|
284
|
+
"id": 2,
|
|
283
285
|
"text": "入力データ読み込み",
|
|
284
286
|
"l": 132,
|
|
285
287
|
"t": 282,
|
|
288
|
+
"kind": "shape",
|
|
286
289
|
"type": "AutoShape-FlowchartProcess"
|
|
287
290
|
},
|
|
288
291
|
{
|
|
289
292
|
"l": 193,
|
|
290
293
|
"t": 246,
|
|
291
|
-
"
|
|
294
|
+
"kind": "arrow",
|
|
292
295
|
"begin_arrow_style": 1,
|
|
293
296
|
"end_arrow_style": 2,
|
|
297
|
+
"begin_id": 1,
|
|
298
|
+
"end_id": 2,
|
|
294
299
|
"direction": "N"
|
|
295
300
|
},
|
|
296
301
|
...
|
|
@@ -374,22 +379,110 @@ flowchart TD
|
|
|
374
379
|
|
|
375
380
|
A --> B
|
|
376
381
|
B --> C
|
|
377
|
-
C
|
|
378
|
-
C
|
|
382
|
+
C -->|yes| D
|
|
383
|
+
C --> H
|
|
384
|
+
D --> E
|
|
379
385
|
E --> F
|
|
380
|
-
F
|
|
381
|
-
|
|
382
|
-
G
|
|
383
|
-
H
|
|
384
|
-
H -- no --> J
|
|
386
|
+
F --> G
|
|
387
|
+
G -->|yes| I
|
|
388
|
+
G -->|no| J
|
|
389
|
+
H --> J
|
|
385
390
|
I --> J
|
|
386
391
|
```
|
|
387
392
|
````
|
|
388
393
|
|
|
394
|
+
## Example 2: General Application Form
|
|
395
|
+
|
|
396
|
+
### Excel Sheet
|
|
397
|
+
|
|
398
|
+

|
|
399
|
+
|
|
400
|
+
### ExStruct JSON
|
|
401
|
+
|
|
402
|
+
(Truncated for brevity)
|
|
403
|
+
|
|
404
|
+
```json
|
|
405
|
+
{
|
|
406
|
+
"book_name": "ja_form.xlsx",
|
|
407
|
+
"sheets": {
|
|
408
|
+
"Sheet1": {
|
|
409
|
+
"rows": [
|
|
410
|
+
{ "r": 1, "c": { "0": "??????????????" } },
|
|
411
|
+
{
|
|
412
|
+
"r": 3,
|
|
413
|
+
"c": { "0": "???", "7": " ???????????????" }
|
|
414
|
+
},
|
|
415
|
+
{ "r": 4, "c": { "1": "X???" } },
|
|
416
|
+
...
|
|
417
|
+
],
|
|
418
|
+
"table_candidates": ["B25:C26", "C37:D50"],
|
|
419
|
+
"merged_cells": [
|
|
420
|
+
{
|
|
421
|
+
"r1": 55,
|
|
422
|
+
"c1": 5,
|
|
423
|
+
"r2": 55,
|
|
424
|
+
"c2": 10,
|
|
425
|
+
"v": "?????????????????????????????"
|
|
426
|
+
},
|
|
427
|
+
{ "r1": 54, "c1": 8, "r2": 54, "c2": 10 },
|
|
428
|
+
{ "r1": 51, "c1": 5, "r2": 52, "c2": 6, "v": "????" },
|
|
429
|
+
...
|
|
430
|
+
]
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
}
|
|
434
|
+
```
|
|
435
|
+
|
|
436
|
+
### LLM reconstruction example
|
|
437
|
+
|
|
438
|
+
```md
|
|
439
|
+
# ??????????????
|
|
440
|
+
|
|
441
|
+
????????????????????????
|
|
442
|
+
X ??
|
|
443
|
+
|
|
444
|
+
?????????????????????????????????????????
|
|
445
|
+
|
|
446
|
+
---
|
|
447
|
+
|
|
448
|
+
## ??????
|
|
449
|
+
|
|
450
|
+
| ?? | ?? |
|
|
451
|
+
| ------ | -------------- |
|
|
452
|
+
| ???? | |
|
|
453
|
+
| ????? | |
|
|
454
|
+
| ?????? | |
|
|
455
|
+
| ???? | |
|
|
456
|
+
| ???? | ?????????????? |
|
|
457
|
+
| ?? | |
|
|
458
|
+
| ??? | |
|
|
459
|
+
|
|
460
|
+
---
|
|
461
|
+
|
|
462
|
+
## ?????????????
|
|
463
|
+
|
|
464
|
+
| ?? | ?? |
|
|
465
|
+
| ----------- | ----- |
|
|
466
|
+
| ??????????? | |
|
|
467
|
+
| ??? | |
|
|
468
|
+
| ???????? | ????? |
|
|
469
|
+
|
|
470
|
+
**???????????????????????????????????????**
|
|
471
|
+
|
|
472
|
+
...
|
|
473
|
+
```
|
|
474
|
+
|
|
389
475
|
From this we can see:
|
|
390
476
|
|
|
391
477
|
**exstruct's JSON is already in a format that AI can read and reason over directly.**
|
|
392
478
|
|
|
479
|
+
Other LLM inference samples using this library can be found in the following directories:
|
|
480
|
+
|
|
481
|
+
- [Basic Excel](sample/basic/)
|
|
482
|
+
- [Flowchart](sample/flowchart/)
|
|
483
|
+
- [Gantt Chart](sample/gantt_chart/)
|
|
484
|
+
- [Application forms with many merged cells](sample/forms_with_many_merged_cells/)
|
|
485
|
+
|
|
393
486
|
### 4. Summary
|
|
394
487
|
|
|
395
488
|
This benchmark confirms exstruct can:
|
|
@@ -414,6 +507,7 @@ ExStruct is used primarily as a **library**, not a service.
|
|
|
414
507
|
- Forking and internal modification are expected in enterprise use
|
|
415
508
|
|
|
416
509
|
This project is suitable for teams that:
|
|
510
|
+
|
|
417
511
|
- need transparency over black-box tools
|
|
418
512
|
- are comfortable maintaining internal forks if necessary
|
|
419
513
|
|
|
@@ -425,6 +519,27 @@ This project is suitable for teams that:
|
|
|
425
519
|
- Use CLI `--auto-page-breaks-dir` (COM only), `DestinationOptions.auto_page_breaks_dir` (preferred), or `export_auto_page_breaks(...)` to write per-auto-page-break files; the API raises `ValueError` if no auto page breaks exist.
|
|
426
520
|
- `PrintAreaView` includes rows and table candidates inside the area, plus shapes/charts that overlap the area (size-less shapes are treated as points). `normalize=True` rebases row/col indices to the area origin.
|
|
427
521
|
|
|
522
|
+
## Architecture
|
|
523
|
+
|
|
524
|
+
ExStruct uses a pipeline-based architecture that separates
|
|
525
|
+
extraction strategy (Backend) from orchestration (Pipeline)
|
|
526
|
+
and semantic modeling.
|
|
527
|
+
|
|
528
|
+
→ See: [docs/architecture/pipeline.md](docs/architecture/pipeline.md)
|
|
529
|
+
|
|
530
|
+
## Contributing
|
|
531
|
+
|
|
532
|
+
If you plan to extend ExStruct internals,
|
|
533
|
+
please read the contributor architecture guide.
|
|
534
|
+
|
|
535
|
+
→ [docs/contributors/architecture.md](docs/contributors/architecture.md)
|
|
536
|
+
|
|
537
|
+
## Note on coverage
|
|
538
|
+
|
|
539
|
+
The cell-structure inference logic (cells.py) relies on heuristic rules
|
|
540
|
+
and Excel-specific behaviors. Full coverage is intentionally not pursued,
|
|
541
|
+
as exhaustive testing would not reflect real-world reliability.
|
|
542
|
+
|
|
428
543
|
## License
|
|
429
544
|
|
|
430
545
|
BSD-3-Clause. See `LICENSE` for details.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
exstruct/__init__.py,sha256=94434d63545b0ab9676956233503407666023024759ed644a807ea2feac85948,12512
|
|
2
|
+
exstruct/cli/availability.py,sha256=29b79cc084e9d4d314626f56e2745c5c1238f51c984f1c97a7115c1c2fbe79ca,1410
|
|
3
|
+
exstruct/cli/main.py,sha256=acc182aff4415ae6773293a18bfa0ced28523b7ea790b2094d08c1603a892049,4429
|
|
4
|
+
exstruct/core/__init__.py,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
5
|
+
exstruct/core/backends/__init__.py,sha256=869dfea0defcb6f61fe51bdb834374c4bc70d12e60b5e5b24dc69cc0c7a73f56,207
|
|
6
|
+
exstruct/core/backends/base.py,sha256=7b89e4f700f4ccd2e5ec0c3ad5b452ccd8366e6e0777d04ba9b8d2eb5425bb7b,1309
|
|
7
|
+
exstruct/core/backends/com_backend.py,sha256=6c61cec63d5513eb185b7dd58de89edc98abff36f38a8431f4c9a1a2ed25b638,8151
|
|
8
|
+
exstruct/core/backends/openpyxl_backend.py,sha256=6537dda274b1b522e91415928021a3bc253ea992d74c621d676dba7f0a379e14,5854
|
|
9
|
+
exstruct/core/cells.py,sha256=a008d66dbc40b52e95a2ae8ea5460eecee01845639dc16d8d4f1d39551cf4b8f,52196
|
|
10
|
+
exstruct/core/charts.py,sha256=5c270770e06b3c2c4a93792f1e318c3830830123df27c9bc10a6f0f6cfac9213,8081
|
|
11
|
+
exstruct/core/integrate.py,sha256=d994f6bf45bc1de496cd211f1b9ebcd71717e3dcfc52e8c20086cd59595d3546,2088
|
|
12
|
+
exstruct/core/logging_utils.py,sha256=f566672aec02b21915cacd5bf3270676ac41cd418e97b7efea73fc371d516773,449
|
|
13
|
+
exstruct/core/modeling.py,sha256=83503084f905c4beb5c75463fdc72e0e08b2575990fa8bba16918f4e1fcbf799,2265
|
|
14
|
+
exstruct/core/pipeline.py,sha256=4f5df12b015d41c7167d05cdc101749fe70d49abdd80f1ec20c103bb5a1aa436,25661
|
|
15
|
+
exstruct/core/ranges.py,sha256=16aab38f4368e6c19721b45e1c61bdc60a3e31a474d7155fdfdd790ff1fcf0b9,1137
|
|
16
|
+
exstruct/core/shapes.py,sha256=ede3cd115abb90519af1fc093f0aac3a6aac5f0772b94eb51938885c252c9a12,20301
|
|
17
|
+
exstruct/core/workbook.py,sha256=bffd7c4b89e1de5b668155827406debf6d17e66b1e056f28b5f30bf78bb499df,3444
|
|
18
|
+
exstruct/engine.py,sha256=70be13aad0bdccb1a2fdafda10e1678af31da9d7e18095b68987617b88ff3d01,23051
|
|
19
|
+
exstruct/errors.py,sha256=d3cef6732d519bd0fb14e289fa296ea5ae26f6a3f7291c28da810478895f90a4,1277
|
|
20
|
+
exstruct/io/__init__.py,sha256=1cb0ba434e540c8c746bafcd6b100115c154fc75cba4a1438ed57b82c4598eeb,18860
|
|
21
|
+
exstruct/io/serialize.py,sha256=cbd32fe7c8fa275e3922507b3a82866bae57cca46b79124ed970da723520ac88,3272
|
|
22
|
+
exstruct/models/__init__.py,sha256=6598a14f0e3439f6599ee0ac52d014b19263cb27527a10adfd151d8a03a3b22c,13610
|
|
23
|
+
exstruct/models/maps.py,sha256=9ebb0e67e4d80b771b2ec3babba488cb84fa4a56681906990fff733273e73f52,12930
|
|
24
|
+
exstruct/models/types.py,sha256=4226f75035fc144bfaf88fe29bdeaa6a986924f18b3b3a048502187d27339d2a,278
|
|
25
|
+
exstruct/py.typed,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
26
|
+
exstruct/render/__init__.py,sha256=547eae65c53c748f6153ef2ba54373466cc521b3b95cff5e603b9eff8eae1d69,4164
|
|
27
|
+
exstruct-0.3.2.dist-info/WHEEL,sha256=b6dc288e80aa2d1b1518ddb3502fd5b53e8fd6cb507ed2a4f932e9e6088b264a,78
|
|
28
|
+
exstruct-0.3.2.dist-info/entry_points.txt,sha256=3429e73dd9d41bb977b49a34914dddd7ec70352b79882bb937a3999e8e8bce9c,53
|
|
29
|
+
exstruct-0.3.2.dist-info/METADATA,sha256=ccaec8fee811a6f92fe0284cc66be0ce81e18e7ada0cd29329dde207a6f7294b,19467
|
|
30
|
+
exstruct-0.3.2.dist-info/RECORD,,
|
exstruct-0.2.80.dist-info/RECORD
DELETED
|
@@ -1,20 +0,0 @@
|
|
|
1
|
-
exstruct/__init__.py,sha256=649fea37d359a9c994790c0256de70adace62647809547b3567ab9d1c4ba0e63,12122
|
|
2
|
-
exstruct/cli/availability.py,sha256=29b79cc084e9d4d314626f56e2745c5c1238f51c984f1c97a7115c1c2fbe79ca,1410
|
|
3
|
-
exstruct/cli/main.py,sha256=63d299d9032522ab9b29032aa77d17700815f752f024d49ab5de7a5068e64751,3830
|
|
4
|
-
exstruct/core/__init__.py,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
5
|
-
exstruct/core/cells.py,sha256=e38266674e621ddd815fdea017d4b9ede833147933a80cd56f462f7db882f0a8,35930
|
|
6
|
-
exstruct/core/charts.py,sha256=e7962eebb32dbd58165825639f0b5a32da2bf0a9880d4a57687c8a265f1e76e7,7777
|
|
7
|
-
exstruct/core/integrate.py,sha256=da216a4f864bb0a3111aa9d8b2d7cbd3ef8c26d30a32a3425e0fb90b31eb1c1a,14037
|
|
8
|
-
exstruct/core/shapes.py,sha256=f2913c5134b1c82be066e805a24cc3e21c3e5afef880dbab57987fa18c58c8bf,11069
|
|
9
|
-
exstruct/engine.py,sha256=8027aa9e9bc8d2f23c42c230fdd935b7685f4eb8b51613ed7a2d7fce2236e5a7,25543
|
|
10
|
-
exstruct/errors.py,sha256=9be81f7e93df84642fd2db4591bfbff1d5440d715287b582da113e7b0a5549ac,1002
|
|
11
|
-
exstruct/io/__init__.py,sha256=8cb00dd3e1fed186ab79f80948b241ff72a76e95b79fd9c4d4746829b61bf5ee,19267
|
|
12
|
-
exstruct/models/__init__.py,sha256=c502e877ce9bdbc899de6f6a95583282c95f9ef81d6a69296f2fef827dc3b7e3,11924
|
|
13
|
-
exstruct/models/maps.py,sha256=9ebb0e67e4d80b771b2ec3babba488cb84fa4a56681906990fff733273e73f52,12930
|
|
14
|
-
exstruct/models/types.py,sha256=4226f75035fc144bfaf88fe29bdeaa6a986924f18b3b3a048502187d27339d2a,278
|
|
15
|
-
exstruct/py.typed,sha256=e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855,0
|
|
16
|
-
exstruct/render/__init__.py,sha256=e2c16904003c6fd28f96ff2ff13b2ac677465a48f732e93627544644b4dc37d9,4242
|
|
17
|
-
exstruct-0.2.80.dist-info/WHEEL,sha256=b6dc288e80aa2d1b1518ddb3502fd5b53e8fd6cb507ed2a4f932e9e6088b264a,78
|
|
18
|
-
exstruct-0.2.80.dist-info/entry_points.txt,sha256=3429e73dd9d41bb977b49a34914dddd7ec70352b79882bb937a3999e8e8bce9c,53
|
|
19
|
-
exstruct-0.2.80.dist-info/METADATA,sha256=d162e1003f843a4912e9dbfc82c298614068832b6506a23034fb11948ef790fe,16635
|
|
20
|
-
exstruct-0.2.80.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|