carloforte 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- carloforte-0.1.0/PKG-INFO +79 -0
- carloforte-0.1.0/README.md +60 -0
- carloforte-0.1.0/pyproject.toml +50 -0
- carloforte-0.1.0/src/carloforte/__init__.py +41 -0
- carloforte-0.1.0/src/carloforte/_islands.py +70 -0
- carloforte-0.1.0/src/carloforte/_models.py +18 -0
- carloforte-0.1.0/src/carloforte/_reader.py +21 -0
- carloforte-0.1.0/src/carloforte/_serialiser.py +91 -0
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: carloforte
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Extract structured data from Excel files with minimal token usage
|
|
5
|
+
Keywords: excel,xlsx,llm,extraction,json
|
|
6
|
+
Author: Giovanni De Cillis
|
|
7
|
+
Author-email: Giovanni De Cillis <giovanni.decillis@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Dist: loguru>=0.7.3
|
|
15
|
+
Requires-Dist: openpyxl>=3.1.5
|
|
16
|
+
Requires-Dist: pre-commit>=4.6.0
|
|
17
|
+
Requires-Python: >=3.14
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# carloforte
|
|
21
|
+
|
|
22
|
+
Extract structured data from Excel files with minimal token usage.
|
|
23
|
+
|
|
24
|
+
carloforte uses an island-detection algorithm to convert Excel sheets into a compact intermediate representation (CSV, Markdown, or JSON), making it efficient to pass spreadsheet data to LLMs.
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
TBC
|
|
28
|
+
|
|
29
|
+
## Usage
|
|
30
|
+
|
|
31
|
+
```python
|
|
32
|
+
import carloforte
|
|
33
|
+
|
|
34
|
+
# Extract all sheets as CSV (default)
|
|
35
|
+
text = carloforte.extract("data.xlsx")
|
|
36
|
+
|
|
37
|
+
# Extract specific sheets as Markdown
|
|
38
|
+
text = carloforte.extract("data.xlsx", sheets=["Revenue", "Costs"], fmt="markdown")
|
|
39
|
+
|
|
40
|
+
# Extract as JSON
|
|
41
|
+
text = carloforte.extract("data.xlsx", fmt="json")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Formats
|
|
45
|
+
|
|
46
|
+
| Format | Best for |
|
|
47
|
+
|--------|----------|
|
|
48
|
+
| `csv` | Compact, low token count |
|
|
49
|
+
| `markdown` | Readable, good for LLM prompts |
|
|
50
|
+
| `json` | Structured output, programmatic use |
|
|
51
|
+
|
|
52
|
+
### CLI
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
carloforte data.xlsx --fmt markdown
|
|
56
|
+
carloforte data.xlsx --sheets Revenue Costs --fmt json
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## How it works
|
|
60
|
+
|
|
61
|
+
Excel sheets often contain multiple disconnected tables, empty rows, and metadata scattered around. carloforte detects each contiguous block of data ("island") independently and serialises only what matters — reducing token usage by 60–75% compared to passing raw Excel content to an LLM.
|
|
62
|
+
|
|
63
|
+
## Architecture
|
|
64
|
+
|
|
65
|
+
```mermaid
|
|
66
|
+
flowchart LR
|
|
67
|
+
A["📄 .xlsx file"] --> B["_reader\nload sheets"]
|
|
68
|
+
B --> C["dict[sheet → grid]"]
|
|
69
|
+
C --> D["_islands\nBFS detection"]
|
|
70
|
+
D --> E["dict[sheet → islands]"]
|
|
71
|
+
E --> F{"fmt?"}
|
|
72
|
+
F -->|csv| G["CSV"]
|
|
73
|
+
F -->|markdown| H["Markdown"]
|
|
74
|
+
F -->|json| I["JSON"]
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## License
|
|
78
|
+
|
|
79
|
+
MIT
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# carloforte
|
|
2
|
+
|
|
3
|
+
Extract structured data from Excel files with minimal token usage.
|
|
4
|
+
|
|
5
|
+
carloforte uses an island-detection algorithm to convert Excel sheets into a compact intermediate representation (CSV, Markdown, or JSON), making it efficient to pass spreadsheet data to LLMs.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
TBC
|
|
9
|
+
|
|
10
|
+
## Usage
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
import carloforte
|
|
14
|
+
|
|
15
|
+
# Extract all sheets as CSV (default)
|
|
16
|
+
text = carloforte.extract("data.xlsx")
|
|
17
|
+
|
|
18
|
+
# Extract specific sheets as Markdown
|
|
19
|
+
text = carloforte.extract("data.xlsx", sheets=["Revenue", "Costs"], fmt="markdown")
|
|
20
|
+
|
|
21
|
+
# Extract as JSON
|
|
22
|
+
text = carloforte.extract("data.xlsx", fmt="json")
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Formats
|
|
26
|
+
|
|
27
|
+
| Format | Best for |
|
|
28
|
+
|--------|----------|
|
|
29
|
+
| `csv` | Compact, low token count |
|
|
30
|
+
| `markdown` | Readable, good for LLM prompts |
|
|
31
|
+
| `json` | Structured output, programmatic use |
|
|
32
|
+
|
|
33
|
+
### CLI
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
carloforte data.xlsx --fmt markdown
|
|
37
|
+
carloforte data.xlsx --sheets Revenue Costs --fmt json
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## How it works
|
|
41
|
+
|
|
42
|
+
Excel sheets often contain multiple disconnected tables, empty rows, and metadata scattered around. carloforte detects each contiguous block of data ("island") independently and serialises only what matters — reducing token usage by 60–75% compared to passing raw Excel content to an LLM.
|
|
43
|
+
|
|
44
|
+
## Architecture
|
|
45
|
+
|
|
46
|
+
```mermaid
|
|
47
|
+
flowchart LR
|
|
48
|
+
A["📄 .xlsx file"] --> B["_reader\nload sheets"]
|
|
49
|
+
B --> C["dict[sheet → grid]"]
|
|
50
|
+
C --> D["_islands\nBFS detection"]
|
|
51
|
+
D --> E["dict[sheet → islands]"]
|
|
52
|
+
E --> F{"fmt?"}
|
|
53
|
+
F -->|csv| G["CSV"]
|
|
54
|
+
F -->|markdown| H["Markdown"]
|
|
55
|
+
F -->|json| I["JSON"]
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## License
|
|
59
|
+
|
|
60
|
+
MIT
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "carloforte"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Extract structured data from Excel files with minimal token usage"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Giovanni De Cillis", email = "giovanni.decillis@gmail.com" }
|
|
9
|
+
]
|
|
10
|
+
keywords = ["excel", "xlsx", "llm", "extraction", "json"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 3 - Alpha",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Topic :: Office/Business :: Financial :: Spreadsheet",
|
|
16
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
17
|
+
]
|
|
18
|
+
requires-python = ">=3.14"
|
|
19
|
+
# Runtime requirements only. pre-commit is a development tool that the package
# never imports; it stays declared in the "dev" dependency group.
dependencies = [
    "loguru>=0.7.3",
    "openpyxl>=3.1.5",
]
|
|
24
|
+
|
|
25
|
+
[project.scripts]
|
|
26
|
+
carloforte = "carloforte:main"
|
|
27
|
+
|
|
28
|
+
[build-system]
|
|
29
|
+
requires = ["uv_build>=0.11.7,<0.12.0"]
|
|
30
|
+
build-backend = "uv_build"
|
|
31
|
+
|
|
32
|
+
[dependency-groups]
|
|
33
|
+
dev = [
|
|
34
|
+
"commitizen>=4.16.4",
|
|
35
|
+
"pre-commit>=4.0",
|
|
36
|
+
"mypy>=1.16",
|
|
37
|
+
"openpyxl-stubs",
|
|
38
|
+
"pytest>=8.0",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
testpaths = ["tests"]
|
|
43
|
+
pythonpath = ["src"]
|
|
44
|
+
|
|
45
|
+
[tool.commitizen]
|
|
46
|
+
name = "cz_conventional_commits"
|
|
47
|
+
version = "0.1.0"
|
|
48
|
+
version_files = ["pyproject.toml:version"]
|
|
49
|
+
tag_format = "v$version"
|
|
50
|
+
update_changelog_on_bump = true
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from loguru import logger
|
|
3
|
+
from ._reader import load_workbook_sheets
|
|
4
|
+
from ._islands import find_islands
|
|
5
|
+
from ._serialiser import serialise
|
|
6
|
+
|
|
7
|
+
# Keep the package's log messages silent by default; main() re-enables them for CLI runs.
logger.disable("carloforte")
|
|
8
|
+
|
|
9
|
+
# Output formats accepted by extract() and offered as choices for the CLI --fmt option.
_FORMATS = ("csv", "markdown", "json")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def extract(
    path: str,
    sheets: list[str] | None = None,
    fmt: str = "csv",
) -> str:
    """Convert the sheets of an Excel workbook into a compact text form.

    Args:
        path: Path to the Excel file.
        sheets: Names of the sheets to process; ``None`` processes every
            sheet in the workbook.
        fmt: Output format, one of the values in ``_FORMATS``.

    Returns:
        The serialised content of the data islands found in the sheets.

    Raises:
        ValueError: If ``fmt`` is not one of ``_FORMATS``.
        KeyError: Raised by ``load_workbook_sheets`` when a requested sheet
            is not in the workbook.
    """
    # Reject an unknown format before the workbook is opened.
    if fmt not in _FORMATS:
        raise ValueError(f"Unknown format {fmt!r}. Choose: {_FORMATS}")
    logger.debug(
        "Loading {path}, sheets={sheets}, fmt={fmt}", path=path, sheets=sheets, fmt=fmt
    )
    grids = load_workbook_sheets(path, sheets)
    logger.debug("Loaded {n} sheet(s)", n=len(grids))
    sheet_islands = {name: find_islands(grid) for name, grid in grids.items()}
    # The counts are passed as a format argument: a comprehension written
    # inside the message string itself is never evaluated and would be
    # logged verbatim.
    island_counts = {name: len(found) for name, found in sheet_islands.items()}
    logger.debug("Found islands: {counts}", counts=island_counts)
    return serialise(path, sheet_islands, fmt)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def main() -> None:
    """Command-line entry point: parse the arguments and print the extraction.

    Logging for the package is switched on first, then the positional file
    argument and the ``--sheets`` / ``--fmt`` options are forwarded to
    ``extract`` and its result is written to standard output.
    """
    logger.enable("carloforte")

    cli = argparse.ArgumentParser(prog="carloforte")
    cli.add_argument("file", help="Path to the Excel file")
    cli.add_argument("--sheets", nargs="+", help="Sheet names to process")
    cli.add_argument("--fmt", default="csv", choices=_FORMATS, help="Output format")
    options = cli.parse_args()

    output = extract(options.file, sheets=options.sheets, fmt=options.fmt)
    print(output)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Names exported by `from carloforte import *`; main() is left out of the list.
__all__ = ["extract"]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
from collections import deque
|
|
2
|
+
from ._models import Island, CellValue
|
|
3
|
+
|
|
4
|
+
# A sheet as a list of rows, each row being a list of cell values.
CellGrid = list[list[CellValue]]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def detect_header_row(cells: list[list[CellValue]]) -> int:
    """Return the index of the first row that looks like a header.

    A row qualifies when it has at least one non-``None`` value and every
    non-``None`` value in it is a string. When no row qualifies (including
    when ``cells`` is empty) the result is ``0``.
    """

    def looks_like_header(row) -> bool:
        populated = [value for value in row if value is not None]
        return bool(populated) and all(isinstance(value, str) for value in populated)

    return next(
        (index for index, row in enumerate(cells) if looks_like_header(row)),
        0,
    )
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def find_islands(grid: CellGrid) -> list[Island]:
    """Split a sheet grid into islands of connected non-empty cells.

    Two cells belong to the same island when both are not ``None`` and they
    are linked through horizontal or vertical neighbours (diagonal contact
    does not connect). Each island is reported as the bounding rectangle of
    its connected cells, so the rectangle can contain ``None`` values and
    any other values that happen to lie inside it.

    Args:
        grid: Rows of cell values. Rows may differ in length; shorter rows
            are treated as if padded with ``None``. The grid is not modified.

    Returns:
        The islands in the order their first cell is met when scanning the
        grid row by row, left to right. ``top_row`` and ``left_col`` are
        1-indexed.
    """
    if not grid:
        return []

    max_row = len(grid)
    max_col = max((len(row) for row in grid), default=0)

    # Work on a padded copy so the caller's rows are left untouched.
    padded = [list(row) + [None] * (max_col - len(row)) for row in grid]

    visited = [[False] * max_col for _ in range(max_row)]
    islands: list[Island] = []

    for start_r in range(max_row):
        for start_c in range(max_col):
            if visited[start_r][start_c] or padded[start_r][start_c] is None:
                continue

            # Breadth-first search from this cell, growing the bounding box
            # as each connected cell is taken off the queue.
            min_r = max_r = start_r
            min_c = max_c = start_c
            queue: deque[tuple[int, int]] = deque([(start_r, start_c)])
            visited[start_r][start_c] = True

            while queue:
                r, c = queue.popleft()
                min_r, max_r = min(min_r, r), max(max_r, r)
                min_c, max_c = min(min_c, c), max(max_c, c)
                for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                    nr, nc = r + dr, c + dc
                    if (
                        0 <= nr < max_row
                        and 0 <= nc < max_col
                        and not visited[nr][nc]
                        and padded[nr][nc] is not None
                    ):
                        # Mark on enqueue so a cell is never queued twice.
                        visited[nr][nc] = True
                        queue.append((nr, nc))

            islands.append(
                Island(
                    top_row=min_r + 1,
                    left_col=min_c + 1,
                    cells=[
                        row[min_c : max_c + 1] for row in padded[min_r : max_r + 1]
                    ],
                )
            )

    return islands
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
|
|
3
|
+
# Scalar types a cell value may have; None stands for an empty cell.
# NOTE(review): date/time cells may arrive from openpyxl as datetime objects — confirm whether they belong in this alias.
CellValue = str | int | float | bool | None
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
class Island:
    """A rectangular block of cell values together with its sheet position."""

    # Row number of the block's first row, 1-indexed (Excel convention).
    top_row: int
    # Column number of the block's first column, 1-indexed.
    left_col: int
    # Cell values, one inner list per row.
    cells: list[list[CellValue]]

    @property
    def height(self) -> int:
        """Number of rows held in ``cells``."""
        row_count = len(self.cells)
        return row_count

    @property
    def width(self) -> int:
        """Length of the first row, or 0 when there are no rows."""
        if not self.cells:
            return 0
        first_row = self.cells[0]
        return len(first_row)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import openpyxl
|
|
2
|
+
from ._models import CellValue
|
|
3
|
+
|
|
4
|
+
# A sheet as a list of rows, each row being a list of cell values.
CellGrid = list[list[CellValue]]
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def load_workbook_sheets(
    path: str,
    sheets: list[str] | None = None,
) -> dict[str, CellGrid]:
    """Read the cell values of the requested sheets from an Excel workbook.

    The workbook is opened with ``read_only=True`` and ``data_only=True``.

    Args:
        path: Path to the Excel file.
        sheets: Names of the sheets to read; ``None`` reads every sheet in
            the workbook.

    Returns:
        A mapping from sheet name to its rows, each row a list of cell
        values as yielded by ``iter_rows(values_only=True)``.

    Raises:
        KeyError: If a requested sheet name is not in the workbook.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    # try/finally closes the workbook on every exit path, including errors
    # raised while the rows are being read.
    try:
        available = wb.sheetnames
        names = sheets if sheets is not None else list(available)
        result: dict[str, CellGrid] = {}
        for name in names:
            if name not in available:
                raise KeyError(f"Sheet '{name}' not found in {path!r}")
            ws = wb[name]
            result[name] = [list(row) for row in ws.iter_rows(values_only=True)]
        return result
    finally:
        wb.close()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import io
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from ._models import Island, CellValue
|
|
6
|
+
from ._islands import detect_header_row
|
|
7
|
+
|
|
8
|
+
# A sheet as a list of rows, each row being a list of cell values.
CellGrid = list[list[CellValue]]
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def col_letter(n: int) -> str:
    """Translate a 1-indexed column number into Excel column letters.

    1 maps to ``"A"``, 26 to ``"Z"``, 27 to ``"AA"`` and so on. Values of
    zero or below yield an empty string.
    """
    letters: list[str] = []
    remaining = n
    while remaining > 0:
        # Shift by one before dividing: the letters have no zero digit.
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(65 + offset))
    # Letters were collected least-significant first.
    return "".join(reversed(letters))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def range_str(island: Island) -> str:
    """Format the rectangle covered by ``island`` as an A1-style range.

    The result has the form ``"<top-left>:<bottom-right>"``, for example
    ``"B2:D5"``.
    """
    first_col = island.left_col
    first_row = island.top_row
    last_col = first_col + island.width - 1
    last_row = first_row + island.height - 1
    return f"{col_letter(first_col)}{first_row}:{col_letter(last_col)}{last_row}"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _cell(v: CellValue) -> str:
|
|
26
|
+
return "" if v is None else str(v)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _safe_id(name: str) -> str:
|
|
30
|
+
return re.sub(r"[^a-zA-Z0-9]", "_", name)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _to_csv(sheet_islands: dict[str, list[Island]]) -> str:
    """Serialise every island as a CSV block preceded by a comment line.

    Each island produces a ``# <sheet> · <range>`` line, its rows as CSV
    records, and a blank separator line. Sheets without islands produce no
    output.

    Args:
        sheet_islands: Islands found per sheet, keyed by sheet name.

    Returns:
        The concatenated blocks, using ``\n`` line endings throughout.
    """
    buf = io.StringIO()
    # csv.writer ends records with "\r\n" by default; the header and
    # separator lines below use "\n", so set the terminator to match and
    # avoid mixed line endings in the output.
    writer = csv.writer(buf, lineterminator="\n")
    for sheet_name, islands in sheet_islands.items():
        for island in islands:
            buf.write(f"# {sheet_name} · {range_str(island)}\n")
            writer.writerows([_cell(v) for v in row] for row in island.cells)
            buf.write("\n")
    return buf.getvalue()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _to_markdown(path: str, sheet_islands: dict[str, list[Island]]) -> str:
    """Serialise every island as a Markdown table under per-sheet headings.

    The document starts with a ``# <file name>`` title, followed by a
    ``## Sheet: <name>`` section per sheet. Each island becomes a
    ``### <table id> · <range>`` heading and a table whose header is the row
    chosen by ``detect_header_row``.

    Args:
        path: Path of the workbook; only its final component is shown.
        sheet_islands: Islands found per sheet, keyed by sheet name.

    Returns:
        The Markdown document as a single string.
    """
    # Function-scope import instead of the __import__("pathlib") call.
    from pathlib import Path

    def md_cell(v: CellValue) -> str:
        # Escape the column separator, and keep the cell on a single line:
        # a raw line break would cut the table row in two.
        text = _cell(v).replace("|", "\\|")
        for line_break in ("\r\n", "\r", "\n"):
            text = text.replace(line_break, "<br>")
        return text

    sections = [f"# {Path(path).name}\n"]
    for sheet_name, islands in sheet_islands.items():
        sections.append(f"## Sheet: {sheet_name}\n")
        if not islands:
            sections.append("_No data islands found._\n")
            continue
        for idx, island in enumerate(islands):
            table_id = f"{_safe_id(sheet_name)}_t{idx}"
            header_idx = detect_header_row(island.cells)
            headers = [md_cell(v) for v in island.cells[header_idx]]
            # Rows above the detected header are kept and listed as ordinary
            # data rows, ahead of the rows that follow the header.
            rows = island.cells[:header_idx] + island.cells[header_idx + 1 :]
            sections.append(f"### {table_id} · {range_str(island)}")
            sections.append("| " + " | ".join(headers) + " |")
            sections.append("| " + " | ".join("---" for _ in headers) + " |")
            for row in rows:
                # Pad short rows so every table row has one cell per header.
                padded = list(row) + [None] * (len(headers) - len(row))
                sections.append("| " + " | ".join(md_cell(v) for v in padded) + " |")
            sections.append("")
    return "\n".join(sections)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _to_json(sheet_islands: dict[str, list[Island]]) -> str:
|
|
74
|
+
result = {}
|
|
75
|
+
for sheet_name, islands in sheet_islands.items():
|
|
76
|
+
result[sheet_name] = [{"range": range_str(i), "rows": i.cells} for i in islands]
|
|
77
|
+
return json.dumps(result, default=str)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def serialise(
|
|
81
|
+
path: str,
|
|
82
|
+
sheet_islands: dict[str, list[Island]],
|
|
83
|
+
fmt: str,
|
|
84
|
+
) -> str:
|
|
85
|
+
if fmt == "csv":
|
|
86
|
+
return _to_csv(sheet_islands)
|
|
87
|
+
if fmt == "markdown":
|
|
88
|
+
return _to_markdown(path, sheet_islands)
|
|
89
|
+
if fmt == "json":
|
|
90
|
+
return _to_json(sheet_islands)
|
|
91
|
+
raise ValueError(f"Unknown format {fmt!r}. Choose: 'csv', 'markdown', 'json'")
|