carloforte 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,79 @@
1
+ Metadata-Version: 2.3
2
+ Name: carloforte
3
+ Version: 0.1.0
4
+ Summary: Extract structured data from Excel files with minimal token usage
5
+ Keywords: excel,xlsx,llm,extraction,json
6
+ Author: Giovanni De Cillis
7
+ Author-email: Giovanni De Cillis <giovanni.decillis@gmail.com>
8
+ License: MIT
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Requires-Dist: loguru>=0.7.3
15
+ Requires-Dist: openpyxl>=3.1.5
16
+ Requires-Dist: pre-commit>=4.6.0
17
+ Requires-Python: >=3.14
18
+ Description-Content-Type: text/markdown
19
+
20
+ # carloforte
21
+
22
+ Extract structured data from Excel files with minimal token usage.
23
+
24
+ carloforte uses an island-detection algorithm to convert Excel sheets into a compact intermediate representation (CSV, Markdown, or JSON), making it efficient to pass spreadsheet data to LLMs.
25
+
26
+ ## Installation
27
+ TBC
28
+
29
+ ## Usage
30
+
31
+ ```python
32
+ import carloforte
33
+
34
+ # Extract all sheets as CSV (default)
35
+ text = carloforte.extract("data.xlsx")
36
+
37
+ # Extract specific sheets as Markdown
38
+ text = carloforte.extract("data.xlsx", sheets=["Revenue", "Costs"], fmt="markdown")
39
+
40
+ # Extract as JSON
41
+ text = carloforte.extract("data.xlsx", fmt="json")
42
+ ```
43
+
44
+ ### Formats
45
+
46
+ | Format | Best for |
47
+ |--------|----------|
48
+ | `csv` | Compact, low token count |
49
+ | `markdown` | Readable, good for LLM prompts |
50
+ | `json` | Structured output, programmatic use |
51
+
52
+ ### CLI
53
+
54
+ ```bash
55
+ carloforte data.xlsx --fmt markdown
56
+ carloforte data.xlsx --sheets Revenue Costs --fmt json
57
+ ```
58
+
59
+ ## How it works
60
+
61
+ Excel sheets often contain multiple disconnected tables, empty rows, and metadata scattered around. carloforte detects each contiguous block of data ("island") independently and serialises only what matters — reducing token usage by 60–75% compared to passing raw Excel content to an LLM.
62
+
63
+ ## Architecture
64
+
65
+ ```mermaid
66
+ flowchart LR
67
+ A["📄 .xlsx file"] --> B["_reader\nload sheets"]
68
+ B --> C["dict[sheet → grid]"]
69
+ C --> D["_islands\nBFS detection"]
70
+ D --> E["dict[sheet → islands]"]
71
+ E --> F{"fmt?"}
72
+ F -->|csv| G["CSV"]
73
+ F -->|markdown| H["Markdown"]
74
+ F -->|json| I["JSON"]
75
+ ```
76
+
77
+ ## License
78
+
79
+ MIT
@@ -0,0 +1,60 @@
1
+ # carloforte
2
+
3
+ Extract structured data from Excel files with minimal token usage.
4
+
5
+ carloforte uses an island-detection algorithm to convert Excel sheets into a compact intermediate representation (CSV, Markdown, or JSON), making it efficient to pass spreadsheet data to LLMs.
6
+
7
+ ## Installation
8
+ TBC
9
+
10
+ ## Usage
11
+
12
+ ```python
13
+ import carloforte
14
+
15
+ # Extract all sheets as CSV (default)
16
+ text = carloforte.extract("data.xlsx")
17
+
18
+ # Extract specific sheets as Markdown
19
+ text = carloforte.extract("data.xlsx", sheets=["Revenue", "Costs"], fmt="markdown")
20
+
21
+ # Extract as JSON
22
+ text = carloforte.extract("data.xlsx", fmt="json")
23
+ ```
24
+
25
+ ### Formats
26
+
27
+ | Format | Best for |
28
+ |--------|----------|
29
+ | `csv` | Compact, low token count |
30
+ | `markdown` | Readable, good for LLM prompts |
31
+ | `json` | Structured output, programmatic use |
32
+
33
+ ### CLI
34
+
35
+ ```bash
36
+ carloforte data.xlsx --fmt markdown
37
+ carloforte data.xlsx --sheets Revenue Costs --fmt json
38
+ ```
39
+
40
+ ## How it works
41
+
42
+ Excel sheets often contain multiple disconnected tables, empty rows, and metadata scattered around. carloforte detects each contiguous block of data ("island") independently and serialises only what matters — reducing token usage by 60–75% compared to passing raw Excel content to an LLM.
43
+
44
+ ## Architecture
45
+
46
+ ```mermaid
47
+ flowchart LR
48
+ A["📄 .xlsx file"] --> B["_reader\nload sheets"]
49
+ B --> C["dict[sheet → grid]"]
50
+ C --> D["_islands\nBFS detection"]
51
+ D --> E["dict[sheet → islands]"]
52
+ E --> F{"fmt?"}
53
+ F -->|csv| G["CSV"]
54
+ F -->|markdown| H["Markdown"]
55
+ F -->|json| I["JSON"]
56
+ ```
57
+
58
+ ## License
59
+
60
+ MIT
@@ -0,0 +1,50 @@
1
+ [project]
2
+ name = "carloforte"
3
+ version = "0.1.0"
4
+ description = "Extract structured data from Excel files with minimal token usage"
5
+ readme = "README.md"
6
+ license = { text = "MIT" }
7
+ authors = [
8
+ { name = "Giovanni De Cillis", email = "giovanni.decillis@gmail.com" }
9
+ ]
10
+ keywords = ["excel", "xlsx", "llm", "extraction", "json"]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "Intended Audience :: Developers",
14
+ "Programming Language :: Python :: 3",
15
+ "Topic :: Office/Business :: Financial :: Spreadsheet",
16
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
17
+ ]
18
+ requires-python = ">=3.14"
19
+ dependencies = [
20
+ "loguru>=0.7.3",
21
+ "openpyxl>=3.1.5",
22
+ "pre-commit>=4.6.0",
23
+ ]
24
+
25
+ [project.scripts]
26
+ carloforte = "carloforte:main"
27
+
28
+ [build-system]
29
+ requires = ["uv_build>=0.11.7,<0.12.0"]
30
+ build-backend = "uv_build"
31
+
32
+ [dependency-groups]
33
+ dev = [
34
+ "commitizen>=4.16.4",
35
+ "pre-commit>=4.0",
36
+ "mypy>=1.16",
37
+ "openpyxl-stubs",
38
+ "pytest>=8.0",
39
+ ]
40
+
41
+ [tool.pytest.ini_options]
42
+ testpaths = ["tests"]
43
+ pythonpath = ["src"]
44
+
45
+ [tool.commitizen]
46
+ name = "cz_conventional_commits"
47
+ version = "0.1.0"
48
+ version_files = ["pyproject.toml:version"]
49
+ tag_format = "v$version"
50
+ update_changelog_on_bump = true
@@ -0,0 +1,41 @@
1
+ import argparse
2
+ from loguru import logger
3
+ from ._reader import load_workbook_sheets
4
+ from ._islands import find_islands
5
+ from ._serialiser import serialise
6
+
7
# Keep this package's log output silent on import; main() re-enables it
# for command-line runs.
logger.disable("carloforte")

# Output formats accepted by extract() and offered as the CLI's --fmt choices.
_FORMATS = ("csv", "markdown", "json")
10
+
11
+
12
def extract(
    path: str,
    sheets: list[str] | None = None,
    fmt: str = "csv",
) -> str:
    """Load an Excel workbook and render its data islands as text.

    Args:
        path: Path to the workbook, passed through to the reader.
        sheets: Names of the sheets to process; ``None`` processes every sheet.
        fmt: Output format, one of ``"csv"``, ``"markdown"`` or ``"json"``.

    Returns:
        The serialised representation of every island found in the
        selected sheets.

    Raises:
        ValueError: If ``fmt`` is not one of the supported formats.
        KeyError: If a requested sheet does not exist in the workbook
            (raised by the reader).
    """
    # Validate before touching the file so a typo in ``fmt`` fails fast.
    if fmt not in _FORMATS:
        raise ValueError(f"Unknown format {fmt!r}. Choose: {_FORMATS}")
    logger.debug(
        "Loading {path}, sheets={sheets}, fmt={fmt}", path=path, sheets=sheets, fmt=fmt
    )
    grids = load_workbook_sheets(path, sheets)
    logger.debug("Loaded {n} sheet(s)", n=len(grids))
    sheet_islands = {name: find_islands(grid) for name, grid in grids.items()}
    # The counts must be passed as a formatting argument: a brace template
    # logged without arguments is emitted verbatim instead of being evaluated.
    logger.debug(
        "Found islands: {counts}",
        counts={name: len(isl) for name, isl in sheet_islands.items()},
    )
    return serialise(path, sheet_islands, fmt)
29
+
30
+
31
+ def main() -> None:
32
+ logger.enable("carloforte")
33
+ parser = argparse.ArgumentParser(prog="carloforte")
34
+ parser.add_argument("file", help="Path to the Excel file")
35
+ parser.add_argument("--sheets", nargs="+", help="Sheet names to process")
36
+ parser.add_argument("--fmt", default="csv", choices=_FORMATS, help="Output format")
37
+ args = parser.parse_args()
38
+ print(extract(args.file, sheets=args.sheets, fmt=args.fmt))
39
+
40
+
41
# Only extract() is exported by `from carloforte import *`; main() stays
# reachable as an attribute for the console-script entry point.
__all__ = ["extract"]
@@ -0,0 +1,70 @@
1
+ from collections import deque
2
+ from ._models import Island, CellValue
3
+
4
# A sheet as a list of rows, each row a list of cell values (None = empty cell).
CellGrid = list[list[CellValue]]
5
+
6
+
7
def detect_header_row(cells: list[list[CellValue]]) -> int:
    """Return the index of the first row that looks like a header.

    A row qualifies when it has at least one non-empty cell and every
    non-empty cell is a string. Falls back to 0 when no row qualifies
    (including when ``cells`` is empty).
    """

    def looks_like_header(row) -> bool:
        has_text = False
        for value in row:
            if value is None:
                continue
            if not isinstance(value, str):
                return False
            has_text = True
        return has_text

    candidates = (
        index for index, row in enumerate(cells) if looks_like_header(row)
    )
    return next(candidates, 0)
13
+
14
+
15
def find_islands(grid: CellGrid) -> list[Island]:
    """Split a sheet into islands of connected non-empty cells.

    Cells are connected when they touch horizontally or vertically
    (diagonal contact does not connect). Each island is reported as the
    bounding rectangle of its connected component, so the rectangle may
    also contain empty cells or cells of a neighbouring component that
    fall inside it.

    Args:
        grid: Rows of cell values; rows may have different lengths and
            ``None`` marks an empty cell. The grid is not modified.

    Returns:
        Islands in row-major order of their first-encountered cell, with
        1-indexed ``top_row`` / ``left_col``. Empty list when the grid has
        no non-empty cell.
    """
    if not grid:
        return []

    max_row = len(grid)
    max_col = max((len(row) for row in grid), default=0)

    # Work on a rectangular copy so the caller's rows are never padded
    # in place.
    padded = [list(row) + [None] * (max_col - len(row)) for row in grid]

    visited = [[False] * max_col for _ in range(max_row)]
    islands: list[Island] = []

    for start_r in range(max_row):
        for start_c in range(max_col):
            if visited[start_r][start_c] or padded[start_r][start_c] is None:
                continue

            # Breadth-first flood fill from this seed, tracking the
            # bounding box as cells are dequeued.
            visited[start_r][start_c] = True
            queue: deque[tuple[int, int]] = deque([(start_r, start_c)])
            min_r = max_r = start_r
            min_c = max_c = start_c

            while queue:
                r, c = queue.popleft()
                min_r = min(min_r, r)
                max_r = max(max_r, r)
                min_c = min(min_c, c)
                max_c = max(max_c, c)
                for dr, dc in ((-1, 0), (1, 0), (0, -1), (0, 1)):
                    nr, nc = r + dr, c + dc
                    if (
                        0 <= nr < max_row
                        and 0 <= nc < max_col
                        and not visited[nr][nc]
                        and padded[nr][nc] is not None
                    ):
                        # Mark on enqueue so a cell is never queued twice.
                        visited[nr][nc] = True
                        queue.append((nr, nc))

            islands.append(
                Island(
                    # Convert 0-based grid indices to 1-based sheet coordinates.
                    top_row=min_r + 1,
                    left_col=min_c + 1,
                    cells=[
                        [padded[r][c] for c in range(min_c, max_c + 1)]
                        for r in range(min_r, max_r + 1)
                    ],
                )
            )

    return islands
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass
2
+
3
# Type of a single cell value; None represents an empty cell.
CellValue = str | int | float | bool | None
4
+
5
+
6
@dataclass
class Island:
    """A rectangular block of cells together with its position in the sheet."""

    # 1-indexed row of the block's first row (Excel convention).
    top_row: int
    # 1-indexed column of the block's first column.
    left_col: int
    # Cell values, one inner list per row.
    cells: list[list[CellValue]]

    @property
    def height(self) -> int:
        """Number of rows in the block."""
        row_count = len(self.cells)
        return row_count

    @property
    def width(self) -> int:
        """Number of columns, measured on the first row; 0 when there are no rows."""
        if not self.cells:
            return 0
        first_row = self.cells[0]
        return len(first_row)
@@ -0,0 +1,21 @@
1
+ import openpyxl
2
+ from ._models import CellValue
3
+
4
# A sheet as a list of rows, each row a list of cell values.
CellGrid = list[list[CellValue]]
5
+
6
+
7
def load_workbook_sheets(
    path: str,
    sheets: list[str] | None = None,
) -> dict[str, CellGrid]:
    """Read the cell values of the selected sheets of a workbook.

    The workbook is opened with ``read_only=True`` and ``data_only=True``
    and is always closed before returning, including on errors.

    Args:
        path: Path to the workbook file.
        sheets: Sheet names to read, in the order given; ``None`` reads
            every sheet in workbook order.

    Returns:
        Mapping of sheet name to its rows, each row a list of cell values.

    Raises:
        KeyError: If a requested sheet is not present in the workbook.
    """
    wb = openpyxl.load_workbook(path, read_only=True, data_only=True)
    try:
        available = wb.sheetnames
        names = sheets if sheets is not None else list(available)
        result: dict[str, CellGrid] = {}
        for name in names:
            if name not in available:
                raise KeyError(f"Sheet '{name}' not found in {path!r}")
            ws = wb[name]
            # values_only yields tuples; convert to lists to match CellGrid.
            result[name] = [list(row) for row in ws.iter_rows(values_only=True)]
        return result
    finally:
        # Release the file handle even if reading a sheet fails part-way.
        wb.close()
@@ -0,0 +1,91 @@
1
+ import csv
2
+ import io
3
+ import json
4
+ import re
5
+ from ._models import Island, CellValue
6
+ from ._islands import detect_header_row
7
+
8
# A sheet as a list of rows, each row a list of cell values.
# NOTE(review): not referenced by the functions visible in this module.
CellGrid = list[list[CellValue]]
9
+
10
+
11
def col_letter(n: int) -> str:
    """Convert a 1-based column number to spreadsheet letters.

    For example 1 gives "A", 26 gives "Z" and 27 gives "AA". Values of
    zero or below give an empty string.
    """
    letters: list[str] = []
    remaining = n
    while remaining > 0:
        # Bijective base 26: shift by one so that 26 maps to "Z", not "A0".
        remaining, offset = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + offset))
    # Digits were produced least-significant first.
    return "".join(reversed(letters))
17
+
18
+
19
def range_str(island: Island) -> str:
    """Return the island's extent in A1 notation, e.g. ``"B2:D5"``."""
    first_col = island.left_col
    first_row = island.top_row
    # The last row/column are inclusive, hence the -1.
    last_col = first_col + island.width - 1
    last_row = first_row + island.height - 1
    start = f"{col_letter(first_col)}{first_row}"
    end = f"{col_letter(last_col)}{last_row}"
    return f"{start}:{end}"
23
+
24
+
25
+ def _cell(v: CellValue) -> str:
26
+ return "" if v is None else str(v)
27
+
28
+
29
+ def _safe_id(name: str) -> str:
30
+ return re.sub(r"[^a-zA-Z0-9]", "_", name)
31
+
32
+
33
+ def _to_csv(sheet_islands: dict[str, list[Island]]) -> str:
34
+ buf = io.StringIO()
35
+ writer = csv.writer(buf)
36
+ for sheet_name, islands in sheet_islands.items():
37
+ for island in islands:
38
+ buf.write(f"# {sheet_name} ยท {range_str(island)}\n")
39
+ for row in island.cells:
40
+ writer.writerow([_cell(v) for v in row])
41
+ buf.write("\n")
42
+ return buf.getvalue()
43
+
44
+
45
+ def _to_markdown(path: str, sheet_islands: dict[str, list[Island]]) -> str:
46
+ def md_cell(v: CellValue) -> str:
47
+ return _cell(v).replace("|", "\\|")
48
+
49
+ filename = __import__("pathlib").Path(path).name
50
+ sections = [f"# {filename}\n"]
51
+ for sheet_name, islands in sheet_islands.items():
52
+ sections.append(f"## Sheet: {sheet_name}\n")
53
+ if not islands:
54
+ sections.append("_No data islands found._\n")
55
+ continue
56
+ for idx, island in enumerate(islands):
57
+ table_id = f"{_safe_id(sheet_name)}_t{idx}"
58
+ header_idx = detect_header_row(island.cells)
59
+ headers = [md_cell(v) for v in island.cells[header_idx]]
60
+ # rows before the header (pre-header data) + rows after
61
+ pre_header = island.cells[:header_idx]
62
+ rows = pre_header + island.cells[header_idx + 1 :]
63
+ sections.append(f"### {table_id} ยท {range_str(island)}")
64
+ sections.append("| " + " | ".join(headers) + " |")
65
+ sections.append("| " + " | ".join("---" for _ in headers) + " |")
66
+ for row in rows:
67
+ padded = list(row) + [None] * (len(headers) - len(row))
68
+ sections.append("| " + " | ".join(md_cell(v) for v in padded) + " |")
69
+ sections.append("")
70
+ return "\n".join(sections)
71
+
72
+
73
+ def _to_json(sheet_islands: dict[str, list[Island]]) -> str:
74
+ result = {}
75
+ for sheet_name, islands in sheet_islands.items():
76
+ result[sheet_name] = [{"range": range_str(i), "rows": i.cells} for i in islands]
77
+ return json.dumps(result, default=str)
78
+
79
+
80
+ def serialise(
81
+ path: str,
82
+ sheet_islands: dict[str, list[Island]],
83
+ fmt: str,
84
+ ) -> str:
85
+ if fmt == "csv":
86
+ return _to_csv(sheet_islands)
87
+ if fmt == "markdown":
88
+ return _to_markdown(path, sheet_islands)
89
+ if fmt == "json":
90
+ return _to_json(sheet_islands)
91
+ raise ValueError(f"Unknown format {fmt!r}. Choose: 'csv', 'markdown', 'json'")