xlstruct 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. xlstruct-0.1.0/.github/workflows/ci.yml +62 -0
  2. xlstruct-0.1.0/.github/workflows/codeql.yml +29 -0
  3. xlstruct-0.1.0/.github/workflows/publish.yml +61 -0
  4. xlstruct-0.1.0/.gitignore +39 -0
  5. xlstruct-0.1.0/.pre-commit-config.yaml +16 -0
  6. xlstruct-0.1.0/CHANGELOG.md +22 -0
  7. xlstruct-0.1.0/CONTRIBUTING.md +19 -0
  8. xlstruct-0.1.0/LICENSE +21 -0
  9. xlstruct-0.1.0/PKG-INFO +374 -0
  10. xlstruct-0.1.0/README.md +315 -0
  11. xlstruct-0.1.0/examples/basic_extraction.py +82 -0
  12. xlstruct-0.1.0/examples/cloud_storage.py +81 -0
  13. xlstruct-0.1.0/examples/custom_instructions.py +102 -0
  14. xlstruct-0.1.0/pyproject.toml +104 -0
  15. xlstruct-0.1.0/src/xlstruct/__init__.py +28 -0
  16. xlstruct-0.1.0/src/xlstruct/_tokens.py +49 -0
  17. xlstruct-0.1.0/src/xlstruct/cli.py +86 -0
  18. xlstruct-0.1.0/src/xlstruct/codegen/__init__.py +0 -0
  19. xlstruct-0.1.0/src/xlstruct/codegen/backends/__init__.py +1 -0
  20. xlstruct-0.1.0/src/xlstruct/codegen/backends/base.py +31 -0
  21. xlstruct-0.1.0/src/xlstruct/codegen/backends/docker.py +273 -0
  22. xlstruct-0.1.0/src/xlstruct/codegen/backends/subprocess.py +111 -0
  23. xlstruct-0.1.0/src/xlstruct/codegen/engine.py +192 -0
  24. xlstruct-0.1.0/src/xlstruct/codegen/executor.py +170 -0
  25. xlstruct-0.1.0/src/xlstruct/codegen/orchestrator.py +322 -0
  26. xlstruct-0.1.0/src/xlstruct/codegen/schema_utils.py +83 -0
  27. xlstruct-0.1.0/src/xlstruct/codegen/validation.py +276 -0
  28. xlstruct-0.1.0/src/xlstruct/config.py +160 -0
  29. xlstruct-0.1.0/src/xlstruct/encoder/__init__.py +0 -0
  30. xlstruct-0.1.0/src/xlstruct/encoder/_formatting.py +404 -0
  31. xlstruct-0.1.0/src/xlstruct/encoder/base.py +21 -0
  32. xlstruct-0.1.0/src/xlstruct/encoder/compressed.py +174 -0
  33. xlstruct-0.1.0/src/xlstruct/exceptions.py +27 -0
  34. xlstruct-0.1.0/src/xlstruct/extraction/__init__.py +0 -0
  35. xlstruct-0.1.0/src/xlstruct/extraction/chunking.py +97 -0
  36. xlstruct-0.1.0/src/xlstruct/extraction/engine.py +74 -0
  37. xlstruct-0.1.0/src/xlstruct/extractor.py +458 -0
  38. xlstruct-0.1.0/src/xlstruct/prompts/__init__.py +0 -0
  39. xlstruct-0.1.0/src/xlstruct/prompts/codegen.py +654 -0
  40. xlstruct-0.1.0/src/xlstruct/prompts/extraction.py +45 -0
  41. xlstruct-0.1.0/src/xlstruct/prompts/system.py +22 -0
  42. xlstruct-0.1.0/src/xlstruct/py.typed +0 -0
  43. xlstruct-0.1.0/src/xlstruct/reader/__init__.py +0 -0
  44. xlstruct-0.1.0/src/xlstruct/reader/base.py +32 -0
  45. xlstruct-0.1.0/src/xlstruct/reader/csv_reader.py +109 -0
  46. xlstruct-0.1.0/src/xlstruct/reader/hybrid_reader.py +296 -0
  47. xlstruct-0.1.0/src/xlstruct/schemas/__init__.py +0 -0
  48. xlstruct-0.1.0/src/xlstruct/schemas/codegen.py +82 -0
  49. xlstruct-0.1.0/src/xlstruct/schemas/core.py +80 -0
  50. xlstruct-0.1.0/src/xlstruct/schemas/usage.py +100 -0
  51. xlstruct-0.1.0/src/xlstruct/storage.py +41 -0
  52. xlstruct-0.1.0/tests/__init__.py +0 -0
  53. xlstruct-0.1.0/tests/conftest.py +107 -0
  54. xlstruct-0.1.0/tests/fixtures/__init__.py +0 -0
  55. xlstruct-0.1.0/tests/fixtures/generate.py +1795 -0
  56. xlstruct-0.1.0/tests/fixtures/schemas.py +341 -0
  57. xlstruct-0.1.0/tests/integration/__init__.py +0 -0
  58. xlstruct-0.1.0/tests/integration/conftest.py +143 -0
  59. xlstruct-0.1.0/tests/integration/docker-compose.yml +36 -0
  60. xlstruct-0.1.0/tests/integration/test_extractor_cloud.py +163 -0
  61. xlstruct-0.1.0/tests/integration/test_storage.py +107 -0
  62. xlstruct-0.1.0/tests/test_chunking.py +163 -0
  63. xlstruct-0.1.0/tests/test_cli.py +146 -0
  64. xlstruct-0.1.0/tests/test_codegen.py +441 -0
  65. xlstruct-0.1.0/tests/test_config.py +42 -0
  66. xlstruct-0.1.0/tests/test_encoder.py +91 -0
  67. xlstruct-0.1.0/tests/test_extractor.py +207 -0
  68. xlstruct-0.1.0/tests/test_formatting.py +92 -0
  69. xlstruct-0.1.0/tests/test_models.py +55 -0
  70. xlstruct-0.1.0/tests/test_reader.py +296 -0
  71. xlstruct-0.1.0/tests/test_tokens.py +16 -0
  72. xlstruct-0.1.0/uv.lock +3085 -0
@@ -0,0 +1,62 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: astral-sh/setup-uv@v5
15
+ with:
16
+ enable-cache: true
17
+ - uses: actions/setup-python@v5
18
+ with:
19
+ python-version: "3.11"
20
+ - run: uv sync --group dev --all-extras
21
+ - name: Ruff lint
22
+ run: uv run ruff check src/ tests/
23
+ - name: Mypy
24
+ run: uv run mypy src/xlstruct/
25
+
26
+ test:
27
+ needs: [lint]
28
+ runs-on: ubuntu-latest
29
+ strategy:
30
+ matrix:
31
+ python-version: ["3.11", "3.12"]
32
+ steps:
33
+ - uses: actions/checkout@v4
34
+ - uses: astral-sh/setup-uv@v5
35
+ with:
36
+ enable-cache: true
37
+ - uses: actions/setup-python@v5
38
+ with:
39
+ python-version: ${{ matrix.python-version }}
40
+ - run: uv sync --group dev
41
+ - name: Run tests
42
+ run: uv run pytest tests/ -v --ignore=tests/integration --cov=xlstruct --cov-report=term-missing --cov-fail-under=60
43
+
44
+ integration:
45
+ runs-on: ubuntu-latest
46
+ needs: [lint, test]
47
+ steps:
48
+ - uses: actions/checkout@v4
49
+ - uses: astral-sh/setup-uv@v5
50
+ with:
51
+ enable-cache: true
52
+ - uses: actions/setup-python@v5
53
+ with:
54
+ python-version: "3.11"
55
+ - run: uv sync --group dev --extra all-storage
56
+ - name: Start emulators
57
+ run: docker compose -f tests/integration/docker-compose.yml up -d --wait
58
+ - name: Run integration tests
59
+ run: uv run pytest tests/integration/ -v
60
+ - name: Stop emulators
61
+ if: always()
62
+ run: docker compose -f tests/integration/docker-compose.yml down
@@ -0,0 +1,29 @@
1
+ name: CodeQL
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+ schedule:
9
+ - cron: "0 6 * * 1" # Every Monday at 06:00 UTC
10
+
11
+ jobs:
12
+ analyze:
13
+ name: Analyze
14
+ runs-on: ubuntu-latest
15
+ permissions:
16
+ security-events: write
17
+ contents: read
18
+
19
+ steps:
20
+ - name: Checkout
21
+ uses: actions/checkout@v4
22
+
23
+ - name: Initialize CodeQL
24
+ uses: github/codeql-action/init@v3
25
+ with:
26
+ languages: python
27
+
28
+ - name: Perform CodeQL Analysis
29
+ uses: github/codeql-action/analyze@v3
@@ -0,0 +1,61 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ test:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+ - uses: astral-sh/setup-uv@v5
13
+ with:
14
+ enable-cache: true
15
+ - uses: actions/setup-python@v5
16
+ with:
17
+ python-version: "3.11"
18
+ - run: uv sync --group dev
19
+ - name: Ruff lint
20
+ run: uv run ruff check src/ tests/
21
+ - name: Mypy
22
+ run: uv run mypy src/xlstruct/
23
+ - name: Run tests
24
+ run: uv run pytest tests/ -v --ignore=tests/integration
25
+
26
+ verify-version:
27
+ runs-on: ubuntu-latest
28
+ steps:
29
+ - uses: actions/checkout@v4
30
+ - name: Check version matches tag
31
+ run: |
32
+ TAG_VERSION="${GITHUB_REF_NAME#v}"
33
+ PKG_VERSION=$(python -c "
34
+ import re
35
+ with open('pyproject.toml') as f:
36
+ match = re.search(r'version\s*=\s*\"(.+?)\"', f.read())
37
+ print(match.group(1))
38
+ ")
39
+ if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
40
+ echo "ERROR: Tag version ($TAG_VERSION) != pyproject.toml version ($PKG_VERSION)"
41
+ exit 1
42
+ fi
43
+ echo "Version check passed: $TAG_VERSION"
44
+
45
+ publish:
46
+ runs-on: ubuntu-latest
47
+ needs: [test, verify-version]
48
+ permissions:
49
+ id-token: write
50
+ steps:
51
+ - uses: actions/checkout@v4
52
+ - uses: astral-sh/setup-uv@v5
53
+ with:
54
+ enable-cache: true
55
+ - uses: actions/setup-python@v5
56
+ with:
57
+ python-version: "3.11"
58
+ - name: Build package
59
+ run: uv build
60
+ - name: Publish to PyPI
61
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,39 @@
1
+ # * Environment
2
+ .env
3
+ scripts/.env
4
+ **/prd
5
+
6
+ # * Python
7
+ __pycache__/
8
+ *.py[cod]
9
+ *.egg-info/
10
+ dist/
11
+ build/
12
+ .venv/
13
+ data/
14
+ results/
15
+ generated/
16
+ *.json
17
+
18
+ # * IDE
19
+ .idea/
20
+ .vscode/
21
+ *.swp
22
+
23
+ # * Testing
24
+ .pytest_cache/
25
+ .mypy_cache/
26
+ .ruff_cache/
27
+ tests/fixtures/data/
28
+
29
+ # * Local scripts (project-specific, not for public repo)
30
+ scripts/
31
+
32
+ # * Project instructions (not for public repo)
33
+ CLAUDE.md
34
+
35
+ # * MAC OS
36
+ .DS_Store
37
+
38
+ # * MCP
39
+ .serena/
@@ -0,0 +1,16 @@
1
+ repos:
2
+ - repo: local
3
+ hooks:
4
+ - id: ruff
5
+ name: ruff
6
+ entry: uv run ruff check src/ tests/
7
+ language: system
8
+ pass_filenames: false
9
+ types: [python]
10
+
11
+ - id: mypy
12
+ name: mypy
13
+ entry: uv run mypy src/xlstruct/
14
+ language: system
15
+ pass_filenames: false
16
+ types: [python]
@@ -0,0 +1,22 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.0] - 2026-03-10
9
+
10
+ ### Added
11
+
12
+ - Schema-driven Excel extraction via Pydantic models
13
+ - Two extraction modes: direct LLM extraction and code generation
14
+ - `HybridReader` — calamine (Rust) + openpyxl dual-pass reader
15
+ - `CompressedEncoder` — token-aware sheet encoding with sampling
16
+ - `ChunkSplitter` — automatic chunking for large sheets
17
+ - Code generation pipeline with self-correction (Analyzer → Parser → Transformer)
18
+ - Sandboxed script execution (`SubprocessBackend`) with blocked imports and stripped credentials
19
+ - Multi-provider LLM support via Instructor (OpenAI, Anthropic, Gemini)
20
+ - Cloud storage support via fsspec (S3, Azure Blob, GCS)
21
+ - Async-first API with `*_sync()` convenience wrappers
22
+ - Typer CLI (`xlstruct extract`)
@@ -0,0 +1,19 @@
1
+ # Contributing to XLStruct
2
+
3
+ Thanks for your interest in contributing! AI-assisted contributions (issue creation, coding, reviews) are welcome.
4
+
5
+ ## Before You Submit
6
+
7
+ - `uv sync --group dev` to install dependencies
8
+ - `uv run pytest tests/ -v --ignore=tests/integration` must pass
9
+ - `uv run ruff check src/ tests/` and `uv run mypy src/xlstruct/` must pass
10
+ - Pre-commit hooks run these automatically
11
+
12
+ ## Code Style
13
+
14
+ - Python 3.11+, `T | None` (not `Optional[T]`), Pydantic V2
15
+
16
+ ## Pull Requests
17
+
18
+ 1. Fork → feature branch → make changes with tests → PR against `main`
19
+ 2. Keep PRs focused — one feature or fix per PR
xlstruct-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 DanMeon
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,374 @@
1
+ Metadata-Version: 2.4
2
+ Name: xlstruct
3
+ Version: 0.1.0
4
+ Summary: LLM-powered Excel parser — define a Pydantic schema, get structured data from any Excel file
5
+ Project-URL: Homepage, https://github.com/DanMeon/xlstruct
6
+ Project-URL: Repository, https://github.com/DanMeon/xlstruct
7
+ Project-URL: Issues, https://github.com/DanMeon/xlstruct/issues
8
+ Project-URL: Changelog, https://github.com/DanMeon/xlstruct/blob/main/CHANGELOG.md
9
+ Author: DanMeon
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: excel,extraction,llm,pydantic,spreadsheet,structured-data
13
+ Classifier: Development Status :: 3 - Alpha
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
20
+ Classifier: Typing :: Typed
21
+ Requires-Python: >=3.11
22
+ Requires-Dist: fsspec>=2024.1
23
+ Requires-Dist: instructor<2,>=1.14.5
24
+ Requires-Dist: openpyxl>=3.1
25
+ Requires-Dist: pydantic>=2.0
26
+ Requires-Dist: python-calamine>=0.6.2
27
+ Requires-Dist: tiktoken>=0.7
28
+ Requires-Dist: typer>=0.9
29
+ Provides-Extra: all
30
+ Requires-Dist: adlfs; extra == 'all'
31
+ Requires-Dist: anthropic; extra == 'all'
32
+ Requires-Dist: gcsfs; extra == 'all'
33
+ Requires-Dist: google-genai; extra == 'all'
34
+ Requires-Dist: openai; extra == 'all'
35
+ Requires-Dist: s3fs; extra == 'all'
36
+ Provides-Extra: all-llm
37
+ Requires-Dist: anthropic; extra == 'all-llm'
38
+ Requires-Dist: google-genai; extra == 'all-llm'
39
+ Requires-Dist: openai; extra == 'all-llm'
40
+ Provides-Extra: all-storage
41
+ Requires-Dist: adlfs; extra == 'all-storage'
42
+ Requires-Dist: gcsfs; extra == 'all-storage'
43
+ Requires-Dist: s3fs; extra == 'all-storage'
44
+ Provides-Extra: anthropic
45
+ Requires-Dist: anthropic; extra == 'anthropic'
46
+ Provides-Extra: azure
47
+ Requires-Dist: adlfs; extra == 'azure'
48
+ Provides-Extra: docker
49
+ Requires-Dist: aiodocker>=0.26.0; extra == 'docker'
50
+ Provides-Extra: gcs
51
+ Requires-Dist: gcsfs; extra == 'gcs'
52
+ Provides-Extra: gemini
53
+ Requires-Dist: google-genai; extra == 'gemini'
54
+ Provides-Extra: openai
55
+ Requires-Dist: openai; extra == 'openai'
56
+ Provides-Extra: s3
57
+ Requires-Dist: s3fs; extra == 's3'
58
+ Description-Content-Type: text/markdown
59
+
60
+ # XLStruct
61
+
62
+ [![CI](https://github.com/DanMeon/xlstruct/actions/workflows/ci.yml/badge.svg)](https://github.com/DanMeon/xlstruct/actions/workflows/ci.yml)
63
+ [![PyPI version](https://img.shields.io/pypi/v/xlstruct)](https://pypi.org/project/xlstruct/)
64
+ [![Python 3.11+](https://img.shields.io/pypi/pyversions/xlstruct)](https://pypi.org/project/xlstruct/)
65
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
66
+
67
+ LLM-powered Excel/CSV parser — Define a Pydantic schema, get structured data from any spreadsheet.
68
+
69
+ ```
70
+ Excel File + Pydantic Schema → LLM → Validated Structured Data
71
+ ```
72
+
73
+ ## Features
74
+
75
+ - **Schema-driven extraction** — Define a Pydantic model, get validated instances from any spreadsheet. No parsing code needed.
76
+ - **Excel + CSV** — `.xlsx`, `.xlsm`, `.xltx`, `.xltm`, `.xls`, and `.csv` supported out of the box
77
+ - **Any Excel layout** — Flat tables, merged cells, multi-level headers, form+table hybrids — handled by a single API
78
+ - **Two extraction modes** — Direct LLM extraction for small sheets; code generation for large ones. Auto-routed by sheet size, or choose manually.
79
+ - **Reusable scripts** — Codegen mode produces a standalone Python script. Run it without LLM calls — pay for generation once, use forever.
80
+ - **Schema suggestion** — `suggest_schema()` analyzes a spreadsheet and generates a Pydantic model for you
81
+ - **Fast hybrid reader** — calamine (Rust) for speed + openpyxl for formula extraction. Both passes in one call.
82
+ - **Token usage tracking** — Every extraction returns token counts, per-call breakdown, and prompt cache hit metrics
83
+ - **Prompt caching** — Anthropic cache_control markers applied automatically; OpenAI cached_tokens tracked
84
+ - **Token-aware encoding** — Auto-selects encoding strategy (markdown vs compressed) and chunks large sheets to fit within the token budget
85
+ - **Sandboxed execution** — Generated scripts run in a subprocess with blocked imports (network, subprocess) and stripped credentials
86
+ - **Multi-provider LLM** — OpenAI, Anthropic, Gemini via [Instructor](https://github.com/jxnl/instructor)
87
+ - **Cloud storage** — Read from S3, Azure Blob, GCS via fsspec
88
+ - **Async-first** — Async API with sync convenience wrappers. Jupyter-compatible via nest_asyncio.
89
+
90
+ ## Installation
91
+
92
+ ```bash
93
+ pip install xlstruct
94
+ ```
95
+
96
+ ### Extras
97
+
98
+ ```bash
99
+ # LLM providers
100
+ pip install "xlstruct[openai]"
101
+ pip install "xlstruct[anthropic]"
102
+ pip install "xlstruct[gemini]"
103
+
104
+ # Cloud storage
105
+ pip install "xlstruct[s3]" # AWS S3
106
+ pip install "xlstruct[azure]" # Azure Blob Storage
107
+ pip install "xlstruct[gcs]" # Google Cloud Storage
108
+
109
+ # Everything
110
+ pip install "xlstruct[all]"
111
+ ```
112
+
113
+ ## Quick Start
114
+
115
+ ### 1. Define a Pydantic schema
116
+
117
+ ```python
118
+ from pydantic import BaseModel, Field
119
+
120
+ class InvoiceItem(BaseModel):
121
+ description: str = Field(description="Item description")
122
+ quantity: int
123
+ unit_price: float
124
+ total: float
125
+ ```
126
+
127
+ ### 2. Extract data
128
+
129
+ ```python
130
+ from xlstruct import Extractor
131
+
132
+ extractor = Extractor(provider="openai/gpt-4o")
133
+ results = extractor.extract_sync("invoice.xlsx", schema=InvoiceItem)
134
+
135
+ for item in results:
136
+ print(item.model_dump())
137
+ ```
138
+
139
+ ### Async usage
140
+
141
+ ```python
142
+ import asyncio
143
+ from xlstruct import Extractor
144
+
145
+ async def main():
146
+ extractor = Extractor(provider="anthropic/claude-sonnet-4-6")
147
+ results = await extractor.extract("invoice.xlsx", schema=InvoiceItem)
148
+ for item in results:
149
+ print(item.model_dump())
150
+
151
+ asyncio.run(main())
152
+ ```
153
+
154
+ ### Fine-grained control with ExtractionConfig
155
+
156
+ ```python
157
+ from xlstruct import ExtractionConfig
158
+
159
+ config = ExtractionConfig(
160
+ output_schema=InvoiceItem,
161
+ sheet="Sheet1",
162
+ header_rows=[1],
163
+ instructions="Parse dates as YYYY-MM-DD. Skip empty rows.",
164
+ )
165
+
166
+ results = extractor.extract_sync("invoice.xlsx", extraction_config=config)
167
+ ```
168
+
169
+ ### Schema Suggestion
170
+
171
+ Don't know the spreadsheet structure? Let the LLM suggest a schema:
172
+
173
+ ```python
174
+ code = extractor.suggest_schema_sync("unknown_data.xlsx")
175
+ print(code) # Prints a Pydantic model definition
176
+ ```
177
+
178
+ ```python
179
+ # With hints
180
+ code = extractor.suggest_schema_sync(
181
+ "report.xlsx",
182
+ sheet="Q1 Sales",
183
+ instructions="Focus on financial columns only",
184
+ )
185
+ ```
186
+
187
+ ### Token Usage Tracking
188
+
189
+ Every extraction returns token usage details:
190
+
191
+ ```python
192
+ results = extractor.extract_sync("invoice.xlsx", schema=InvoiceItem)
193
+
194
+ print(results.usage.llm_calls) # Number of LLM API calls
195
+ print(results.usage.input_tokens) # Total input tokens
196
+ print(results.usage.output_tokens) # Total output tokens
197
+ print(results.usage.total_tokens) # Sum
198
+ print(results.usage.cache_read_tokens) # Prompt cache hits (Anthropic + OpenAI)
199
+ print(results.usage.breakdown) # Per-call breakdown: [(label, in, out), ...]
200
+ ```
201
+
202
+ ### CSV Support
203
+
204
+ CSV files work with the same API — no extra configuration needed:
205
+
206
+ ```python
207
+ results = extractor.extract_sync("data.csv", schema=MyModel)
208
+ ```
209
+
210
+ ## Extraction Modes
211
+
212
+ XLStruct auto-routes based on data row count:
213
+
214
+ | Mode | Trigger | How it works |
215
+ |------|---------|-------------|
216
+ | **Direct** | ≤ 20 data rows | Sheet encoded as markdown → LLM → Pydantic models |
217
+ | **Codegen** | > 20 data rows | LLM generates a standalone Python parsing script → runs in sandbox |
218
+
219
+ You can force a specific mode:
220
+
221
+ ```python
222
+ from xlstruct import ExtractionConfig, ExtractionMode
223
+
224
+ config = ExtractionConfig(
225
+ mode=ExtractionMode.CODEGEN, # Force code generation
226
+ output_schema=MySchema,
227
+ header_rows=[1],
228
+ )
229
+ ```
230
+
231
+ ### Code Generation Pipeline
232
+
233
+ 1. **Header Detection** — Auto-detect header rows via a lightweight LLM call
234
+ 2. **Phase 0 (Analyzer)** — LLM analyzes spreadsheet structure → `MappingPlan`
235
+ 3. **Phase 1 (Parser Agent)** — LLM generates an openpyxl-based parsing script → validated via subprocess
236
+ 4. **Phase 2 (Transformer)** — Optional: adds data transformation rules
237
+
238
+ Each phase includes self-correction — errors are fed back to the LLM (up to `max_codegen_retries` attempts).
239
+
240
+ ### Generate Standalone Scripts
241
+
242
+ ```python
243
+ script = extractor.generate_script_sync("report.xlsx", config)
244
+ print(script.code) # Reusable Python script
245
+ print(script.explanation) # How it works
246
+ ```
247
+
248
+ ## Cloud Storage
249
+
250
+ ```python
251
+ # AWS S3
252
+ results = await extractor.extract(
253
+ "s3://my-bucket/data/report.xlsx",
254
+ schema=MySchema,
255
+ key="AWS_KEY", secret="AWS_SECRET",
256
+ )
257
+
258
+ # Azure Blob Storage
259
+ results = await extractor.extract(
260
+ "az://my-container/report.xlsx",
261
+ schema=MySchema,
262
+ account_name="...", account_key="...",
263
+ )
264
+
265
+ # Google Cloud Storage
266
+ results = await extractor.extract(
267
+ "gs://my-bucket/report.xlsx",
268
+ schema=MySchema,
269
+ token="/path/to/credentials.json",
270
+ )
271
+ ```
272
+
273
+ ## CLI
274
+
275
+ ```bash
276
+ # Basic usage
277
+ xlstruct extract invoice.xlsx --schema myapp.models:InvoiceItem
278
+
279
+ # With options
280
+ xlstruct extract report.xlsx \
281
+ --schema myapp.models:SalesRecord \
282
+ --provider anthropic/claude-sonnet-4-6 \
283
+ --sheet "Q1 Sales" \
284
+ --instructions "Ignore summary rows" \
285
+ --output results.json
286
+ ```
287
+
288
+ ## Configuration
289
+
290
+ ### ExtractorConfig
291
+
292
+ | Parameter | Default | Description |
293
+ |-----------|---------|-------------|
294
+ | `provider` | `"anthropic/claude-sonnet-4-6"` | LLM provider (`openai/gpt-4o`, `anthropic/claude-sonnet-4-6`, etc.) |
295
+ | `api_key` | `None` | API key (falls back to environment variables) |
296
+ | `max_retries` | `3` | LLM API retry count |
297
+ | `token_budget` | `100_000` | Max tokens per sheet |
298
+ | `temperature` | `0.0` | LLM temperature |
299
+ | `max_codegen_retries` | `3` | Self-correction attempts for code generation |
300
+ | `codegen_timeout` | `60` | Script execution timeout in seconds |
301
+ | `export_dir` | `None` | Directory to auto-save generated codegen scripts |
302
+
303
+ ```python
304
+ from xlstruct import ExtractorConfig
305
+
306
+ config = ExtractorConfig(
307
+ provider="openai/gpt-4o",
308
+ temperature=0.0,
309
+ token_budget=200_000,
310
+ )
311
+ extractor = Extractor(config=config)
312
+ ```
313
+
314
+ ### Docker Backend
315
+
316
+ For OS-level sandboxing, pass a `DockerBackend` via `execution_backend`:
317
+
318
+ ```bash
319
+ pip install "xlstruct[docker]"
320
+ ```
321
+
322
+ ```python
323
+ from xlstruct import Extractor
324
+ from xlstruct.codegen.backends.docker import DockerBackend, DockerConfig
325
+
326
+ extractor = Extractor(
327
+ execution_backend=DockerBackend(
328
+ config=DockerConfig(image="python:3.12-slim", mem_limit="1g"),
329
+ ),
330
+ )
331
+ ```
332
+
333
+ ## Architecture
334
+
335
+ ```
336
+ Storage (fsspec) → Reader (HybridReader / CsvReader) → CompressedEncoder → LLM Engine → Pydantic
337
+ ```
338
+
339
+ ### Reader
340
+
341
+ **Excel** — `HybridReader` uses a 2-pass approach:
342
+ - **Pass 1 (calamine, Rust)** — values, merged cells, data types, dimensions
343
+ - **Pass 2 (openpyxl, read-only)** — formula strings (`.xlsx`/`.xlsm` only)
344
+
345
+ **CSV** — `CsvReader` uses Python stdlib `csv` module. No extra dependencies.
346
+
347
+ ### Encoder
348
+
349
+ `CompressedEncoder` converts sheet data to markdown tables with structural metadata:
350
+ - Column types, formula patterns, merged regions
351
+ - Optional head+tail sampling for large sheets (20 rows by default)
352
+
353
+ ### Exceptions
354
+
355
+ ```
356
+ XLStructError (base)
357
+ ├── StorageError
358
+ ├── ReaderError
359
+ ├── ExtractionError
360
+ └── CodegenValidationError
361
+ ```
362
+
363
+ ## Development
364
+
365
+ ```bash
366
+ uv sync # Install dependencies
367
+ uv run pytest tests/ -v # Run all tests
368
+ uv run ruff check src/ tests/ # Lint
369
+ uv run mypy src/xlstruct/ # Type check
370
+ ```
371
+
372
+ ## License
373
+
374
+ MIT