xlstruct 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xlstruct-0.1.0/.github/workflows/ci.yml +62 -0
- xlstruct-0.1.0/.github/workflows/codeql.yml +29 -0
- xlstruct-0.1.0/.github/workflows/publish.yml +61 -0
- xlstruct-0.1.0/.gitignore +39 -0
- xlstruct-0.1.0/.pre-commit-config.yaml +16 -0
- xlstruct-0.1.0/CHANGELOG.md +22 -0
- xlstruct-0.1.0/CONTRIBUTING.md +19 -0
- xlstruct-0.1.0/LICENSE +21 -0
- xlstruct-0.1.0/PKG-INFO +374 -0
- xlstruct-0.1.0/README.md +315 -0
- xlstruct-0.1.0/examples/basic_extraction.py +82 -0
- xlstruct-0.1.0/examples/cloud_storage.py +81 -0
- xlstruct-0.1.0/examples/custom_instructions.py +102 -0
- xlstruct-0.1.0/pyproject.toml +104 -0
- xlstruct-0.1.0/src/xlstruct/__init__.py +28 -0
- xlstruct-0.1.0/src/xlstruct/_tokens.py +49 -0
- xlstruct-0.1.0/src/xlstruct/cli.py +86 -0
- xlstruct-0.1.0/src/xlstruct/codegen/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/codegen/backends/__init__.py +1 -0
- xlstruct-0.1.0/src/xlstruct/codegen/backends/base.py +31 -0
- xlstruct-0.1.0/src/xlstruct/codegen/backends/docker.py +273 -0
- xlstruct-0.1.0/src/xlstruct/codegen/backends/subprocess.py +111 -0
- xlstruct-0.1.0/src/xlstruct/codegen/engine.py +192 -0
- xlstruct-0.1.0/src/xlstruct/codegen/executor.py +170 -0
- xlstruct-0.1.0/src/xlstruct/codegen/orchestrator.py +322 -0
- xlstruct-0.1.0/src/xlstruct/codegen/schema_utils.py +83 -0
- xlstruct-0.1.0/src/xlstruct/codegen/validation.py +276 -0
- xlstruct-0.1.0/src/xlstruct/config.py +160 -0
- xlstruct-0.1.0/src/xlstruct/encoder/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/encoder/_formatting.py +404 -0
- xlstruct-0.1.0/src/xlstruct/encoder/base.py +21 -0
- xlstruct-0.1.0/src/xlstruct/encoder/compressed.py +174 -0
- xlstruct-0.1.0/src/xlstruct/exceptions.py +27 -0
- xlstruct-0.1.0/src/xlstruct/extraction/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/extraction/chunking.py +97 -0
- xlstruct-0.1.0/src/xlstruct/extraction/engine.py +74 -0
- xlstruct-0.1.0/src/xlstruct/extractor.py +458 -0
- xlstruct-0.1.0/src/xlstruct/prompts/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/prompts/codegen.py +654 -0
- xlstruct-0.1.0/src/xlstruct/prompts/extraction.py +45 -0
- xlstruct-0.1.0/src/xlstruct/prompts/system.py +22 -0
- xlstruct-0.1.0/src/xlstruct/py.typed +0 -0
- xlstruct-0.1.0/src/xlstruct/reader/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/reader/base.py +32 -0
- xlstruct-0.1.0/src/xlstruct/reader/csv_reader.py +109 -0
- xlstruct-0.1.0/src/xlstruct/reader/hybrid_reader.py +296 -0
- xlstruct-0.1.0/src/xlstruct/schemas/__init__.py +0 -0
- xlstruct-0.1.0/src/xlstruct/schemas/codegen.py +82 -0
- xlstruct-0.1.0/src/xlstruct/schemas/core.py +80 -0
- xlstruct-0.1.0/src/xlstruct/schemas/usage.py +100 -0
- xlstruct-0.1.0/src/xlstruct/storage.py +41 -0
- xlstruct-0.1.0/tests/__init__.py +0 -0
- xlstruct-0.1.0/tests/conftest.py +107 -0
- xlstruct-0.1.0/tests/fixtures/__init__.py +0 -0
- xlstruct-0.1.0/tests/fixtures/generate.py +1795 -0
- xlstruct-0.1.0/tests/fixtures/schemas.py +341 -0
- xlstruct-0.1.0/tests/integration/__init__.py +0 -0
- xlstruct-0.1.0/tests/integration/conftest.py +143 -0
- xlstruct-0.1.0/tests/integration/docker-compose.yml +36 -0
- xlstruct-0.1.0/tests/integration/test_extractor_cloud.py +163 -0
- xlstruct-0.1.0/tests/integration/test_storage.py +107 -0
- xlstruct-0.1.0/tests/test_chunking.py +163 -0
- xlstruct-0.1.0/tests/test_cli.py +146 -0
- xlstruct-0.1.0/tests/test_codegen.py +441 -0
- xlstruct-0.1.0/tests/test_config.py +42 -0
- xlstruct-0.1.0/tests/test_encoder.py +91 -0
- xlstruct-0.1.0/tests/test_extractor.py +207 -0
- xlstruct-0.1.0/tests/test_formatting.py +92 -0
- xlstruct-0.1.0/tests/test_models.py +55 -0
- xlstruct-0.1.0/tests/test_reader.py +296 -0
- xlstruct-0.1.0/tests/test_tokens.py +16 -0
- xlstruct-0.1.0/uv.lock +3085 -0
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: astral-sh/setup-uv@v5
|
|
15
|
+
with:
|
|
16
|
+
enable-cache: true
|
|
17
|
+
- uses: actions/setup-python@v5
|
|
18
|
+
with:
|
|
19
|
+
python-version: "3.11"
|
|
20
|
+
- run: uv sync --group dev --all-extras
|
|
21
|
+
- name: Ruff lint
|
|
22
|
+
run: uv run ruff check src/ tests/
|
|
23
|
+
- name: Mypy
|
|
24
|
+
run: uv run mypy src/xlstruct/
|
|
25
|
+
|
|
26
|
+
test:
|
|
27
|
+
needs: [lint]
|
|
28
|
+
runs-on: ubuntu-latest
|
|
29
|
+
strategy:
|
|
30
|
+
matrix:
|
|
31
|
+
python-version: ["3.11", "3.12"]
|
|
32
|
+
steps:
|
|
33
|
+
- uses: actions/checkout@v4
|
|
34
|
+
- uses: astral-sh/setup-uv@v5
|
|
35
|
+
with:
|
|
36
|
+
enable-cache: true
|
|
37
|
+
- uses: actions/setup-python@v5
|
|
38
|
+
with:
|
|
39
|
+
python-version: ${{ matrix.python-version }}
|
|
40
|
+
- run: uv sync --group dev
|
|
41
|
+
- name: Run tests
|
|
42
|
+
run: uv run pytest tests/ -v --ignore=tests/integration --cov=xlstruct --cov-report=term-missing --cov-fail-under=60
|
|
43
|
+
|
|
44
|
+
integration:
|
|
45
|
+
runs-on: ubuntu-latest
|
|
46
|
+
needs: [lint, test]
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/checkout@v4
|
|
49
|
+
- uses: astral-sh/setup-uv@v5
|
|
50
|
+
with:
|
|
51
|
+
enable-cache: true
|
|
52
|
+
- uses: actions/setup-python@v5
|
|
53
|
+
with:
|
|
54
|
+
python-version: "3.11"
|
|
55
|
+
- run: uv sync --group dev --extra all-storage
|
|
56
|
+
- name: Start emulators
|
|
57
|
+
run: docker compose -f tests/integration/docker-compose.yml up -d --wait
|
|
58
|
+
- name: Run integration tests
|
|
59
|
+
run: uv run pytest tests/integration/ -v
|
|
60
|
+
- name: Stop emulators
|
|
61
|
+
if: always()
|
|
62
|
+
run: docker compose -f tests/integration/docker-compose.yml down
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: CodeQL
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
schedule:
|
|
9
|
+
- cron: "0 6 * * 1" # Every Monday at 06:00 UTC
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
analyze:
|
|
13
|
+
name: Analyze
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
permissions:
|
|
16
|
+
security-events: write
|
|
17
|
+
contents: read
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Checkout
|
|
21
|
+
uses: actions/checkout@v4
|
|
22
|
+
|
|
23
|
+
- name: Initialize CodeQL
|
|
24
|
+
uses: github/codeql-action/init@v3
|
|
25
|
+
with:
|
|
26
|
+
languages: python
|
|
27
|
+
|
|
28
|
+
- name: Perform CodeQL Analysis
|
|
29
|
+
uses: github/codeql-action/analyze@v3
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
test:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
steps:
|
|
11
|
+
- uses: actions/checkout@v4
|
|
12
|
+
- uses: astral-sh/setup-uv@v5
|
|
13
|
+
with:
|
|
14
|
+
enable-cache: true
|
|
15
|
+
- uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.11"
|
|
18
|
+
- run: uv sync --group dev
|
|
19
|
+
- name: Ruff lint
|
|
20
|
+
run: uv run ruff check src/ tests/
|
|
21
|
+
- name: Mypy
|
|
22
|
+
run: uv run mypy src/xlstruct/
|
|
23
|
+
- name: Run tests
|
|
24
|
+
run: uv run pytest tests/ -v --ignore=tests/integration
|
|
25
|
+
|
|
26
|
+
verify-version:
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
steps:
|
|
29
|
+
- uses: actions/checkout@v4
|
|
30
|
+
- name: Check version matches tag
|
|
31
|
+
run: |
|
|
32
|
+
TAG_VERSION="${GITHUB_REF_NAME#v}"
|
|
33
|
+
PKG_VERSION=$(python -c "
|
|
34
|
+
import re
|
|
35
|
+
with open('pyproject.toml') as f:
|
|
36
|
+
match = re.search(r'version\s*=\s*\"(.+?)\"', f.read())
|
|
37
|
+
print(match.group(1))
|
|
38
|
+
")
|
|
39
|
+
if [ "$TAG_VERSION" != "$PKG_VERSION" ]; then
|
|
40
|
+
echo "ERROR: Tag version ($TAG_VERSION) != pyproject.toml version ($PKG_VERSION)"
|
|
41
|
+
exit 1
|
|
42
|
+
fi
|
|
43
|
+
echo "Version check passed: $TAG_VERSION"
|
|
44
|
+
|
|
45
|
+
publish:
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
needs: [test, verify-version]
|
|
48
|
+
permissions:
|
|
49
|
+
id-token: write
|
|
50
|
+
steps:
|
|
51
|
+
- uses: actions/checkout@v4
|
|
52
|
+
- uses: astral-sh/setup-uv@v5
|
|
53
|
+
with:
|
|
54
|
+
enable-cache: true
|
|
55
|
+
- uses: actions/setup-python@v5
|
|
56
|
+
with:
|
|
57
|
+
python-version: "3.11"
|
|
58
|
+
- name: Build package
|
|
59
|
+
run: uv build
|
|
60
|
+
- name: Publish to PyPI
|
|
61
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# * Environment
|
|
2
|
+
.env
|
|
3
|
+
scripts/.env
|
|
4
|
+
**/prd
|
|
5
|
+
|
|
6
|
+
# * Python
|
|
7
|
+
__pycache__/
|
|
8
|
+
*.py[cod]
|
|
9
|
+
*.egg-info/
|
|
10
|
+
dist/
|
|
11
|
+
build/
|
|
12
|
+
.venv/
|
|
13
|
+
data/
|
|
14
|
+
results/
|
|
15
|
+
generated/
|
|
16
|
+
*.json
|
|
17
|
+
|
|
18
|
+
# * IDE
|
|
19
|
+
.idea/
|
|
20
|
+
.vscode/
|
|
21
|
+
*.swp
|
|
22
|
+
|
|
23
|
+
# * Testing
|
|
24
|
+
.pytest_cache/
|
|
25
|
+
.mypy_cache/
|
|
26
|
+
.ruff_cache/
|
|
27
|
+
tests/fixtures/data/
|
|
28
|
+
|
|
29
|
+
# * Local scripts (project-specific, not for public repo)
|
|
30
|
+
scripts/
|
|
31
|
+
|
|
32
|
+
# * Project instructions (not for public repo)
|
|
33
|
+
CLAUDE.md
|
|
34
|
+
|
|
35
|
+
# * MAC OS
|
|
36
|
+
.DS_Store
|
|
37
|
+
|
|
38
|
+
# * MCP
|
|
39
|
+
.serena/
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: local
|
|
3
|
+
hooks:
|
|
4
|
+
- id: ruff
|
|
5
|
+
name: ruff
|
|
6
|
+
entry: uv run ruff check src/ tests/
|
|
7
|
+
language: system
|
|
8
|
+
pass_filenames: false
|
|
9
|
+
types: [python]
|
|
10
|
+
|
|
11
|
+
- id: mypy
|
|
12
|
+
name: mypy
|
|
13
|
+
entry: uv run mypy src/xlstruct/
|
|
14
|
+
language: system
|
|
15
|
+
pass_filenames: false
|
|
16
|
+
types: [python]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2026-03-10
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- Schema-driven Excel extraction via Pydantic models
|
|
13
|
+
- Two extraction modes: direct LLM extraction and code generation
|
|
14
|
+
- `HybridReader` — calamine (Rust) + openpyxl dual-pass reader
|
|
15
|
+
- `CompressedEncoder` — token-aware sheet encoding with sampling
|
|
16
|
+
- `ChunkSplitter` — automatic chunking for large sheets
|
|
17
|
+
- Code generation pipeline with self-correction (Analyzer → Parser → Transformer)
|
|
18
|
+
- Sandboxed script execution (`SubprocessBackend`) with blocked imports and stripped credentials
|
|
19
|
+
- Multi-provider LLM support via Instructor (OpenAI, Anthropic, Gemini)
|
|
20
|
+
- Cloud storage support via fsspec (S3, Azure Blob, GCS)
|
|
21
|
+
- Async-first API with `*_sync()` convenience wrappers
|
|
22
|
+
- Typer CLI (`xlstruct extract`)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Contributing to XLStruct
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in contributing! AI-assisted contributions (issue creation, coding, reviews) are welcome.
|
|
4
|
+
|
|
5
|
+
## Before You Submit
|
|
6
|
+
|
|
7
|
+
- `uv sync --group dev` to install dependencies
|
|
8
|
+
- `uv run pytest tests/ -v --ignore=tests/integration` must pass
|
|
9
|
+
- `uv run ruff check src/ tests/` and `uv run mypy src/xlstruct/` must pass
|
|
10
|
+
- Pre-commit hooks run these automatically
|
|
11
|
+
|
|
12
|
+
## Code Style
|
|
13
|
+
|
|
14
|
+
- Python 3.11+, `T | None` (not `Optional[T]`), Pydantic V2
|
|
15
|
+
|
|
16
|
+
## Pull Requests
|
|
17
|
+
|
|
18
|
+
1. Fork → feature branch → make changes with tests → PR against `main`
|
|
19
|
+
2. Keep PRs focused — one feature or fix per PR
|
xlstruct-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 DanMeon
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
xlstruct-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xlstruct
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: LLM-powered Excel parser — define a Pydantic schema, get structured data from any Excel file
|
|
5
|
+
Project-URL: Homepage, https://github.com/DanMeon/xlstruct
|
|
6
|
+
Project-URL: Repository, https://github.com/DanMeon/xlstruct
|
|
7
|
+
Project-URL: Issues, https://github.com/DanMeon/xlstruct/issues
|
|
8
|
+
Project-URL: Changelog, https://github.com/DanMeon/xlstruct/blob/main/CHANGELOG.md
|
|
9
|
+
Author: DanMeon
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: excel,extraction,llm,pydantic,spreadsheet,structured-data
|
|
13
|
+
Classifier: Development Status :: 3 - Alpha
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Office/Business :: Financial :: Spreadsheet
|
|
20
|
+
Classifier: Typing :: Typed
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: fsspec>=2024.1
|
|
23
|
+
Requires-Dist: instructor<2,>=1.14.5
|
|
24
|
+
Requires-Dist: openpyxl>=3.1
|
|
25
|
+
Requires-Dist: pydantic>=2.0
|
|
26
|
+
Requires-Dist: python-calamine>=0.6.2
|
|
27
|
+
Requires-Dist: tiktoken>=0.7
|
|
28
|
+
Requires-Dist: typer>=0.9
|
|
29
|
+
Provides-Extra: all
|
|
30
|
+
Requires-Dist: adlfs; extra == 'all'
|
|
31
|
+
Requires-Dist: anthropic; extra == 'all'
|
|
32
|
+
Requires-Dist: gcsfs; extra == 'all'
|
|
33
|
+
Requires-Dist: google-genai; extra == 'all'
|
|
34
|
+
Requires-Dist: openai; extra == 'all'
|
|
35
|
+
Requires-Dist: s3fs; extra == 'all'
|
|
36
|
+
Provides-Extra: all-llm
|
|
37
|
+
Requires-Dist: anthropic; extra == 'all-llm'
|
|
38
|
+
Requires-Dist: google-genai; extra == 'all-llm'
|
|
39
|
+
Requires-Dist: openai; extra == 'all-llm'
|
|
40
|
+
Provides-Extra: all-storage
|
|
41
|
+
Requires-Dist: adlfs; extra == 'all-storage'
|
|
42
|
+
Requires-Dist: gcsfs; extra == 'all-storage'
|
|
43
|
+
Requires-Dist: s3fs; extra == 'all-storage'
|
|
44
|
+
Provides-Extra: anthropic
|
|
45
|
+
Requires-Dist: anthropic; extra == 'anthropic'
|
|
46
|
+
Provides-Extra: azure
|
|
47
|
+
Requires-Dist: adlfs; extra == 'azure'
|
|
48
|
+
Provides-Extra: docker
|
|
49
|
+
Requires-Dist: aiodocker>=0.26.0; extra == 'docker'
|
|
50
|
+
Provides-Extra: gcs
|
|
51
|
+
Requires-Dist: gcsfs; extra == 'gcs'
|
|
52
|
+
Provides-Extra: gemini
|
|
53
|
+
Requires-Dist: google-genai; extra == 'gemini'
|
|
54
|
+
Provides-Extra: openai
|
|
55
|
+
Requires-Dist: openai; extra == 'openai'
|
|
56
|
+
Provides-Extra: s3
|
|
57
|
+
Requires-Dist: s3fs; extra == 's3'
|
|
58
|
+
Description-Content-Type: text/markdown
|
|
59
|
+
|
|
60
|
+
# XLStruct
|
|
61
|
+
|
|
62
|
+
[](https://github.com/DanMeon/xlstruct/actions/workflows/ci.yml)
|
|
63
|
+
[](https://pypi.org/project/xlstruct/)
|
|
64
|
+
[](https://pypi.org/project/xlstruct/)
|
|
65
|
+
[](https://opensource.org/licenses/MIT)
|
|
66
|
+
|
|
67
|
+
LLM-powered Excel/CSV parser — define a Pydantic schema, get structured data from any spreadsheet.
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
Excel File + Pydantic Schema → LLM → Validated Structured Data
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Features
|
|
74
|
+
|
|
75
|
+
- **Schema-driven extraction** — Define a Pydantic model, get validated instances from any spreadsheet. No parsing code needed.
|
|
76
|
+
- **Excel + CSV** — `.xlsx`, `.xlsm`, `.xltx`, `.xltm`, `.xls`, and `.csv` supported out of the box
|
|
77
|
+
- **Any Excel layout** — Flat tables, merged cells, multi-level headers, form+table hybrids — handled by a single API
|
|
78
|
+
- **Two extraction modes** — Direct LLM extraction for small sheets; code generation for large ones. Auto-routed by sheet size, or choose manually.
|
|
79
|
+
- **Reusable scripts** — Codegen mode produces a standalone Python script. Run it without LLM calls — pay for generation once, use forever.
|
|
80
|
+
- **Schema suggestion** — `suggest_schema()` analyzes a spreadsheet and generates a Pydantic model for you
|
|
81
|
+
- **Fast hybrid reader** — calamine (Rust) for speed + openpyxl for formula extraction. Both passes in one call.
|
|
82
|
+
- **Token usage tracking** — Every extraction returns token counts, per-call breakdown, and prompt cache hit metrics
|
|
83
|
+
- **Prompt caching** — Anthropic cache_control markers applied automatically; OpenAI cached_tokens tracked
|
|
84
|
+
- **Token-aware encoding** — Auto-selects encoding strategy (markdown vs compressed) and chunks large sheets to fit within token budget
|
|
85
|
+
- **Sandboxed execution** — Generated scripts run in a subprocess with blocked imports (network, subprocess) and stripped credentials
|
|
86
|
+
- **Multi-provider LLM** — OpenAI, Anthropic, Gemini via [Instructor](https://github.com/jxnl/instructor)
|
|
87
|
+
- **Cloud storage** — Read from S3, Azure Blob, GCS via fsspec
|
|
88
|
+
- **Async-first** — Async API with sync convenience wrappers. Jupyter-compatible via nest_asyncio.
|
|
89
|
+
|
|
90
|
+
## Installation
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install xlstruct
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Extras
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# LLM providers
|
|
100
|
+
pip install "xlstruct[openai]"
|
|
101
|
+
pip install "xlstruct[anthropic]"
|
|
102
|
+
pip install "xlstruct[gemini]"
|
|
103
|
+
|
|
104
|
+
# Cloud storage
|
|
105
|
+
pip install "xlstruct[s3]" # AWS S3
|
|
106
|
+
pip install "xlstruct[azure]" # Azure Blob Storage
|
|
107
|
+
pip install "xlstruct[gcs]" # Google Cloud Storage
|
|
108
|
+
|
|
109
|
+
# Everything
|
|
110
|
+
pip install "xlstruct[all]"
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Quick Start
|
|
114
|
+
|
|
115
|
+
### 1. Define a Pydantic schema
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from pydantic import BaseModel, Field
|
|
119
|
+
|
|
120
|
+
class InvoiceItem(BaseModel):
|
|
121
|
+
description: str = Field(description="Item description")
|
|
122
|
+
quantity: int
|
|
123
|
+
unit_price: float
|
|
124
|
+
total: float
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### 2. Extract data
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from xlstruct import Extractor
|
|
131
|
+
|
|
132
|
+
extractor = Extractor(provider="openai/gpt-4o")
|
|
133
|
+
results = extractor.extract_sync("invoice.xlsx", schema=InvoiceItem)
|
|
134
|
+
|
|
135
|
+
for item in results:
|
|
136
|
+
print(item.model_dump())
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### Async usage
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
import asyncio
|
|
143
|
+
from xlstruct import Extractor
|
|
144
|
+
|
|
145
|
+
async def main():
|
|
146
|
+
extractor = Extractor(provider="anthropic/claude-sonnet-4-6")
|
|
147
|
+
results = await extractor.extract("invoice.xlsx", schema=InvoiceItem)
|
|
148
|
+
for item in results:
|
|
149
|
+
print(item.model_dump())
|
|
150
|
+
|
|
151
|
+
asyncio.run(main())
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Fine-grained control with ExtractionConfig
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from xlstruct import ExtractionConfig
|
|
158
|
+
|
|
159
|
+
config = ExtractionConfig(
|
|
160
|
+
output_schema=InvoiceItem,
|
|
161
|
+
sheet="Sheet1",
|
|
162
|
+
header_rows=[1],
|
|
163
|
+
instructions="Parse dates as YYYY-MM-DD. Skip empty rows.",
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
results = extractor.extract_sync("invoice.xlsx", extraction_config=config)
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Schema Suggestion
|
|
170
|
+
|
|
171
|
+
Don't know the spreadsheet structure? Let the LLM suggest a schema:
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
code = extractor.suggest_schema_sync("unknown_data.xlsx")
|
|
175
|
+
print(code) # Prints a Pydantic model definition
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
# With hints
|
|
180
|
+
code = extractor.suggest_schema_sync(
|
|
181
|
+
"report.xlsx",
|
|
182
|
+
sheet="Q1 Sales",
|
|
183
|
+
instructions="Focus on financial columns only",
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
### Token Usage Tracking
|
|
188
|
+
|
|
189
|
+
Every extraction returns token usage details:
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
results = extractor.extract_sync("invoice.xlsx", schema=InvoiceItem)
|
|
193
|
+
|
|
194
|
+
print(results.usage.llm_calls) # Number of LLM API calls
|
|
195
|
+
print(results.usage.input_tokens) # Total input tokens
|
|
196
|
+
print(results.usage.output_tokens) # Total output tokens
|
|
197
|
+
print(results.usage.total_tokens)     # Sum of input and output tokens
|
|
198
|
+
print(results.usage.cache_read_tokens) # Prompt cache hits (Anthropic + OpenAI)
|
|
199
|
+
print(results.usage.breakdown) # Per-call breakdown: [(label, in, out), ...]
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### CSV Support
|
|
203
|
+
|
|
204
|
+
CSV files work with the same API — no extra configuration needed:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
results = extractor.extract_sync("data.csv", schema=MyModel)
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
## Extraction Modes
|
|
211
|
+
|
|
212
|
+
XLStruct auto-routes based on data row count:
|
|
213
|
+
|
|
214
|
+
| Mode | Trigger | How it works |
|
|
215
|
+
|------|---------|-------------|
|
|
216
|
+
| **Direct** | ≤ 20 data rows | Sheet encoded as markdown → LLM → Pydantic models |
|
|
217
|
+
| **Codegen** | > 20 data rows | LLM generates a standalone Python parsing script → runs in sandbox |
|
|
218
|
+
|
|
219
|
+
You can force a specific mode:
|
|
220
|
+
|
|
221
|
+
```python
|
|
222
|
+
from xlstruct import ExtractionConfig, ExtractionMode
|
|
223
|
+
|
|
224
|
+
config = ExtractionConfig(
|
|
225
|
+
mode=ExtractionMode.CODEGEN, # Force code generation
|
|
226
|
+
output_schema=MySchema,
|
|
227
|
+
header_rows=[1],
|
|
228
|
+
)
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Code Generation Pipeline
|
|
232
|
+
|
|
233
|
+
1. **Header Detection** — Auto-detect header rows via lightweight LLM call
|
|
234
|
+
2. **Phase 0 (Analyzer)** — LLM analyzes spreadsheet structure → `MappingPlan`
|
|
235
|
+
3. **Phase 1 (Parser Agent)** — LLM generates openpyxl-based parsing script → validated via subprocess
|
|
236
|
+
4. **Phase 2 (Transformer)** — Optional: adds data transformation rules
|
|
237
|
+
|
|
238
|
+
Each phase includes self-correction — errors are fed back to the LLM (up to `max_codegen_retries`).
|
|
239
|
+
|
|
240
|
+
### Generate Standalone Scripts
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
script = extractor.generate_script_sync("report.xlsx", config)
|
|
244
|
+
print(script.code) # Reusable Python script
|
|
245
|
+
print(script.explanation) # How it works
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## Cloud Storage
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
# AWS S3
|
|
252
|
+
results = await extractor.extract(
|
|
253
|
+
"s3://my-bucket/data/report.xlsx",
|
|
254
|
+
schema=MySchema,
|
|
255
|
+
key="AWS_KEY", secret="AWS_SECRET",
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
# Azure Blob Storage
|
|
259
|
+
results = await extractor.extract(
|
|
260
|
+
"az://my-container/report.xlsx",
|
|
261
|
+
schema=MySchema,
|
|
262
|
+
account_name="...", account_key="...",
|
|
263
|
+
)
|
|
264
|
+
|
|
265
|
+
# Google Cloud Storage
|
|
266
|
+
results = await extractor.extract(
|
|
267
|
+
"gs://my-bucket/report.xlsx",
|
|
268
|
+
schema=MySchema,
|
|
269
|
+
token="/path/to/credentials.json",
|
|
270
|
+
)
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
## CLI
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
# Basic usage
|
|
277
|
+
xlstruct extract invoice.xlsx --schema myapp.models:InvoiceItem
|
|
278
|
+
|
|
279
|
+
# With options
|
|
280
|
+
xlstruct extract report.xlsx \
|
|
281
|
+
--schema myapp.models:SalesRecord \
|
|
282
|
+
--provider anthropic/claude-sonnet-4-6 \
|
|
283
|
+
--sheet "Q1 Sales" \
|
|
284
|
+
--instructions "Ignore summary rows" \
|
|
285
|
+
--output results.json
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
## Configuration
|
|
289
|
+
|
|
290
|
+
### ExtractorConfig
|
|
291
|
+
|
|
292
|
+
| Parameter | Default | Description |
|
|
293
|
+
|-----------|---------|-------------|
|
|
294
|
+
| `provider` | `"anthropic/claude-sonnet-4-6"` | LLM provider (`openai/gpt-4o`, `anthropic/claude-sonnet-4-6`, etc.) |
|
|
295
|
+
| `api_key` | `None` | API key (falls back to environment variables) |
|
|
296
|
+
| `max_retries` | `3` | LLM API retry count |
|
|
297
|
+
| `token_budget` | `100_000` | Max tokens per sheet |
|
|
298
|
+
| `temperature` | `0.0` | LLM temperature |
|
|
299
|
+
| `max_codegen_retries` | `3` | Self-correction attempts for code generation |
|
|
300
|
+
| `codegen_timeout` | `60` | Script execution timeout in seconds |
|
|
301
|
+
| `export_dir` | `None` | Directory to auto-save generated codegen scripts |
|
|
302
|
+
|
|
303
|
+
```python
|
|
304
|
+
from xlstruct import ExtractorConfig
|
|
305
|
+
|
|
306
|
+
config = ExtractorConfig(
|
|
307
|
+
provider="openai/gpt-4o",
|
|
308
|
+
temperature=0.0,
|
|
309
|
+
token_budget=200_000,
|
|
310
|
+
)
|
|
311
|
+
extractor = Extractor(config=config)
|
|
312
|
+
```
|
|
313
|
+
|
|
314
|
+
### Docker Backend
|
|
315
|
+
|
|
316
|
+
For OS-level sandboxing, pass a `DockerBackend` via `execution_backend`:
|
|
317
|
+
|
|
318
|
+
```bash
|
|
319
|
+
pip install "xlstruct[docker]"
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
from xlstruct import Extractor
|
|
324
|
+
from xlstruct.codegen.backends.docker import DockerBackend, DockerConfig
|
|
325
|
+
|
|
326
|
+
extractor = Extractor(
|
|
327
|
+
execution_backend=DockerBackend(
|
|
328
|
+
config=DockerConfig(image="python:3.12-slim", mem_limit="1g"),
|
|
329
|
+
),
|
|
330
|
+
)
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
## Architecture
|
|
334
|
+
|
|
335
|
+
```
|
|
336
|
+
Storage (fsspec) → Reader (HybridReader / CsvReader) → CompressedEncoder → LLM Engine → Pydantic
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
### Reader
|
|
340
|
+
|
|
341
|
+
**Excel** — `HybridReader` uses a 2-pass approach:
|
|
342
|
+
- **Pass 1 (calamine, Rust)** — values, merged cells, data types, dimensions
|
|
343
|
+
- **Pass 2 (openpyxl, read-only)** — formula strings (`.xlsx`/`.xlsm` only)
|
|
344
|
+
|
|
345
|
+
**CSV** — `CsvReader` uses Python stdlib `csv` module. No extra dependencies.
|
|
346
|
+
|
|
347
|
+
### Encoder
|
|
348
|
+
|
|
349
|
+
`CompressedEncoder` converts sheet data to markdown tables with structural metadata:
|
|
350
|
+
- Column types, formula patterns, merged regions
|
|
351
|
+
- Optional head+tail sampling for large sheets (20 rows by default)
|
|
352
|
+
|
|
353
|
+
### Exceptions
|
|
354
|
+
|
|
355
|
+
```
|
|
356
|
+
XLStructError (base)
|
|
357
|
+
├── StorageError
|
|
358
|
+
├── ReaderError
|
|
359
|
+
├── ExtractionError
|
|
360
|
+
└── CodegenValidationError
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
## Development
|
|
364
|
+
|
|
365
|
+
```bash
|
|
366
|
+
uv sync # Install dependencies
|
|
367
|
+
uv run pytest tests/ -v # Run all tests
|
|
368
|
+
uv run ruff check src/ tests/ # Lint
|
|
369
|
+
uv run mypy src/xlstruct/ # Type check
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
## License
|
|
373
|
+
|
|
374
|
+
MIT
|