terbium-parse 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- terbium_parse-0.1.0/LICENSE +21 -0
- terbium_parse-0.1.0/PKG-INFO +176 -0
- terbium_parse-0.1.0/README.md +145 -0
- terbium_parse-0.1.0/pyproject.toml +45 -0
- terbium_parse-0.1.0/setup.cfg +4 -0
- terbium_parse-0.1.0/src/terbium/__init__.py +31 -0
- terbium_parse-0.1.0/src/terbium/api.py +106 -0
- terbium_parse-0.1.0/src/terbium/cli.py +53 -0
- terbium_parse-0.1.0/src/terbium/documents/__init__.py +8 -0
- terbium_parse-0.1.0/src/terbium/documents/base.py +46 -0
- terbium_parse-0.1.0/src/terbium/documents/csv_adapter.py +78 -0
- terbium_parse-0.1.0/src/terbium/documents/pdf.py +110 -0
- terbium_parse-0.1.0/src/terbium/documents/pptx_adapter.py +129 -0
- terbium_parse-0.1.0/src/terbium/documents/xlsx_adapter.py +88 -0
- terbium_parse-0.1.0/src/terbium/harness/__init__.py +7 -0
- terbium_parse-0.1.0/src/terbium/harness/ai.py +40 -0
- terbium_parse-0.1.0/src/terbium/harness/arrange.py +120 -0
- terbium_parse-0.1.0/src/terbium/harness/escalation.py +39 -0
- terbium_parse-0.1.0/src/terbium/harness/providers/__init__.py +29 -0
- terbium_parse-0.1.0/src/terbium/harness/providers/anthropic_provider.py +50 -0
- terbium_parse-0.1.0/src/terbium/harness/providers/base.py +13 -0
- terbium_parse-0.1.0/src/terbium/harness/providers/gemini_provider.py +48 -0
- terbium_parse-0.1.0/src/terbium/harness/router.py +46 -0
- terbium_parse-0.1.0/src/terbium/harness/vision.py +41 -0
- terbium_parse-0.1.0/src/terbium/layout/__init__.py +3 -0
- terbium_parse-0.1.0/src/terbium/layout/columns.py +32 -0
- terbium_parse-0.1.0/src/terbium/layout/confidence.py +50 -0
- terbium_parse-0.1.0/src/terbium/layout/dehead.py +64 -0
- terbium_parse-0.1.0/src/terbium/layout/grid.py +214 -0
- terbium_parse-0.1.0/src/terbium/layout/images.py +20 -0
- terbium_parse-0.1.0/src/terbium/layout/lines.py +34 -0
- terbium_parse-0.1.0/src/terbium/layout/signals.py +81 -0
- terbium_parse-0.1.0/src/terbium/model/__init__.py +15 -0
- terbium_parse-0.1.0/src/terbium/model/document.py +69 -0
- terbium_parse-0.1.0/src/terbium/model/elements.py +88 -0
- terbium_parse-0.1.0/src/terbium/model/record.py +43 -0
- terbium_parse-0.1.0/src/terbium/model/table.py +45 -0
- terbium_parse-0.1.0/src/terbium/py.typed +0 -0
- terbium_parse-0.1.0/src/terbium/schema/__init__.py +5 -0
- terbium_parse-0.1.0/src/terbium/schema/base.py +40 -0
- terbium_parse-0.1.0/src/terbium/schema/furniture.py +51 -0
- terbium_parse-0.1.0/src/terbium/schema/generic.py +76 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/PKG-INFO +176 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/SOURCES.txt +47 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/dependency_links.txt +1 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/entry_points.txt +2 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/requires.txt +17 -0
- terbium_parse-0.1.0/src/terbium_parse.egg-info/top_level.txt +1 -0
- terbium_parse-0.1.0/tests/test_smoke.py +82 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 anishfyi
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: terbium-parse
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A god-level algorithmic multi-file parser (PDF/PPTX/XLSX/CSV) that scores its own confidence and only reaches for AI when it is genuinely stuck.
|
|
5
|
+
Author: anishfyi
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://anishfyi.github.io/terbium
|
|
8
|
+
Project-URL: Repository, https://github.com/anishfyi/terbium
|
|
9
|
+
Keywords: parser,pdf,pptx,xlsx,csv,extraction,document,ai,llm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Topic :: Text Processing :: General
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: PyMuPDF>=1.24
|
|
18
|
+
Requires-Dist: python-pptx>=0.6.23
|
|
19
|
+
Requires-Dist: openpyxl>=3.1
|
|
20
|
+
Requires-Dist: pillow>=10.0
|
|
21
|
+
Provides-Extra: anthropic
|
|
22
|
+
Requires-Dist: anthropic>=0.34; extra == "anthropic"
|
|
23
|
+
Provides-Extra: gemini
|
|
24
|
+
Requires-Dist: google-generativeai>=0.7; extra == "gemini"
|
|
25
|
+
Provides-Extra: ai
|
|
26
|
+
Requires-Dist: anthropic>=0.34; extra == "ai"
|
|
27
|
+
Requires-Dist: google-generativeai>=0.7; extra == "ai"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
<div align="center">
|
|
33
|
+
|
|
34
|
+
<img src="https://raw.githubusercontent.com/anishfyi/terbium/main/assets/logo.png" width="150" alt="terbium: a periodic-table tile reading 65 Tb terbium">
|
|
35
|
+
|
|
36
|
+
# terbium
|
|
37
|
+
|
|
38
|
+
**A god-level algorithmic multi-file parser that knows when it is stuck.**
|
|
39
|
+
It reconstructs a document's structure from geometry, scores its own confidence,
|
|
40
|
+
and only reaches for an AI model when the algorithm cannot be sure.
|
|
41
|
+
|
|
42
|
+
[License: MIT](LICENSE)
|
|
43
|
+
[](pyproject.toml)
|
|
44
|
+
[Formats: PDF · PPTX · XLSX · CSV](#what-it-parses)
|
|
45
|
+
[](pyproject.toml)
|
|
46
|
+
|
|
47
|
+
[Website](https://anishfyi.github.io/terbium) · [Trove](https://github.com/anishfyi/trove)
|
|
48
|
+
|
|
49
|
+
</div>
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
A vendor document carries most of its content as text but almost none of its
|
|
54
|
+
structure. A furniture catalogue page is a 2-D matrix: rows are sizes, columns
|
|
55
|
+
are finishes, and the cells are article numbers. Flatten it to text and the grid
|
|
56
|
+
is gone, the columns collapse into a single line, and the numbers lose their
|
|
57
|
+
meaning. terbium rebuilds that structure from the raw position of every word,
|
|
58
|
+
and it is honest about how sure it is.
|
|
59
|
+
|
|
60
|
+
Most parsers do one of two things: they fail silently on the hard pages, or they
|
|
61
|
+
throw the whole document at an LLM and bill you for the easy pages too. terbium
|
|
62
|
+
does neither. It solves what it can algorithmically, scores every record, and
|
|
63
|
+
when a page is genuinely ambiguous it either routes just that page to the right
|
|
64
|
+
model tier, or, if you gave it no key, tells you so in plain words.
|
|
65
|
+
|
|
66
|
+
## The loop
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
FILE  ->  ADAPT  ->  RECONSTRUCT  ->  SCORE  ->  [ESCALATE]
|
|
70
|
+
            |             |              |            |
|
|
71
|
+
        pdf/pptx/    columns, rows,  confidence  hard pages only:
|
|
72
|
+
        xlsx/csv     matrices from   per record  AI if key, else
|
|
73
|
+
                     geometry                    "add a key" message
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
| Phase | What happens |
|
|
77
|
+
|---|---|
|
|
78
|
+
| **Adapt** | One adapter per format normalizes bytes into positioned words + images |
|
|
79
|
+
| **Reconstruct** | Strip repeated headers, split two-page spreads, rebuild columns/rows/matrices from word geometry |
|
|
80
|
+
| **Score** | Every table gets a 0-1 confidence from grid regularity, header presence, and fill |
|
|
81
|
+
| **Escalate** | Below threshold: route the page to Haiku/Sonnet/Opus, or announce that a key would resolve it |
|
|
82
|
+
|
|
83
|
+
## Quickstart
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install terbium-parse
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
import terbium
|
|
91
|
+
|
|
92
|
+
doc = terbium.parse("Furniture Catalogue.pdf") # algorithmic only, no key needed
|
|
93
|
+
print(doc.stats) # Stats(total=725, confident=712, ambiguous=13)
|
|
94
|
+
|
|
95
|
+
for r in doc.records:
|
|
96
|
+
print(r.sku, r.fields)
|
|
97
|
+
|
|
98
|
+
# opt into AI only for the pages the engine could not resolve
|
|
99
|
+
doc = terbium.parse("Furniture Catalogue.pdf",
|
|
100
|
+
schema="furniture",
|
|
101
|
+
ai=terbium.AI(anthropic_key="sk-..."))
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Run it from the shell:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
terbium "Furniture Catalogue.pdf" --schema furniture
|
|
108
|
+
terbium report.xlsx --json out.json
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## What it parses
|
|
112
|
+
|
|
113
|
+
| Format | Engine | How |
|
|
114
|
+
|---|---|---|
|
|
115
|
+
| **PDF** | word-level geometry | rebuild columns/rows/matrices from the position of every word |
|
|
116
|
+
| **PPTX** | python-pptx | native slides, tables and images, straight from the deck structure |
|
|
117
|
+
| **XLSX** | openpyxl | cells, merged ranges propagated, wide/long layouts |
|
|
118
|
+
| **CSV** | stdlib | delimiter, encoding and type inference |
|
|
119
|
+
|
|
120
|
+
PDF gets the full geometry engine because a PDF throws its structure away. PPTX,
|
|
121
|
+
XLSX and CSV already carry native structure, so terbium leans on it and parses
|
|
122
|
+
them cleanly and cheaply.
|
|
123
|
+
|
|
124
|
+
## Confidence and escalation
|
|
125
|
+
|
|
126
|
+
terbium never pretends a shaky parse is solid. When it cannot be sure and no key
|
|
127
|
+
is set, it prints exactly what it could not do:
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
terbium: 712/725 records parsed confidently.
|
|
131
|
+
3 table(s) on page(s) 15, 26, 30 are ambiguous (no product title found above
|
|
132
|
+
the table; sparse matrix: 5/9 cells filled; 2 row(s) do not line up).
|
|
133
|
+
-> set ANTHROPIC_API_KEY or pass ai=terbium.AI(...) recommended tier: Sonnet
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Every record exposes its own `confidence` and the `reasons` behind it, so you can
|
|
137
|
+
filter, sort, or route on it yourself.
|
|
138
|
+
|
|
139
|
+
## The AI lane
|
|
140
|
+
|
|
141
|
+
The AI lane is opt-in and only ever sees the hard pages.
|
|
142
|
+
|
|
143
|
+
- **Routing.** Difficulty scales the tier: trivial to Haiku, moderate to Sonnet,
|
|
144
|
+
hard or low-confidence to Opus. Pin a tier with `terbium.AI(force_tier="opus")`.
|
|
145
|
+
- **Arrange.** A hard table is handed to the routed model with the page's raw
|
|
146
|
+
text and, for PDFs, a rendered image, and rebuilt into a clean matrix.
|
|
147
|
+
- **Vision.** Material icons (FSC, oiled, varnished) and finish swatches live only
|
|
148
|
+
in the pixels; `terbium.read_images(path, page, ai)` reads them with a vision
|
|
149
|
+
model. Note: Nano Banana (Gemini image) is for generation, not reading, so it is
|
|
150
|
+
not on the parse path.
|
|
151
|
+
|
|
152
|
+
Keys come from `terbium.AI(...)` or the `ANTHROPIC_API_KEY` / `GEMINI_API_KEY`
|
|
153
|
+
environment variables.
|
|
154
|
+
|
|
155
|
+
## Schemas
|
|
156
|
+
|
|
157
|
+
A schema turns reconstructed tables into typed records. Ships with two:
|
|
158
|
+
|
|
159
|
+
- `generic` (default): one record per row for grids, one per cell for matrices.
|
|
160
|
+
- `furniture`: product, size, finish, and metric + imperial dimensions per SKU.
|
|
161
|
+
|
|
162
|
+
Add your own by subclassing `terbium.schema.Schema` and registering it.
|
|
163
|
+
|
|
164
|
+
## Install from source
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
git clone https://github.com/anishfyi/terbium.git
|
|
168
|
+
cd terbium
|
|
169
|
+
pip install -e .
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## License
|
|
173
|
+
|
|
174
|
+
MIT. Built by [anishfyi](https://github.com/anishfyi).
|
|
175
|
+
|
|
176
|
+
<div align="center"><sub>terbium · Tb · 65</sub></div>
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
<img src="https://raw.githubusercontent.com/anishfyi/terbium/main/assets/logo.png" width="150" alt="terbium: a periodic-table tile reading 65 Tb terbium">
|
|
4
|
+
|
|
5
|
+
# terbium
|
|
6
|
+
|
|
7
|
+
**A god-level algorithmic multi-file parser that knows when it is stuck.**
|
|
8
|
+
It reconstructs a document's structure from geometry, scores its own confidence,
|
|
9
|
+
and only reaches for an AI model when the algorithm cannot be sure.
|
|
10
|
+
|
|
11
|
+
[License: MIT](LICENSE)
|
|
12
|
+
[](pyproject.toml)
|
|
13
|
+
[Formats: PDF · PPTX · XLSX · CSV](#what-it-parses)
|
|
14
|
+
[](pyproject.toml)
|
|
15
|
+
|
|
16
|
+
[Website](https://anishfyi.github.io/terbium) · [Trove](https://github.com/anishfyi/trove)
|
|
17
|
+
|
|
18
|
+
</div>
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
A vendor document carries most of its content as text but almost none of its
|
|
23
|
+
structure. A furniture catalogue page is a 2-D matrix: rows are sizes, columns
|
|
24
|
+
are finishes, and the cells are article numbers. Flatten it to text and the grid
|
|
25
|
+
is gone, the columns collapse into a single line, and the numbers lose their
|
|
26
|
+
meaning. terbium rebuilds that structure from the raw position of every word,
|
|
27
|
+
and it is honest about how sure it is.
|
|
28
|
+
|
|
29
|
+
Most parsers do one of two things: they fail silently on the hard pages, or they
|
|
30
|
+
throw the whole document at an LLM and bill you for the easy pages too. terbium
|
|
31
|
+
does neither. It solves what it can algorithmically, scores every record, and
|
|
32
|
+
when a page is genuinely ambiguous it either routes just that page to the right
|
|
33
|
+
model tier, or, if you gave it no key, tells you so in plain words.
|
|
34
|
+
|
|
35
|
+
## The loop
|
|
36
|
+
|
|
37
|
+
```
|
|
38
|
+
FILE  ->  ADAPT  ->  RECONSTRUCT  ->  SCORE  ->  [ESCALATE]
|
|
39
|
+
            |             |              |            |
|
|
40
|
+
        pdf/pptx/    columns, rows,  confidence  hard pages only:
|
|
41
|
+
        xlsx/csv     matrices from   per record  AI if key, else
|
|
42
|
+
                     geometry                    "add a key" message
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
| Phase | What happens |
|
|
46
|
+
|---|---|
|
|
47
|
+
| **Adapt** | One adapter per format normalizes bytes into positioned words + images |
|
|
48
|
+
| **Reconstruct** | Strip repeated headers, split two-page spreads, rebuild columns/rows/matrices from word geometry |
|
|
49
|
+
| **Score** | Every table gets a 0-1 confidence from grid regularity, header presence, and fill |
|
|
50
|
+
| **Escalate** | Below threshold: route the page to Haiku/Sonnet/Opus, or announce that a key would resolve it |
|
|
51
|
+
|
|
52
|
+
## Quickstart
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install terbium-parse
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
import terbium
|
|
60
|
+
|
|
61
|
+
doc = terbium.parse("Furniture Catalogue.pdf") # algorithmic only, no key needed
|
|
62
|
+
print(doc.stats) # Stats(total=725, confident=712, ambiguous=13)
|
|
63
|
+
|
|
64
|
+
for r in doc.records:
|
|
65
|
+
print(r.sku, r.fields)
|
|
66
|
+
|
|
67
|
+
# opt into AI only for the pages the engine could not resolve
|
|
68
|
+
doc = terbium.parse("Furniture Catalogue.pdf",
|
|
69
|
+
schema="furniture",
|
|
70
|
+
ai=terbium.AI(anthropic_key="sk-..."))
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Run it from the shell:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
terbium "Furniture Catalogue.pdf" --schema furniture
|
|
77
|
+
terbium report.xlsx --json out.json
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## What it parses
|
|
81
|
+
|
|
82
|
+
| Format | Engine | How |
|
|
83
|
+
|---|---|---|
|
|
84
|
+
| **PDF** | word-level geometry | rebuild columns/rows/matrices from the position of every word |
|
|
85
|
+
| **PPTX** | python-pptx | native slides, tables and images, straight from the deck structure |
|
|
86
|
+
| **XLSX** | openpyxl | cells, merged ranges propagated, wide/long layouts |
|
|
87
|
+
| **CSV** | stdlib | delimiter, encoding and type inference |
|
|
88
|
+
|
|
89
|
+
PDF gets the full geometry engine because a PDF throws its structure away. PPTX,
|
|
90
|
+
XLSX and CSV already carry native structure, so terbium leans on it and parses
|
|
91
|
+
them cleanly and cheaply.
|
|
92
|
+
|
|
93
|
+
## Confidence and escalation
|
|
94
|
+
|
|
95
|
+
terbium never pretends a shaky parse is solid. When it cannot be sure and no key
|
|
96
|
+
is set, it prints exactly what it could not do:
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
terbium: 712/725 records parsed confidently.
|
|
100
|
+
3 table(s) on page(s) 15, 26, 30 are ambiguous (no product title found above
|
|
101
|
+
the table; sparse matrix: 5/9 cells filled; 2 row(s) do not line up).
|
|
102
|
+
-> set ANTHROPIC_API_KEY or pass ai=terbium.AI(...) recommended tier: Sonnet
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Every record exposes its own `confidence` and the `reasons` behind it, so you can
|
|
106
|
+
filter, sort, or route on it yourself.
|
|
107
|
+
|
|
108
|
+
## The AI lane
|
|
109
|
+
|
|
110
|
+
The AI lane is opt-in and only ever sees the hard pages.
|
|
111
|
+
|
|
112
|
+
- **Routing.** Difficulty scales the tier: trivial to Haiku, moderate to Sonnet,
|
|
113
|
+
hard or low-confidence to Opus. Pin a tier with `terbium.AI(force_tier="opus")`.
|
|
114
|
+
- **Arrange.** A hard table is handed to the routed model with the page's raw
|
|
115
|
+
text and, for PDFs, a rendered image, and rebuilt into a clean matrix.
|
|
116
|
+
- **Vision.** Material icons (FSC, oiled, varnished) and finish swatches live only
|
|
117
|
+
in the pixels; `terbium.read_images(path, page, ai)` reads them with a vision
|
|
118
|
+
model. Note: Nano Banana (Gemini image) is for generation, not reading, so it is
|
|
119
|
+
not on the parse path.
|
|
120
|
+
|
|
121
|
+
Keys come from `terbium.AI(...)` or the `ANTHROPIC_API_KEY` / `GEMINI_API_KEY`
|
|
122
|
+
environment variables.
|
|
123
|
+
|
|
124
|
+
## Schemas
|
|
125
|
+
|
|
126
|
+
A schema turns reconstructed tables into typed records. Ships with two:
|
|
127
|
+
|
|
128
|
+
- `generic` (default): one record per row for grids, one per cell for matrices.
|
|
129
|
+
- `furniture`: product, size, finish, and metric + imperial dimensions per SKU.
|
|
130
|
+
|
|
131
|
+
Add your own by subclassing `terbium.schema.Schema` and registering it.
|
|
132
|
+
|
|
133
|
+
## Install from source
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
git clone https://github.com/anishfyi/terbium.git
|
|
137
|
+
cd terbium
|
|
138
|
+
pip install -e .
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## License
|
|
142
|
+
|
|
143
|
+
MIT. Built by [anishfyi](https://github.com/anishfyi).
|
|
144
|
+
|
|
145
|
+
<div align="center"><sub>terbium · Tb · 65</sub></div>
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "terbium-parse"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A god-level algorithmic multi-file parser (PDF/PPTX/XLSX/CSV) that scores its own confidence and only reaches for AI when it is genuinely stuck."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "anishfyi" }]
|
|
13
|
+
keywords = ["parser", "pdf", "pptx", "xlsx", "csv", "extraction", "document", "ai", "llm"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Text Processing :: General",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"PyMuPDF>=1.24",
|
|
22
|
+
"python-pptx>=0.6.23",
|
|
23
|
+
"openpyxl>=3.1",
|
|
24
|
+
"pillow>=10.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.optional-dependencies]
|
|
28
|
+
# The AI lane is opt-in. terbium runs fully without either of these.
|
|
29
|
+
anthropic = ["anthropic>=0.34"]
|
|
30
|
+
gemini = ["google-generativeai>=0.7"]
|
|
31
|
+
ai = ["anthropic>=0.34", "google-generativeai>=0.7"]
|
|
32
|
+
dev = ["pytest>=7"]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://anishfyi.github.io/terbium"
|
|
36
|
+
Repository = "https://github.com/anishfyi/terbium"
|
|
37
|
+
|
|
38
|
+
[project.scripts]
|
|
39
|
+
terbium = "terbium.cli:main"
|
|
40
|
+
|
|
41
|
+
[tool.setuptools.packages.find]
|
|
42
|
+
where = ["src"]
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.package-data]
|
|
45
|
+
terbium = ["py.typed"]
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""terbium - a god-level algorithmic multi-file parser that scores its own
|
|
2
|
+
confidence and only reaches for AI when it is genuinely stuck.
|
|
3
|
+
|
|
4
|
+
import terbium
|
|
5
|
+
doc = terbium.parse("catalogue.pdf")
|
|
6
|
+
print(doc.stats)
|
|
7
|
+
for r in doc.records:
|
|
8
|
+
print(r.sku, r.fields)
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .api import parse, supported_extensions, DEFAULT_THRESHOLD
|
|
13
|
+
from .harness import AI
|
|
14
|
+
from .harness.vision import read_page as read_images
|
|
15
|
+
from .model.document import ParsedDocument, Stats
|
|
16
|
+
from .model.record import Record
|
|
17
|
+
from .model.table import ExtractedTable
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"parse",
|
|
23
|
+
"AI",
|
|
24
|
+
"read_images",
|
|
25
|
+
"supported_extensions",
|
|
26
|
+
"ParsedDocument",
|
|
27
|
+
"Record",
|
|
28
|
+
"ExtractedTable",
|
|
29
|
+
"Stats",
|
|
30
|
+
"__version__",
|
|
31
|
+
]
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""``terbium.parse`` - the one function most users call.
|
|
2
|
+
|
|
3
|
+
Flow: adapt -> assemble tables (native, or reconstructed from PDF geometry) ->
|
|
4
|
+
score confidence -> (optionally) send only the hard tables to AI -> build typed
|
|
5
|
+
records -> if anything is still shaky and no key was given, attach and announce
|
|
6
|
+
an escalation message.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from .documents import get_adapter, supported_extensions
|
|
14
|
+
from .layout import confidence as _confidence
|
|
15
|
+
from .layout import dehead, grid
|
|
16
|
+
from .layout.columns import split_columns
|
|
17
|
+
from .layout.lines import cluster_lines
|
|
18
|
+
from .model.document import ParsedDocument, Stats
|
|
19
|
+
from .model.elements import Page
|
|
20
|
+
from .model.table import ExtractedTable
|
|
21
|
+
from .schema import get_schema
|
|
22
|
+
from .harness import arrange_tables, build_message, resolve
|
|
23
|
+
|
|
24
|
+
DEFAULT_THRESHOLD = 0.72
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _assemble_tables(pages: List[Page]) -> List[ExtractedTable]:
    """Gather the tables of every page, in page order.

    A page that already carries ``native_tables`` contributes them unchanged.
    A PDF page with words (and no native tables) is split into word groups by
    ``split_columns``; each group is clustered into lines, filtered through
    the stripper, and handed to ``grid.extract_tables``. Any other page
    contributes nothing.
    """
    geometry_pages = [pg for pg in pages if pg.source_kind == "pdf" and pg.words]
    # The stripper is built once, from all PDF pages that have words, so it
    # can judge a line against the whole document rather than a single page.
    strip = dehead.build_stripper(geometry_pages) if geometry_pages else None

    collected: List[ExtractedTable] = []
    for pg in pages:
        if pg.native_tables:
            collected.extend(pg.native_tables)
            continue
        if not (pg.source_kind == "pdf" and pg.words):
            continue
        for column_words in split_columns(pg):
            kept = cluster_lines(column_words)
            if strip:
                # Drop every line the stripper flags for this page.
                kept = [line for line in kept if not strip(line, pg)]
            collected.extend(grid.extract_tables(kept, pg))
    return collected
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse(
    path: str,
    schema=None,
    ai=None,
    threshold: float = DEFAULT_THRESHOLD,
    announce: bool = True,
) -> ParsedDocument:
    """Parse a PDF/PPTX/XLSX/CSV file into structured, confidence-scored records.

    ``schema``: "generic" (default) or "furniture", or a Schema instance.
    ``ai``: a ``terbium.AI(...)``, ``True`` (use env keys), or ``None`` (off).
    ``threshold``: confidence below which a table or record counts as
        "ambiguous".
    ``announce``: when tables remain below ``threshold`` and no AI
        configuration was resolved, print the escalation message to stderr.

    Returns a ``ParsedDocument``; its ``escalation`` attribute is set whenever
    at least one table is still below ``threshold`` at the end.
    """
    pages = get_adapter(path).parse(path)
    source_kind = pages[0].source_kind if pages else "unknown"

    tables = _assemble_tables(pages)
    for table in tables:
        _confidence.score_table(table)

    def _below_threshold():
        # Re-evaluated after the AI pass, which may have changed confidences.
        return [tb for tb in tables if tb.confidence < threshold]

    ai_cfg = resolve(ai)
    hard = _below_threshold()
    used_ai = False
    if hard and ai_cfg is not None:
        used_ai = arrange_tables(path, pages, hard, ai_cfg) > 0
        hard = _below_threshold()

    schema_obj = get_schema(schema)
    records = []
    for table in tables:
        # Records are built one table at a time so the table's origin can be
        # copied onto exactly the records that came from it.
        built = schema_obj.build_records([table])
        if table.origin == "ai":
            for rec in built:
                rec.origin = "ai"
        records.extend(built)

    confident = 0
    ambiguous = 0
    for rec in records:
        if rec.confidence >= threshold:
            confident += 1
        elif rec.confidence < threshold:
            ambiguous += 1

    doc = ParsedDocument(
        path=path,
        source_kind=source_kind,
        pages=pages,
        records=records,
        stats=Stats(
            total=len(records),
            confident=confident,
            ambiguous=ambiguous,
            threshold=threshold,
        ),
        used_ai=used_ai,
    )

    if not hard:
        return doc

    doc.escalation = build_message(records, hard, threshold)
    if announce and ai_cfg is None:
        print(doc.escalation, file=sys.stderr)
    return doc
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
__all__ = ["parse", "supported_extensions", "DEFAULT_THRESHOLD"]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""`terbium <file>` - parse from the command line."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from . import __version__
|
|
8
|
+
from .api import parse, supported_extensions
|
|
9
|
+
from .harness import AI
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main(argv=None) -> int:
    """Run the ``terbium`` command line and return the process exit code.

    ``argv``: argument list to parse; ``None`` lets argparse read
        ``sys.argv[1:]``.

    With ``--json OUT`` the document is serialized via ``doc.to_json()`` and
    written to ``OUT`` (or printed to stdout when ``OUT`` is ``-``). Otherwise
    a short summary and a preview of at most ``--limit`` records is printed.
    Returns 0 on both paths.
    """
    ap = argparse.ArgumentParser(
        prog="terbium",
        description="Algorithmic multi-file parser (PDF/PPTX/XLSX/CSV) that knows when it is stuck.",
    )
    ap.add_argument("file", help="path to a " + "/".join(supported_extensions()) + " file")
    ap.add_argument("--schema", default="generic", help="generic (default) or furniture")
    ap.add_argument("--json", metavar="OUT", help="write records as JSON to this path (or - for stdout)")
    ap.add_argument("--ai", action="store_true", help="enable the AI lane using env keys")
    ap.add_argument("--tier", choices=["haiku", "sonnet", "opus"], help="pin the AI model tier")
    ap.add_argument("--limit", type=int, default=12, help="how many records to preview")
    ap.add_argument("--version", action="version", version=f"terbium {__version__}")
    args = ap.parse_args(argv)

    # NOTE(review): --tier only takes effect together with --ai; without it
    # the AI lane stays off and the tier is ignored.
    ai = AI(force_tier=args.tier) if args.ai else None
    doc = parse(args.file, schema=args.schema, ai=ai)

    if args.json:
        payload = doc.to_json()
        if args.json == "-":
            print(payload)
        else:
            with open(args.json, "w", encoding="utf-8") as f:
                f.write(payload)
            print(f"wrote {len(doc.records)} records -> {args.json}", file=sys.stderr)
        return 0

    # A negative --limit would slice from the end of the list (records[:-n])
    # and overstate the "... and N more" count, so treat it as "preview none".
    limit = max(args.limit, 0)

    print(f"terbium {__version__} · {doc.source_kind} · {len(doc.pages)} pages")
    print(f"records: {doc.stats.total} (confident {doc.stats.confident}, ambiguous {doc.stats.ambiguous})")
    if doc.used_ai:
        print("AI lane: engaged on hard tables")
    print("-" * 60)
    for r in doc.records[:limit]:
        flag = "" if r.confidence >= doc.stats.threshold else " [ambiguous]"
        print(f"{r.sku or '-':>8} {r.confidence:.2f} {r.fields}{flag}")
    if doc.stats.total > limit:
        print(f"... and {doc.stats.total - limit} more")
    return 0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Importing this package registers every adapter as a side effect."""
|
|
2
|
+
from .base import DocumentAdapter, get_adapter, register, supported_extensions
|
|
3
|
+
from . import pdf as _pdf
|
|
4
|
+
from . import pptx_adapter as _pptx
|
|
5
|
+
from . import xlsx_adapter as _xlsx
|
|
6
|
+
from . import csv_adapter as _csv
|
|
7
|
+
|
|
8
|
+
__all__ = ["DocumentAdapter", "get_adapter", "register", "supported_extensions"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Adapter interface + registry. One adapter per file format.
|
|
2
|
+
|
|
3
|
+
Adapters do exactly one job: turn bytes on disk into normalized ``Page`` objects
|
|
4
|
+
(words with positions, images, and - when the format exposes it natively - ready
|
|
5
|
+
made tables). Everything smart happens after, on that uniform representation.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
from ..model.elements import Page
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DocumentAdapter(ABC):
    """Abstract base class for file-format adapters.

    A concrete adapter lists the extensions it handles in ``extensions`` and
    implements ``parse``.
    """

    # Extensions served by this adapter; empty on the abstract base.
    extensions: tuple = ()

    @abstractmethod
    def parse(self, path: str) -> List[Page]:
        """Read the file at ``path`` and return its ``Page`` objects."""
        ...
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_REGISTRY: dict = {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register(adapter_cls):
    """Class decorator: file one shared adapter instance under each extension.

    The class is instantiated once with no arguments; that single instance is
    stored in the registry under the lower-cased form of every entry in
    ``adapter_cls.extensions``. The class itself is returned unchanged.
    """
    shared = adapter_cls()
    _REGISTRY.update((ext.lower(), shared) for ext in adapter_cls.extensions)
    return adapter_cls
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_adapter(path: str) -> DocumentAdapter:
    """Return the registered adapter for the file extension of ``path``.

    The extension is taken from ``os.path.splitext``, lower-cased, and looked
    up without its leading dot.

    Raises:
        ValueError: if the path has no extension, or no adapter is registered
            for it. The message lists the supported extensions.
    """
    ext = os.path.splitext(path)[1].lower().lstrip(".")
    adapter = _REGISTRY.get(ext)
    if adapter is not None:
        return adapter

    supported = ", ".join(sorted(_REGISTRY))
    if not ext:
        # Without this branch the error would read "no adapter for '.'",
        # which hides the real problem: there is no extension to dispatch on.
        raise ValueError(
            f"terbium cannot pick an adapter for '{path}': the path has no "
            f"file extension. Supported: {supported}."
        )
    raise ValueError(
        f"terbium has no adapter for '.{ext}'. Supported: {supported}."
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def supported_extensions() -> List[str]:
    """Return every registered extension, sorted ascending."""
    names = list(_REGISTRY)
    names.sort()
    return names
|