bidreader 0.2.0__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bidreader-0.8.1/PKG-INFO +219 -0
- bidreader-0.8.1/README.md +186 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader/__init__.py +1 -1
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader/cli.py +30 -2
- bidreader-0.8.1/bidreader/extract.py +216 -0
- bidreader-0.8.1/bidreader/leveling.py +203 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader/mcp_server.py +24 -3
- bidreader-0.8.1/bidreader/ocr.py +46 -0
- bidreader-0.8.1/bidreader.egg-info/PKG-INFO +219 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader.egg-info/SOURCES.txt +4 -1
- bidreader-0.8.1/bidreader.egg-info/requires.txt +19 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/pyproject.toml +4 -1
- bidreader-0.8.1/tests/test_offline.py +106 -0
- bidreader-0.2.0/PKG-INFO +0 -109
- bidreader-0.2.0/README.md +0 -84
- bidreader-0.2.0/bidreader/extract.py +0 -97
- bidreader-0.2.0/bidreader.egg-info/PKG-INFO +0 -109
- bidreader-0.2.0/bidreader.egg-info/requires.txt +0 -8
- {bidreader-0.2.0 → bidreader-0.8.1}/LICENSE +0 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader.egg-info/dependency_links.txt +0 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader.egg-info/entry_points.txt +0 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/bidreader.egg-info/top_level.txt +0 -0
- {bidreader-0.2.0 → bidreader-0.8.1}/setup.cfg +0 -0
bidreader-0.8.1/PKG-INFO
ADDED
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: bidreader
|
|
3
|
+
Version: 0.8.1
|
|
4
|
+
Summary: Read messy construction sub-quotes, bid packages & spec PDFs into clean structured data — and catch the scope gaps/exclusions vendors bury. Every value cited to its page.
|
|
5
|
+
Author-email: Anmol <anmol@attentive.ai>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/anmolsam/bidreader
|
|
8
|
+
Project-URL: Issues, https://github.com/anmolsam/bidreader/issues
|
|
9
|
+
Keywords: construction,estimating,takeoff,subcontractor,bid,quote,scope,exclusions,spec,AEC,preconstruction,BOQ,LLM,MCP
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Office/Business :: Financial
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: pymupdf>=1.24
|
|
19
|
+
Requires-Dist: certifi>=2024.0
|
|
20
|
+
Provides-Extra: tables
|
|
21
|
+
Requires-Dist: pdfplumber>=0.11; extra == "tables"
|
|
22
|
+
Provides-Extra: xlsx
|
|
23
|
+
Requires-Dist: openpyxl>=3.1; extra == "xlsx"
|
|
24
|
+
Provides-Extra: ocr
|
|
25
|
+
Requires-Dist: pytesseract>=0.3.10; extra == "ocr"
|
|
26
|
+
Requires-Dist: pillow>=10; extra == "ocr"
|
|
27
|
+
Provides-Extra: mcp
|
|
28
|
+
Requires-Dist: mcp>=1.2; extra == "mcp"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
31
|
+
Requires-Dist: openpyxl>=3.1; extra == "dev"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
<div align="center">
|
|
35
|
+
|
|
36
|
+
# 📄 BidReader
|
|
37
|
+
|
|
38
|
+
### Read messy construction sub-quotes, bid packages & spec PDFs into clean structured data — and catch the scope gaps and exclusions vendors bury in the fine print.
|
|
39
|
+
|
|
40
|
+
Every line item carries its **page**, the **exact source text** it came from, and an **arithmetic check** (`qty × unit_price == amount`) — verification on top of extraction, not just an LLM guess.
|
|
41
|
+
|
|
42
|
+
[PyPI](https://pypi.org/project/bidreader/)
|
|
43
|
+
[License: MIT](LICENSE)
|
|
44
|
+
[Python ≥ 3.10](pyproject.toml)
|
|
45
|
+
[MCP server](docs/MCP.md)
|
|
46
|
+
[Runs on free models](docs/FREE_MODELS.md)
|
|
47
|
+
|
|
48
|
+
</div>
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
> *"Manually typing numbers from a PDF into Excel because the formatting is a crime scene… hunting for the one line where a sub quietly excluded 'trash removal' in size-8 font."*
|
|
53
|
+
> — r/Construction, **498 upvotes** ([source](https://www.reddit.com/r/Construction/comments/1pq34ur/))
|
|
54
|
+
|
|
55
|
+
The construction-AI gold rush is all chasing the same crowded, resisted thing — autonomous *takeoff*. The **loudest unmet pain** of estimators is upstream and downstream of it: wrangling crime-scene PDFs into clean data, and **catching what subcontractors quietly excluded** before it costs six figures on the job.
|
|
56
|
+
|
|
57
|
+
No permissively-licensed library did this. **BidReader is that primitive** — MIT, `pip install`, runs on free LLMs, and callable from any AI agent over MCP.
|
|
58
|
+
|
|
59
|
+
## Quickstart (copy-paste, ~30 seconds)
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install bidreader
|
|
63
|
+
|
|
64
|
+
# Use any one — a FREE key works (see docs/FREE_MODELS.md):
|
|
65
|
+
export GEMINI_API_KEY=... # free at aistudio.google.com
|
|
66
|
+
# or export OPENROUTER_API_KEY=... (has :free models)
|
|
67
|
+
# or export REQUESTY_API_KEY=...
|
|
68
|
+
|
|
69
|
+
bidreader your_sub_quote.pdf
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from bidreader import read
|
|
74
|
+
|
|
75
|
+
doc = read("sub_quote.pdf")
|
|
76
|
+
doc.line_items # [{section, description, qty, unit, amount, page}, ...]
|
|
77
|
+
doc.exclusions # [{item, quote, page, risk}, ...] <- the buried stuff
|
|
78
|
+
doc.scope_gaps # trade-standard scope NOT in the doc — confirm before bidding
|
|
79
|
+
doc.to_json()
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Private mode — bids never leave your machine
|
|
83
|
+
|
|
84
|
+
Sub bids are confidential. Run BidReader fully offline against a local [Ollama](https://ollama.com) model — **no document text is sent to any cloud LLM, no API key**:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
ollama pull llama3.1
|
|
88
|
+
export BID_MODEL=ollama/llama3.1
|
|
89
|
+
bidreader your_sub_quote.pdf # 100% local
|
|
90
|
+
```
|
|
91
|
+
Full guide + on-prem/shared-host options: [docs/LOCAL_MODELS.md](docs/LOCAL_MODELS.md).
|
|
92
|
+
|
|
93
|
+
## Real output
|
|
94
|
+
|
|
95
|
+
On a real **$324,240.61 drywall estimate** (72 line items, scanned in seconds), BidReader's scope engine caught a genuinely expensive hole:
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
!! SCOPE GAPS TO CONFIRM:
|
|
99
|
+
- Finishing (taping, mudding, sanding) -- the gypsum line items price the BOARD
|
|
100
|
+
only, not the finishing labor to reach a paint-ready surface.
|
|
101
|
+
- Door hardware -- "Door W/ Frame" lines don't include hinges/locks/closers.
|
|
102
|
+
- Firestopping at rated assemblies -- life-safety scope, commonly omitted.
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
On a real **25-page multi-trade GC estimate**, it parsed **959 line items across 16 CSI divisions** (demolition → concrete → steel → finishes → plumbing → fire suppression), each page-cited. See [docs/RESULTS.md](docs/RESULTS.md) and a full worked example in [`examples/`](examples/).
|
|
106
|
+
|
|
107
|
+
## Scanned PDFs
|
|
108
|
+
|
|
109
|
+
Lots of real bids are scans with no text layer. BidReader auto-detects those and
|
|
110
|
+
falls back to **local Tesseract OCR** — same structured output, still private:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install "bidreader[ocr]" # + tesseract binary: brew install tesseract
|
|
114
|
+
bidreader scanned_quote.pdf # auto-OCR; or force with --ocr always
|
|
115
|
+
```
|
|
116
|
+
Verified on an image-only quote: recovered all line items, total, and exclusions
|
|
117
|
+
purely from the page image.
|
|
118
|
+
|
|
119
|
+
## Bid leveling — compare subs side-by-side → Excel
|
|
120
|
+
|
|
121
|
+
The bid-day workflow: read every sub's quote and level them apples-to-apples.
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
pip install "bidreader[xlsx]"
|
|
125
|
+
bidreader level voltage_bros.pdf current_co.pdf sparky.pdf -o leveling.xlsx
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
It builds an Excel workbook (bidders as columns) with a **scope/exclusion matrix** that exposes the catch every estimator dreads — the *apparent* low bid that quietly carved out scope:
|
|
129
|
+
|
|
130
|
+
```
|
|
131
|
+
Voltage Bros Current Co Sparky
|
|
132
|
+
Bid total $64,300 $108,890 $77,520
|
|
133
|
+
◀ LOW
|
|
134
|
+
EXCLUSION MATRIX (filled = this bidder EXCLUDED it):
|
|
135
|
+
Fire alarm system EXCL p1 — EXCL p1
|
|
136
|
+
Temporary power EXCL p1 — EXCL p1
|
|
137
|
+
Permits — — EXCL p1
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
The "$64,300 low bid" excluded the fire alarm system that the $108,890 bid *includes*, so it is not actually the cheapest once scope is leveled. The workbook also contains per-bidder detail sheets with line items + arithmetic flags. (Try it: `python examples/make_leveling_sample.py` → `examples/leveling_demo.xlsx`.)
|
|
141
|
+
|
|
142
|
+
## Use it from an AI agent (MCP)
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
pip install "bidreader[mcp]"
|
|
146
|
+
```
|
|
147
|
+
```json
|
|
148
|
+
{ "mcpServers": { "bidreader": {
|
|
149
|
+
"command": "bidreader-mcp",
|
|
150
|
+
"env": { "GEMINI_API_KEY": "..." }
|
|
151
|
+
}}}
|
|
152
|
+
```
|
|
153
|
+
Tools: `read_document`, `catch_exclusions`, `extract_line_items`. Now your agent can answer *"which subs excluded fire-stopping across this bid folder?"* Full guide: [docs/MCP.md](docs/MCP.md).
|
|
154
|
+
|
|
155
|
+
## How it works
|
|
156
|
+
|
|
157
|
+
```
|
|
158
|
+
PDF (sub-quote / bid package / spec / schedule)
|
|
159
|
+
→ page-tagged text extraction (PyMuPDF)
|
|
160
|
+
→ chunk by page (scales to 25+ page, 900+ line-item estimates)
|
|
161
|
+
→ LLM structured extraction (line items · exclusions · assumptions · alternates · scope gaps)
|
|
162
|
+
→ merge + page-cited output (JSON / CLI / MCP)
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Text-based, so it runs great on **free** models — see [docs/FREE_MODELS.md](docs/FREE_MODELS.md).
|
|
166
|
+
|
|
167
|
+
## Evidence pack — see what it does on 14 messy bids
|
|
168
|
+
|
|
169
|
+
[**`demo/EVIDENCE.md`**](demo/EVIDENCE.md) runs BidReader across 14 deliberately-messy
|
|
170
|
+
synthetic bids (prose-buried exclusions, fine-print footnotes, two-column layouts,
|
|
171
|
+
planted arithmetic errors, multi-page, **scanned** image-only docs) and reports
|
|
172
|
+
honestly — wins *and* failures:
|
|
173
|
+
|
|
174
|
+
- **100%** line-item recall · **97%** exclusion-catch · **100%** bid-total · **3/3** planted arithmetic errors caught · **2/2** scanned docs OCR'd
|
|
175
|
+
- One honest miss documented: a low-DPI scan dropped 1 of 3 exclusions.
|
|
176
|
+
- Two committed Excel leveling workbooks (electrical 4-sub, drywall 3-sub) showing the apparent-low-bid-that-carved-out-scope.
|
|
177
|
+
|
|
178
|
+
Reproduce: `python demo/make_corpus.py && python demo/run_eval.py`.
|
|
179
|
+
|
|
180
|
+
## Benchmark
|
|
181
|
+
|
|
182
|
+
Reproducible ground-truth benchmark ([`benchmark/`](benchmark/)) — synthetic docs we author, so truth is exact and the PDFs ship in-repo:
|
|
183
|
+
|
|
184
|
+
| metric | score |
|
|
185
|
+
|---|---|
|
|
186
|
+
| Line-item recall | **100%** |
|
|
187
|
+
| Exclusion-catch recall (incl. prose-buried) | **100%** |
|
|
188
|
+
| No-hallucination rate (clean docs) | **100%** |
|
|
189
|
+
| Bid-total accuracy (±2%) | **100%** |
|
|
190
|
+
| Arithmetic errors caught | **2/2**, 0 false positives |
|
|
191
|
+
|
|
192
|
+
Honest caveat: synthetic docs are cleaner than real scans — these scores are an **upper bound**, measured on well-structured input, not a claim about messy real bids. Uncontrolled real-document results are in [docs/RESULTS.md](docs/RESULTS.md). Reproduce: `python benchmark/generate.py && python benchmark/run.py`.
|
|
193
|
+
|
|
194
|
+
## Why this, and why now — the evidence
|
|
195
|
+
|
|
196
|
+
A full write-up (problem, market data, prior-art gap, method, results) is in **[PAPER.md](PAPER.md)**. The short version:
|
|
197
|
+
|
|
198
|
+
- **Loudest, most-shared pain** in construction-estimating communities (the 498-upvote thread above; more cited in the paper).
|
|
199
|
+
- **It works *today*** — document extraction is LLM-native, unlike floor-plan symbol detection (academic SOTA tops out ~83% mAP).
|
|
200
|
+
- **Empty slot** — `bidreader`, `blueprint-parser`, `pytakeoff` were all unclaimed on PyPI; the only adjacent tools are AGPL/non-commercial or abandoned toys.
|
|
201
|
+
- **Broadest base** — every estimator *and* every construction-AI builder needs document extraction. The library is the dependency; the MCP server is the agent-era surface.
|
|
202
|
+
|
|
203
|
+
## Roadmap
|
|
204
|
+
|
|
205
|
+
- [x] Multi-quote **leveling** → Excel (compare subs side-by-side) — v0.6
|
|
206
|
+
- [x] Fully-local / private mode via **Ollama** — v0.7
|
|
207
|
+
- [x] **Scanned-PDF OCR** (local Tesseract) — v0.8
|
|
208
|
+
- [ ] Source-grounded **click-back review UI** (data already carries `source_text`)
|
|
209
|
+
- [ ] Revision/addendum **diff** ("what changed between Addendum 3 and 4")
|
|
210
|
+
- [ ] CSI/UNIFORMAT mapping + UOM normalization for estimator-grade leveling
|
|
211
|
+
- [ ] Region/trade notation packs (AISC, BS/IS, AUS)
|
|
212
|
+
|
|
213
|
+
## Contributing
|
|
214
|
+
|
|
215
|
+
PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Good first issues: add a notation parser, a new export format, or a test fixture.
|
|
216
|
+
|
|
217
|
+
## License
|
|
218
|
+
|
|
219
|
+
[MIT](LICENSE) © 2026. Cite via [CITATION.cff](CITATION.cff).
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# 📄 BidReader
|
|
4
|
+
|
|
5
|
+
### Read messy construction sub-quotes, bid packages & spec PDFs into clean structured data — and catch the scope gaps and exclusions vendors bury in the fine print.
|
|
6
|
+
|
|
7
|
+
Every line item carries its **page**, the **exact source text** it came from, and an **arithmetic check** (`qty × unit_price == amount`) — verification on top of extraction, not just an LLM guess.
|
|
8
|
+
|
|
9
|
+
[PyPI](https://pypi.org/project/bidreader/)
|
|
10
|
+
[License: MIT](LICENSE)
|
|
11
|
+
[Python ≥ 3.10](pyproject.toml)
|
|
12
|
+
[MCP server](docs/MCP.md)
|
|
13
|
+
[Runs on free models](docs/FREE_MODELS.md)
|
|
14
|
+
|
|
15
|
+
</div>
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
> *"Manually typing numbers from a PDF into Excel because the formatting is a crime scene… hunting for the one line where a sub quietly excluded 'trash removal' in size-8 font."*
|
|
20
|
+
> — r/Construction, **498 upvotes** ([source](https://www.reddit.com/r/Construction/comments/1pq34ur/))
|
|
21
|
+
|
|
22
|
+
The construction-AI gold rush is all chasing the same crowded, resisted thing — autonomous *takeoff*. The **loudest unmet pain** of estimators is upstream and downstream of it: wrangling crime-scene PDFs into clean data, and **catching what subcontractors quietly excluded** before it costs six figures on the job.
|
|
23
|
+
|
|
24
|
+
No permissively-licensed library did this. **BidReader is that primitive** — MIT, `pip install`, runs on free LLMs, and callable from any AI agent over MCP.
|
|
25
|
+
|
|
26
|
+
## Quickstart (copy-paste, ~30 seconds)
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install bidreader
|
|
30
|
+
|
|
31
|
+
# Use any one — a FREE key works (see docs/FREE_MODELS.md):
|
|
32
|
+
export GEMINI_API_KEY=... # free at aistudio.google.com
|
|
33
|
+
# or export OPENROUTER_API_KEY=... (has :free models)
|
|
34
|
+
# or export REQUESTY_API_KEY=...
|
|
35
|
+
|
|
36
|
+
bidreader your_sub_quote.pdf
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from bidreader import read
|
|
41
|
+
|
|
42
|
+
doc = read("sub_quote.pdf")
|
|
43
|
+
doc.line_items # [{section, description, qty, unit, amount, page}, ...]
|
|
44
|
+
doc.exclusions # [{item, quote, page, risk}, ...] <- the buried stuff
|
|
45
|
+
doc.scope_gaps # trade-standard scope NOT in the doc — confirm before bidding
|
|
46
|
+
doc.to_json()
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Private mode — bids never leave your machine
|
|
50
|
+
|
|
51
|
+
Sub bids are confidential. Run BidReader fully offline against a local [Ollama](https://ollama.com) model — **no document text is sent to any cloud LLM, no API key**:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
ollama pull llama3.1
|
|
55
|
+
export BID_MODEL=ollama/llama3.1
|
|
56
|
+
bidreader your_sub_quote.pdf # 100% local
|
|
57
|
+
```
|
|
58
|
+
Full guide + on-prem/shared-host options: [docs/LOCAL_MODELS.md](docs/LOCAL_MODELS.md).
|
|
59
|
+
|
|
60
|
+
## Real output
|
|
61
|
+
|
|
62
|
+
On a real **$324,240.61 drywall estimate** (72 line items, scanned in seconds), BidReader's scope engine caught a genuinely expensive hole:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
!! SCOPE GAPS TO CONFIRM:
|
|
66
|
+
- Finishing (taping, mudding, sanding) -- the gypsum line items price the BOARD
|
|
67
|
+
only, not the finishing labor to reach a paint-ready surface.
|
|
68
|
+
- Door hardware -- "Door W/ Frame" lines don't include hinges/locks/closers.
|
|
69
|
+
- Firestopping at rated assemblies -- life-safety scope, commonly omitted.
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
On a real **25-page multi-trade GC estimate**, it parsed **959 line items across 16 CSI divisions** (demolition → concrete → steel → finishes → plumbing → fire suppression), each page-cited. See [docs/RESULTS.md](docs/RESULTS.md) and a full worked example in [`examples/`](examples/).
|
|
73
|
+
|
|
74
|
+
## Scanned PDFs
|
|
75
|
+
|
|
76
|
+
Lots of real bids are scans with no text layer. BidReader auto-detects those and
|
|
77
|
+
falls back to **local Tesseract OCR** — same structured output, still private:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install "bidreader[ocr]" # + tesseract binary: brew install tesseract
|
|
81
|
+
bidreader scanned_quote.pdf # auto-OCR; or force with --ocr always
|
|
82
|
+
```
|
|
83
|
+
Verified on an image-only quote: recovered all line items, total, and exclusions
|
|
84
|
+
purely from the page image.
|
|
85
|
+
|
|
86
|
+
## Bid leveling — compare subs side-by-side → Excel
|
|
87
|
+
|
|
88
|
+
The bid-day workflow: read every sub's quote and level them apples-to-apples.
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
pip install "bidreader[xlsx]"
|
|
92
|
+
bidreader level voltage_bros.pdf current_co.pdf sparky.pdf -o leveling.xlsx
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
It builds an Excel workbook (bidders as columns) with a **scope/exclusion matrix** that exposes the catch every estimator dreads — the *apparent* low bid that quietly carved out scope:
|
|
96
|
+
|
|
97
|
+
```
|
|
98
|
+
Voltage Bros Current Co Sparky
|
|
99
|
+
Bid total $64,300 $108,890 $77,520
|
|
100
|
+
◀ LOW
|
|
101
|
+
EXCLUSION MATRIX (filled = this bidder EXCLUDED it):
|
|
102
|
+
Fire alarm system EXCL p1 — EXCL p1
|
|
103
|
+
Temporary power EXCL p1 — EXCL p1
|
|
104
|
+
Permits — — EXCL p1
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
The "$64,300 low bid" excluded the fire alarm system that the $108,890 bid *includes*, so it is not actually the cheapest once scope is leveled. The workbook also contains per-bidder detail sheets with line items + arithmetic flags. (Try it: `python examples/make_leveling_sample.py` → `examples/leveling_demo.xlsx`.)
|
|
108
|
+
|
|
109
|
+
## Use it from an AI agent (MCP)
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
pip install "bidreader[mcp]"
|
|
113
|
+
```
|
|
114
|
+
```json
|
|
115
|
+
{ "mcpServers": { "bidreader": {
|
|
116
|
+
"command": "bidreader-mcp",
|
|
117
|
+
"env": { "GEMINI_API_KEY": "..." }
|
|
118
|
+
}}}
|
|
119
|
+
```
|
|
120
|
+
Tools: `read_document`, `catch_exclusions`, `extract_line_items`. Now your agent can answer *"which subs excluded fire-stopping across this bid folder?"* Full guide: [docs/MCP.md](docs/MCP.md).
|
|
121
|
+
|
|
122
|
+
## How it works
|
|
123
|
+
|
|
124
|
+
```
|
|
125
|
+
PDF (sub-quote / bid package / spec / schedule)
|
|
126
|
+
→ page-tagged text extraction (PyMuPDF)
|
|
127
|
+
→ chunk by page (scales to 25+ page, 900+ line-item estimates)
|
|
128
|
+
→ LLM structured extraction (line items · exclusions · assumptions · alternates · scope gaps)
|
|
129
|
+
→ merge + page-cited output (JSON / CLI / MCP)
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
Text-based, so it runs great on **free** models — see [docs/FREE_MODELS.md](docs/FREE_MODELS.md).
|
|
133
|
+
|
|
134
|
+
## Evidence pack — see what it does on 14 messy bids
|
|
135
|
+
|
|
136
|
+
[**`demo/EVIDENCE.md`**](demo/EVIDENCE.md) runs BidReader across 14 deliberately-messy
|
|
137
|
+
synthetic bids (prose-buried exclusions, fine-print footnotes, two-column layouts,
|
|
138
|
+
planted arithmetic errors, multi-page, **scanned** image-only docs) and reports
|
|
139
|
+
honestly — wins *and* failures:
|
|
140
|
+
|
|
141
|
+
- **100%** line-item recall · **97%** exclusion-catch · **100%** bid-total · **3/3** planted arithmetic errors caught · **2/2** scanned docs OCR'd
|
|
142
|
+
- One honest miss documented: a low-DPI scan dropped 1 of 3 exclusions.
|
|
143
|
+
- Two committed Excel leveling workbooks (electrical 4-sub, drywall 3-sub) showing the apparent-low-bid-that-carved-out-scope.
|
|
144
|
+
|
|
145
|
+
Reproduce: `python demo/make_corpus.py && python demo/run_eval.py`.
|
|
146
|
+
|
|
147
|
+
## Benchmark
|
|
148
|
+
|
|
149
|
+
Reproducible ground-truth benchmark ([`benchmark/`](benchmark/)) — synthetic docs we author, so truth is exact and the PDFs ship in-repo:
|
|
150
|
+
|
|
151
|
+
| metric | score |
|
|
152
|
+
|---|---|
|
|
153
|
+
| Line-item recall | **100%** |
|
|
154
|
+
| Exclusion-catch recall (incl. prose-buried) | **100%** |
|
|
155
|
+
| No-hallucination rate (clean docs) | **100%** |
|
|
156
|
+
| Bid-total accuracy (±2%) | **100%** |
|
|
157
|
+
| Arithmetic errors caught | **2/2**, 0 false positives |
|
|
158
|
+
|
|
159
|
+
Honest caveat: synthetic docs are cleaner than real scans — these scores are an **upper bound**, measured on well-structured input, not a claim about messy real bids. Uncontrolled real-document results are in [docs/RESULTS.md](docs/RESULTS.md). Reproduce: `python benchmark/generate.py && python benchmark/run.py`.
|
|
160
|
+
|
|
161
|
+
## Why this, and why now — the evidence
|
|
162
|
+
|
|
163
|
+
A full write-up (problem, market data, prior-art gap, method, results) is in **[PAPER.md](PAPER.md)**. The short version:
|
|
164
|
+
|
|
165
|
+
- **Loudest, most-shared pain** in construction-estimating communities (the 498-upvote thread above; more cited in the paper).
|
|
166
|
+
- **It works *today*** — document extraction is LLM-native, unlike floor-plan symbol detection (academic SOTA tops out ~83% mAP).
|
|
167
|
+
- **Empty slot** — `bidreader`, `blueprint-parser`, `pytakeoff` were all unclaimed on PyPI; the only adjacent tools are AGPL/non-commercial or abandoned toys.
|
|
168
|
+
- **Broadest base** — every estimator *and* every construction-AI builder needs document extraction. The library is the dependency; the MCP server is the agent-era surface.
|
|
169
|
+
|
|
170
|
+
## Roadmap
|
|
171
|
+
|
|
172
|
+
- [x] Multi-quote **leveling** → Excel (compare subs side-by-side) — v0.6
|
|
173
|
+
- [x] Fully-local / private mode via **Ollama** — v0.7
|
|
174
|
+
- [x] **Scanned-PDF OCR** (local Tesseract) — v0.8
|
|
175
|
+
- [ ] Source-grounded **click-back review UI** (data already carries `source_text`)
|
|
176
|
+
- [ ] Revision/addendum **diff** ("what changed between Addendum 3 and 4")
|
|
177
|
+
- [ ] CSI/UNIFORMAT mapping + UOM normalization for estimator-grade leveling
|
|
178
|
+
- [ ] Region/trade notation packs (AISC, BS/IS, AUS)
|
|
179
|
+
|
|
180
|
+
## Contributing
|
|
181
|
+
|
|
182
|
+
PRs welcome — see [CONTRIBUTING.md](CONTRIBUTING.md). Good first issues: add a notation parser, a new export format, or a test fixture.
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
[MIT](LICENSE) © 2026. Cite via [CITATION.cff](CITATION.cff).
|
|
@@ -3,13 +3,35 @@ import sys, json
|
|
|
3
3
|
from .extract import read
|
|
4
4
|
|
|
5
5
|
|
|
6
|
+
def _level(argv):
    """Handle ``bidreader level``: level several quotes into one Excel workbook.

    Args:
        argv: Command-line arguments that follow the ``level`` keyword.
            Entries ending in ``.pdf`` (case-insensitive) are the quotes to
            level; an optional ``-o <path>`` names the output workbook
            (default ``leveling.xlsx``).

    Prints the usage line and exits with status 1 when ``-o`` is given
    without a value or when fewer than two PDF paths are supplied.
    """
    usage = "usage: bidreader level <quote1.pdf> <quote2.pdf> [...] [-o leveling.xlsx]"
    out = "leveling.xlsx"
    if "-o" in argv:
        i = argv.index("-o")
        if i + 1 >= len(argv):
            # A trailing "-o" has no value; argv[i + 1] would raise IndexError,
            # so report it as a usage error instead.
            print(usage)
            sys.exit(1)
        out = argv[i + 1]
        argv = argv[:i] + argv[i + 2:]
    pdfs = [a for a in argv if a.lower().endswith(".pdf")]
    if len(pdfs) < 2:
        print(usage)
        sys.exit(1)
    # Imported only once the arguments are valid, so usage errors do not
    # depend on the leveling module being importable.
    from .leveling import level, to_xlsx, summary_text
    print(f"leveling {len(pdfs)} bids ...", file=sys.stderr)
    result = level(pdfs)
    print(summary_text(result))
    to_xlsx(result, out)
    print(f"\nwrote {out}")
|
|
19
|
+
|
|
20
|
+
|
|
6
21
|
def main():
|
|
7
22
|
args = [a for a in sys.argv[1:]]
|
|
23
|
+
if args and args[0] == "level":
|
|
24
|
+
_level(args[1:]); return
|
|
8
25
|
as_json = "--json" in args
|
|
26
|
+
ocr = "auto"
|
|
27
|
+
if "--ocr" in args:
|
|
28
|
+
i = args.index("--ocr"); ocr = args[i + 1] if i + 1 < len(args) else "auto"
|
|
29
|
+
args = args[:i] + args[i + 2:]
|
|
9
30
|
paths = [a for a in args if not a.startswith("-")]
|
|
10
31
|
if not paths:
|
|
11
|
-
print("usage: bidreader <document.pdf> [--json]"
|
|
12
|
-
|
|
32
|
+
print("usage: bidreader <document.pdf> [--json] [--ocr auto|always|never]\n"
|
|
33
|
+
" bidreader level <q1.pdf> <q2.pdf> [...] [-o leveling.xlsx]"); sys.exit(1)
|
|
34
|
+
d = read(paths[0], ocr=ocr)
|
|
13
35
|
if as_json:
|
|
14
36
|
print(d.to_json()); return
|
|
15
37
|
print("=" * 74)
|
|
@@ -24,6 +46,12 @@ def main():
|
|
|
24
46
|
f"{str(li.get('qty') or ''):>8s}{str(li.get('unit') or ''):>5s}{amt:>13s} p{li.get('page','?')}")
|
|
25
47
|
if d.get('bid_total'):
|
|
26
48
|
print(f" {'BID TOTAL':56s}{'$' + format(d['bid_total'], ',.2f'):>13s}")
|
|
49
|
+
mm = [li for li in d.line_items if li.get('math_check') == 'mismatch']
|
|
50
|
+
if mm:
|
|
51
|
+
print(f"\n!! ARITHMETIC MISMATCHES ({len(mm)}) — qty x unit_price != amount:")
|
|
52
|
+
for li in mm[:10]:
|
|
53
|
+
print(f" - p{li.get('page','?')} {str(li.get('description',''))[:46]}: "
|
|
54
|
+
f"stated {li.get('amount')}, computed {li.get('math_expected')}")
|
|
27
55
|
print(f"\n!! EXCLUSIONS CAUGHT ({len(d.exclusions)}):")
|
|
28
56
|
for e in d.exclusions:
|
|
29
57
|
print(f" - {e.get('item','?')} (page {e.get('page','?')})")
|