bidreader 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
bidreader/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .extract import read, Doc
2
+ __all__ = ["read", "Doc"]
3
+ __version__ = "0.2.0"
bidreader/cli.py ADDED
@@ -0,0 +1,39 @@
1
+ """`bidreader <file.pdf>` — print an estimator report, or `--json` for piping."""
2
+ import sys, json
3
+ from .extract import read
4
+
5
+
6
def main():
    """Run the ``bidreader`` command-line interface.

    The first non-option argument is taken as the PDF path. With ``--json``
    the extracted document is printed as JSON; otherwise a fixed-width
    estimator report is printed. When no path is given, a usage line is
    written to stderr and the process exits with status 1.

    The extracted data comes from an LLM whose schema allows ``null`` for
    most fields, so every field is rendered defensively instead of assuming
    a string or number is present.
    """
    args = sys.argv[1:]
    as_json = "--json" in args
    paths = [a for a in args if not a.startswith("-")]
    if not paths:
        # Usage errors belong on stderr so they never pollute piped output.
        print("usage: bidreader <document.pdf> [--json]", file=sys.stderr)
        sys.exit(1)

    d = read(paths[0])
    if as_json:
        print(d.to_json())
        return

    def field(key, default='?'):
        # Treat a JSON null the same as a missing key.
        return d.get(key) or default

    def money(value):
        # Amounts may arrive as null or as a pre-formatted string.
        if value is None:
            return ''
        try:
            return '$' + format(float(value), ',.2f')
        except (TypeError, ValueError):
            return str(value)

    line_items = d.line_items or []
    exclusions = d.exclusions or []
    scope_gaps = d.scope_gaps or []

    print("=" * 74)
    print(f"BIDREADER | {d['_source']} ({d['_pages']}p)")
    print("=" * 74)
    print(f"{field('doc_type')} | {field('vendor')} | trade: {field('trade')} | {field('currency', '')}")
    print(f"Project: {field('project')}")

    print(f"\nLINE ITEMS ({len(line_items)}):")
    for li in line_items:
        section = str(li.get('section') or '-')
        desc = str(li.get('description') or '')[:40]
        qty = str(li.get('qty') or '')
        unit = str(li.get('unit') or '')
        amt = money(li.get('amount'))
        page = li.get('page', '?')
        print(f" {section:10s}{desc:41s}{qty:>8s}{unit:>5s}{amt:>13s} p{page}")
    if d.get('bid_total'):
        print(f" {'BID TOTAL':56s}{money(d['bid_total']):>13s}")

    print(f"\n!! EXCLUSIONS CAUGHT ({len(exclusions)}):")
    for e in exclusions:
        # str(... or '') keeps a null quote from crashing the slice below.
        quote = str(e.get('quote') or '')[:120]
        print(f" - {e.get('item') or '?'} (page {e.get('page', '?')})")
        print(f" \"{quote}\"")
        print(f" risk: {e.get('risk') or ''}")

    if scope_gaps:
        print(f"\nSCOPE GAPS TO CONFIRM ({len(scope_gaps)}):")
        for g in scope_gaps:
            print(f" - {g.get('missing') or ''} -- {g.get('why') or ''}")
36
+
37
+
38
+ if __name__ == "__main__":
39
+ main()
bidreader/extract.py ADDED
@@ -0,0 +1,97 @@
1
+ """
2
+ BidReader — read messy construction documents (sub-quotes, bid packages, spec
3
+ sections, schedules) into clean structured data, and CATCH the scope gaps /
4
+ exclusions vendors bury. Every value cites its page + the exact source text.
5
+
6
+ The valued, repeated estimator pain (r/Construction, 498 upvotes): not the
7
+ takeoff — it's wrangling "crime-scene formatting" PDFs into clean tables and
8
+ finding the one line where a sub quietly excluded something in size-8 font.
9
+
10
+ LLM-native (works today). Provider-agnostic key from env: REQUESTY / OPENROUTER
11
+ / GEMINI. MIT.
12
+ """
13
+ from __future__ import annotations
14
+ import os, json, ssl, urllib.request, certifi
15
+ import fitz
16
+
17
+ SSLCTX = ssl.create_default_context(cafile=certifi.where())
18
+ MODEL = os.environ.get("BID_MODEL", "google/gemini-2.5-flash")
19
+
20
+ SCHEMA_PROMPT = """You are a construction estimating assistant reading a vendor/subcontractor document
21
+ (quote, bid, proposal, spec section, or schedule). Read the TEXT (page-tagged) and return STRICT JSON only:
22
+ {
23
+ "doc_type": "<sub-quote|bid package|spec section|schedule|invoice|other>",
24
+ "vendor": "<company or null>", "project": "<project/title or null>",
25
+ "trade": "<trade e.g. Drywall, Electrical or null>", "currency": "<e.g. USD or null>",
26
+ "bid_total": <number or null>,
27
+ "line_items": [{"section":"<csi/section or null>","description":"<text>","qty":<num|null>,"unit":"<EA/SF/LF/LS|null>","unit_price":<num|null>,"amount":<num|null>,"page":<int>}],
28
+ "exclusions": [{"item":"<short label>","quote":"<EXACT text as written>","page":<int>,"risk":"<one line: why this matters / who eats the cost>"}],
29
+ "assumptions": [{"text":"<exact>","page":<int>}],
30
+ "alternates": [{"text":"<exact>","amount":<num|null>,"page":<int>}],
31
+ "scope_gaps": [{"missing":"<scope likely NOT covered>","why":"<why an estimator should confirm>"}],
32
+ "notes": "<one line on confidence/legibility>"
33
+ }
34
+ Rules: exclusions are CRITICAL — hunt everywhere, including fine print, footnotes, "by others", "not included",
35
+ "excludes", "assumes", "clarifications". Quote them verbatim with the page. Do not invent values; use null if unsure.
36
+ For scope_gaps, infer trade-standard scope a vendor commonly omits that is NOT mentioned in this doc."""
37
+
38
+
39
def _page_text(doc):
    """Return the text of all non-blank pages, each tagged ``[PAGE n]``.

    ``doc`` is iterated page by page; each page's ``get_text()`` result is
    stripped, blank pages are dropped, and the rest are joined with a blank
    line between them. Page numbers are 1-based and count skipped pages too.
    """
    stripped_pages = (page.get_text().strip() for page in doc)
    return "\n\n".join(
        f"[PAGE {number}]\n{body}"
        for number, body in enumerate(stripped_pages, start=1)
        if body
    )
46
+
47
+
48
def _post_json(url, payload, headers):
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply."""
    request = urllib.request.Request(
        url,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", **headers},
    )
    # The context manager closes the HTTP response (and its socket) even if
    # decoding fails, instead of leaving it to the garbage collector.
    with urllib.request.urlopen(request, timeout=180, context=SSLCTX) as response:
        return json.load(response)


def _llm(text):
    """Send the page-tagged document text to the configured LLM.

    The provider is chosen from the environment, in priority order:
    ``REQUESTY_API_KEY``, ``OPENROUTER_API_KEY``, then ``GEMINI_API_KEY`` /
    ``GOOGLE_API_KEY``. The first two use an OpenAI-style chat-completions
    endpoint; the last calls the Gemini ``generateContent`` endpoint directly.

    Args:
        text: Document text; only the first 120000 characters are sent.

    Returns:
        The model's reply parsed by ``_clean``.

    Raises:
        RuntimeError: If none of the supported API-key variables is set.
    """
    rq = os.environ.get("REQUESTY_API_KEY")
    ork = os.environ.get("OPENROUTER_API_KEY")
    gk = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    # Check configuration first so a missing key fails before any other work.
    if not (rq or ork or gk):
        raise RuntimeError("Set REQUESTY_API_KEY / OPENROUTER_API_KEY / GEMINI_API_KEY")

    user = SCHEMA_PROMPT + "\n\n=== DOCUMENT TEXT ===\n" + text[:120000]

    if rq or ork:
        if rq:
            base, key = "https://router.requesty.ai/v1/chat/completions", rq
        else:
            base, key = "https://openrouter.ai/api/v1/chat/completions", ork
        body = {"model": MODEL, "temperature": 0, "max_tokens": 16000, "reasoning_effort": "low",
                "messages": [{"role": "user", "content": user}]}
        r = _post_json(base, body, {"Authorization": f"Bearer {key}"})
        return _clean(r["choices"][0]["message"]["content"])

    # Gemini expects the bare model id, without the "google/" router prefix.
    m = MODEL.split("/")[-1]
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{m}:generateContent"
    body = {"contents": [{"parts": [{"text": user}]}],
            "generationConfig": {"response_mime_type": "application/json", "temperature": 0}}
    # Send the key as a header rather than a ?key= query parameter, so it does
    # not end up in URLs that get logged or shown in tracebacks.
    r = _post_json(url, body, {"x-goog-api-key": gk})
    return _clean(r["candidates"][0]["content"]["parts"][0]["text"])
70
+
71
+
72
def _clean(s):
    """Parse the model's reply into Python data.

    A leading "```json" or "```" fence and a trailing "```" fence are removed
    before parsing. If that still is not valid JSON (for example the fence tag
    is upper-case, or the model wrapped the object in prose), the span from
    the first ``{`` to the last ``}`` is parsed instead.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If no parseable JSON can be found.
    """
    s = s.strip().removeprefix("```json").removeprefix("```").removesuffix("```").strip()
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        start, end = s.find("{"), s.rfind("}")
        if start == -1 or end <= start:
            # Nothing that looks like an object: surface the original error.
            raise
        return json.loads(s[start:end + 1])
75
+
76
+
77
class Doc(dict):
    """Extraction result: a plain ``dict`` with convenience accessors.

    The list accessors always return a list, including when the key is absent
    or was returned as JSON ``null``, so callers can iterate or take ``len()``
    without checking first.
    """

    @property
    def line_items(self):
        """The ``line_items`` entries, or ``[]`` when missing or null."""
        return self.get("line_items") or []

    @property
    def exclusions(self):
        """The ``exclusions`` entries, or ``[]`` when missing or null."""
        return self.get("exclusions") or []

    @property
    def scope_gaps(self):
        """The ``scope_gaps`` entries, or ``[]`` when missing or null."""
        return self.get("scope_gaps") or []

    def to_json(self, **kw):
        """Serialize to JSON; extra keyword arguments go to ``json.dumps``.

        ``indent`` defaults to 2 and can be overridden by the caller.
        """
        kw.setdefault("indent", 2)
        return json.dumps(self, **kw)
86
+
87
+
88
def read(path: str) -> Doc:
    """Read a construction PDF into structured, page-cited data.

    Args:
        path: Location of the PDF, as a string or ``os.PathLike``.

    Returns:
        A ``Doc`` holding the model's extraction plus two bookkeeping keys:
        ``_source`` (the file name) and ``_pages`` (the page count).

    Raises:
        RuntimeError: If the PDF yields fewer than 40 characters of text,
            which is treated as a scanned document.
    """
    # Take what we need from the PDF and close it before the slow LLM call,
    # so the file handle is not held open or leaked.
    with fitz.open(path) as doc:
        text = _page_text(doc)
        pages = doc.page_count
    if len(text) < 40:
        raise RuntimeError("No extractable text (scanned PDF) — vision OCR path TODO.")
    data = _llm(text)
    # basename copes with the platform's separators and with Path objects.
    data["_source"] = os.path.basename(os.fspath(path))
    data["_pages"] = pages
    return Doc(data)
@@ -0,0 +1,61 @@
1
+ """BidReader MCP server — lets any agent (Claude Desktop, Cursor, etc.) read a
2
+ construction document and get structured data + caught exclusions.
3
+
4
+ Run: bidreader-mcp (stdio transport)
5
+ Needs an LLM key in env: REQUESTY_API_KEY / OPENROUTER_API_KEY / GEMINI_API_KEY.
6
+
7
+ Claude Desktop / Cursor config:
8
+ {"mcpServers": {"bidreader": {"command": "bidreader-mcp",
9
+ "env": {"REQUESTY_API_KEY": "rqsty-..."}}}}
10
+ """
11
+ from __future__ import annotations
12
+ import os
13
+ from mcp.server.fastmcp import FastMCP
14
+ from .extract import read
15
+
16
+ mcp = FastMCP("bidreader")
17
+
18
+
19
def _check_key():
    """Raise ``RuntimeError`` unless a supported LLM API key is set.

    A key counts as set when its environment variable is present and
    non-empty.
    """
    key_vars = ("REQUESTY_API_KEY", "OPENROUTER_API_KEY", "GEMINI_API_KEY", "GOOGLE_API_KEY")
    if any(os.environ.get(name) for name in key_vars):
        return
    raise RuntimeError("No LLM key in env (REQUESTY_API_KEY / OPENROUTER_API_KEY / GEMINI_API_KEY).")
23
+
24
+
25
@mcp.tool()
def read_document(path: str) -> dict:
    """Read a construction PDF (sub-quote, bid package, spec section, schedule) into
    clean structured data: line_items, exclusions, assumptions, alternates, scope_gaps —
    every value cited to its page. `path` is an absolute path to a local PDF.
    Returns the full structured object."""
    # Fail with a clear configuration error before touching the PDF.
    _check_key()
    document = read(path)
    # Return a plain dict rather than the Doc subclass.
    return dict(document)
33
+
34
+
35
@mcp.tool()
def catch_exclusions(path: str) -> dict:
    """Scan a construction quote/spec PDF for scope EXCLUSIONS and gaps a vendor buried
    (fine print, 'by others', 'not included'). Returns {exclusions:[{item,quote,page,risk}],
    scope_gaps:[{missing,why}]} — what an estimator must confirm before bidding."""
    # Fail with a clear configuration error before touching the PDF.
    _check_key()
    d = read(path)
    # "or []" turns a JSON null into an empty list, so the documented
    # list-valued fields are never returned as None.
    return {"vendor": d.get("vendor"), "trade": d.get("trade"),
            "exclusions": d.get("exclusions") or [],
            "scope_gaps": d.get("scope_gaps") or []}
44
+
45
+
46
@mcp.tool()
def extract_line_items(path: str) -> dict:
    """Extract priced line items (section, description, qty, unit, amount, page) and the
    bid total from a construction quote/bid PDF. Returns {bid_total, currency, line_items}."""
    # Fail with a clear configuration error before touching the PDF.
    _check_key()
    d = read(path)
    # "or []" turns a JSON null into an empty list, so line_items is
    # always a list for the calling agent.
    return {"bid_total": d.get("bid_total"), "currency": d.get("currency"),
            "line_items": d.get("line_items") or []}
54
+
55
+
56
def main():
    """Console-script entry point: run the BidReader MCP server."""
    mcp.run()
58
+
59
+
60
+ if __name__ == "__main__":
61
+ main()
@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.4
2
+ Name: bidreader
3
+ Version: 0.2.0
4
+ Summary: Read messy construction sub-quotes, bid packages & spec PDFs into clean structured data — and catch the scope gaps/exclusions vendors bury. Every value cited to its page.
5
+ Author-email: Anmol <anmol@attentive.ai>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/anmolsam/bidreader
8
+ Project-URL: Issues, https://github.com/anmolsam/bidreader/issues
9
+ Keywords: construction,estimating,takeoff,subcontractor,bid,quote,scope,exclusions,spec,AEC,preconstruction,BOQ,LLM,MCP
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Office/Business :: Financial
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pymupdf>=1.24
19
+ Requires-Dist: certifi>=2024.0
20
+ Provides-Extra: tables
21
+ Requires-Dist: pdfplumber>=0.11; extra == "tables"
22
+ Provides-Extra: mcp
23
+ Requires-Dist: mcp>=1.2; extra == "mcp"
24
+ Dynamic: license-file
25
+
26
+ # BidReader
27
+
28
+ **Read messy construction sub-quotes, bid packages & spec PDFs into clean structured data — and catch the scope gaps and exclusions vendors bury in the fine print.** Every value is cited to its page and exact source text.
29
+
30
+ MIT · `pip install bidreader` · works on the PDFs estimators actually get.
31
+
32
+ > *"Manually typing numbers from a PDF into Excel because the formatting is a crime scene… hunting for the one line where a sub quietly excluded 'trash removal' in size-8 font."* — r/Construction (498 upvotes)
33
+
34
+ BidReader is that junior who never sleeps: it doesn't write anything new — it finds what's already there and points to it.
35
+
36
+ ---
37
+
38
+ ## What it does
39
+
40
+ ```bash
41
+ pip install bidreader
42
+ export REQUESTY_API_KEY=... # or OPENROUTER_API_KEY / GEMINI_API_KEY (free tier works)
43
+ bidreader sub_quote.pdf
44
+ ```
45
+
46
+ ```python
47
+ from bidreader import read
48
+ doc = read("sub_quote.pdf")
49
+ doc.line_items # [{section, description, qty, unit, amount, page}, ...]
50
+ doc.exclusions # [{item, quote, page, risk}, ...] <- the stuff they buried
51
+ doc.scope_gaps # trade-standard scope NOT mentioned (confirm before you bid)
52
+ doc.to_json()
53
+ ```
54
+
55
+ ## Real output (drywall sub-quote, exclusion buried in size-7 font)
56
+
57
+ ```
58
+ LINE ITEMS (5):
59
+ 09 22 16 Metal stud framing, 3-5/8" 25ga walls 12400 SF $35,340.00 p1
60
+ 09 29 00 5/8" Type X gypsum board, both faces 24800 SF $40,920.00 p1
61
+ 09 29 00 Tape & finish, Level 4 24800 SF $23,560.00 p1
62
+ ... BID TOTAL $121,628.00
63
+
64
+ !! EXCLUSIONS CAUGHT (4):
65
+ - Fire-stopping/firecaulking (page 1)
66
+ "this proposal EXCLUDES fire-stopping/firecaulking at rated assemblies"
67
+ risk: life-safety scope; another trade or a change order eats this cost.
68
+ - Debris removal/haul-off (page 1)
69
+ "removal/haul-off of construction debris (by others)"
70
+ ...
71
+
72
+ SCOPE GAPS TO CONFIRM (5):
73
+ - Acoustic ceiling tiles -- grid framing is included but the tiles within it are not.
74
+ - Rough carpentry blocking/backing for fixtures -- not mentioned.
75
+ ```
76
+
77
+ ## Why
78
+
79
+ The construction-AI gold rush is all building the same crowded, resisted thing — autonomous *takeoff*. The loudest, most-repeated, **unmet** estimator pain is upstream and downstream of it: turning crime-scene PDFs into clean data and **catching what subs quietly excluded**. No permissive library does this. BidReader is that primitive.
80
+
81
+ - **MIT** — depend on it inside your commercial estimating/BIM product (no AGPL/NC contamination).
82
+ - **Provider-agnostic** — Requesty, OpenRouter, or Google AI Studio (free tier).
83
+ - **Cited** — every number traces to a page + the exact source text. Trust is the real adoption blocker; this is built for it.
84
+
85
+ ## Use it from an AI agent (MCP)
86
+
87
+ Any MCP client — Claude Desktop, Cursor, etc. — can call BidReader:
88
+
89
+ ```bash
90
+ pip install "bidreader[mcp]"
91
+ ```
92
+ ```json
93
+ {
94
+ "mcpServers": {
95
+ "bidreader": {
96
+ "command": "bidreader-mcp",
97
+ "env": { "REQUESTY_API_KEY": "rqsty-..." }
98
+ }
99
+ }
100
+ }
101
+ ```
102
+ Tools exposed: `read_document(path)`, `catch_exclusions(path)`, `extract_line_items(path)`.
103
+ Now your agent can answer *"which subs excluded fire-stopping?"* across a bid folder.
104
+
105
+ ## Roadmap
106
+ - Scanned-PDF vision OCR path · revision/addendum **diff** ("what changed between Addendum 3 and 4") · Excel/CSV export · multi-quote leveling (compare subs side-by-side). Already shipped: the `bidreader-mcp` server (any agent can call it).
107
+
108
+ ## License
109
+ MIT.
@@ -0,0 +1,10 @@
1
+ bidreader/__init__.py,sha256=6FpDOl5Uqo_AwzdfdHJN0cWPHZnwSUoWjH8JVIRzMi8,79
2
+ bidreader/cli.py,sha256=WroAYRpr8HZmQPzj3u66zSzCQ_z22BZ-EMvNGfT_T54,1673
3
+ bidreader/extract.py,sha256=tUIpntnxvoelEKmjCfGp-mFM5thgGouD9IcQRHM7HVE,4820
4
+ bidreader/mcp_server.py,sha256=NbtzNQC5WwGR88MpK18y1N089EzELzoQjVqo8WkOP70,2239
5
+ bidreader-0.2.0.dist-info/licenses/LICENSE,sha256=TEAVtI0pnzQzIFRnRSYuB_UdXGIKEYa0EEtn0kc-vdw,1062
6
+ bidreader-0.2.0.dist-info/METADATA,sha256=5Rl2GffwhCdC_fn8gZNDrDezfdPg3lcX5gYZ5sfwTog,4539
7
+ bidreader-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
8
+ bidreader-0.2.0.dist-info/entry_points.txt,sha256=FNi7_QrwF3dEQc8hXUDkU1-eoMrSQaLtKJC93-LANbE,91
9
+ bidreader-0.2.0.dist-info/top_level.txt,sha256=7w-Do-kKxkje0iLiLhIrmGcDqcZWvMe1zI9EVBMaSCg,10
10
+ bidreader-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ bidreader = bidreader.cli:main
3
+ bidreader-mcp = bidreader.mcp_server:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Anmol
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ bidreader