terbium-parse 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. terbium/__init__.py +31 -0
  2. terbium/api.py +106 -0
  3. terbium/cli.py +53 -0
  4. terbium/documents/__init__.py +8 -0
  5. terbium/documents/base.py +46 -0
  6. terbium/documents/csv_adapter.py +78 -0
  7. terbium/documents/pdf.py +110 -0
  8. terbium/documents/pptx_adapter.py +129 -0
  9. terbium/documents/xlsx_adapter.py +88 -0
  10. terbium/harness/__init__.py +7 -0
  11. terbium/harness/ai.py +40 -0
  12. terbium/harness/arrange.py +120 -0
  13. terbium/harness/escalation.py +39 -0
  14. terbium/harness/providers/__init__.py +29 -0
  15. terbium/harness/providers/anthropic_provider.py +50 -0
  16. terbium/harness/providers/base.py +13 -0
  17. terbium/harness/providers/gemini_provider.py +48 -0
  18. terbium/harness/router.py +46 -0
  19. terbium/harness/vision.py +41 -0
  20. terbium/layout/__init__.py +3 -0
  21. terbium/layout/columns.py +32 -0
  22. terbium/layout/confidence.py +50 -0
  23. terbium/layout/dehead.py +64 -0
  24. terbium/layout/grid.py +214 -0
  25. terbium/layout/images.py +20 -0
  26. terbium/layout/lines.py +34 -0
  27. terbium/layout/signals.py +81 -0
  28. terbium/model/__init__.py +15 -0
  29. terbium/model/document.py +69 -0
  30. terbium/model/elements.py +88 -0
  31. terbium/model/record.py +43 -0
  32. terbium/model/table.py +45 -0
  33. terbium/py.typed +0 -0
  34. terbium/schema/__init__.py +5 -0
  35. terbium/schema/base.py +40 -0
  36. terbium/schema/furniture.py +51 -0
  37. terbium/schema/generic.py +76 -0
  38. terbium_parse-0.1.0.dist-info/METADATA +176 -0
  39. terbium_parse-0.1.0.dist-info/RECORD +43 -0
  40. terbium_parse-0.1.0.dist-info/WHEEL +5 -0
  41. terbium_parse-0.1.0.dist-info/entry_points.txt +2 -0
  42. terbium_parse-0.1.0.dist-info/licenses/LICENSE +21 -0
  43. terbium_parse-0.1.0.dist-info/top_level.txt +1 -0
terbium/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ """terbium - a god-level algorithmic multi-file parser that scores its own
2
+ confidence and only reaches for AI when it is genuinely stuck.
3
+
4
+ import terbium
5
+ doc = terbium.parse("catalogue.pdf")
6
+ print(doc.stats)
7
+ for r in doc.records:
8
+ print(r.sku, r.fields)
9
+ """
10
+ from __future__ import annotations
11
+
12
+ from .api import parse, supported_extensions, DEFAULT_THRESHOLD
13
+ from .harness import AI
14
+ from .harness.vision import read_page as read_images
15
+ from .model.document import ParsedDocument, Stats
16
+ from .model.record import Record
17
+ from .model.table import ExtractedTable
18
+
19
+ __version__ = "0.1.0"
20
+
21
# Names exported by ``from terbium import *``. ``DEFAULT_THRESHOLD`` is listed
# because this module imports it from ``.api`` alongside ``parse``.
__all__ = [
    "parse",
    "AI",
    "read_images",
    "supported_extensions",
    "DEFAULT_THRESHOLD",
    "ParsedDocument",
    "Record",
    "ExtractedTable",
    "Stats",
    "__version__",
]
terbium/api.py ADDED
@@ -0,0 +1,106 @@
1
+ """``terbium.parse`` - the one function most users call.
2
+
3
+ Flow: adapt -> assemble tables (native, or reconstructed from PDF geometry) ->
4
+ score confidence -> (optionally) send only the hard tables to AI -> build typed
5
+ records -> if anything is still shaky and no key was given, attach and announce
6
+ an escalation message.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import sys
11
+ from typing import List, Optional
12
+
13
+ from .documents import get_adapter, supported_extensions
14
+ from .layout import confidence as _confidence
15
+ from .layout import dehead, grid
16
+ from .layout.columns import split_columns
17
+ from .layout.lines import cluster_lines
18
+ from .model.document import ParsedDocument, Stats
19
+ from .model.elements import Page
20
+ from .model.table import ExtractedTable
21
+ from .schema import get_schema
22
+ from .harness import arrange_tables, build_message, resolve
23
+
24
+ DEFAULT_THRESHOLD = 0.72
25
+
26
+
27
+ def _assemble_tables(pages: List[Page]) -> List[ExtractedTable]:
28
+ tables: List[ExtractedTable] = []
29
+ pdf_pages = [p for p in pages if p.source_kind == "pdf" and p.words]
30
+ stripper = dehead.build_stripper(pdf_pages) if pdf_pages else None
31
+ for p in pages:
32
+ if p.native_tables:
33
+ tables.extend(p.native_tables)
34
+ elif p.source_kind == "pdf" and p.words:
35
+ for word_group in split_columns(p):
36
+ lines = cluster_lines(word_group)
37
+ if stripper:
38
+ lines = [ln for ln in lines if not stripper(ln, p)]
39
+ tables.extend(grid.extract_tables(lines, p))
40
+ return tables
41
+
42
+
43
def parse(
    path: str,
    schema=None,
    ai=None,
    threshold: float = DEFAULT_THRESHOLD,
    announce: bool = True,
) -> ParsedDocument:
    """Parse a PDF/PPTX/XLSX/CSV file into structured, confidence-scored records.

    ``schema``: "generic" (default) or "furniture", or a Schema instance.
    ``ai``: a ``terbium.AI(...)``, ``True`` (use env keys), or ``None`` (off).
    ``threshold``: confidence below which a record is "ambiguous".
    ``announce``: print the escalation message to stderr when AI could help but
    no key is set. This is terbium telling you it is stuck.
    """
    pages = get_adapter(path).parse(path)
    if pages:
        source_kind = pages[0].source_kind
    else:
        source_kind = "unknown"

    tables = _assemble_tables(pages)
    for table in tables:
        _confidence.score_table(table)

    def _below_threshold():
        # Re-evaluated after the AI pass, which may have changed confidences.
        return [t for t in tables if t.confidence < threshold]

    ai_cfg = resolve(ai)
    hard = _below_threshold()
    used_ai = False
    if ai_cfg is not None and hard:
        used_ai = arrange_tables(path, pages, hard, ai_cfg) > 0
        hard = _below_threshold()

    builder = get_schema(schema)
    records = []
    for table in tables:
        built = builder.build_records([table])
        if table.origin == "ai":
            # Records inherit the AI origin of the table they came from.
            for record in built:
                record.origin = "ai"
        records.extend(built)

    confident = sum(1 for record in records if record.confidence >= threshold)
    ambiguous = sum(1 for record in records if record.confidence < threshold)
    doc = ParsedDocument(
        path=path,
        source_kind=source_kind,
        pages=pages,
        records=records,
        stats=Stats(
            total=len(records),
            confident=confident,
            ambiguous=ambiguous,
            threshold=threshold,
        ),
        used_ai=used_ai,
    )

    if not hard:
        return doc

    # Some tables are still below the threshold: attach the escalation message
    # and, when no AI configuration was resolved, optionally announce it.
    doc.escalation = build_message(records, hard, threshold)
    if announce and ai_cfg is None:
        print(doc.escalation, file=sys.stderr)
    return doc
104
+
105
+
106
+ __all__ = ["parse", "supported_extensions", "DEFAULT_THRESHOLD"]
terbium/cli.py ADDED
@@ -0,0 +1,53 @@
1
+ """`terbium <file>` - parse from the command line."""
2
+ from __future__ import annotations
3
+
4
+ import argparse
5
+ import sys
6
+
7
+ from . import __version__
8
+ from .api import parse, supported_extensions
9
+ from .harness import AI
10
+
11
+
12
def main(argv=None) -> int:
    """Run the ``terbium`` command line and return the process exit code.

    ``argv`` is the argument list handed to ``argparse`` (``None`` lets
    argparse read the process arguments). With ``--json`` the records are
    written as JSON to the given path, or to stdout for ``-``; otherwise a
    short human-readable preview of at most ``--limit`` records is printed.
    Always returns 0; argparse itself exits on invalid arguments.
    """
    ap = argparse.ArgumentParser(
        prog="terbium",
        description="Algorithmic multi-file parser (PDF/PPTX/XLSX/CSV) that knows when it is stuck.",
    )
    ap.add_argument("file", help="path to a " + "/".join(supported_extensions()) + " file")
    ap.add_argument("--schema", default="generic", help="generic (default) or furniture")
    ap.add_argument("--json", metavar="OUT", help="write records as JSON to this path (or - for stdout)")
    ap.add_argument("--ai", action="store_true", help="enable the AI lane using env keys")
    ap.add_argument("--tier", choices=["haiku", "sonnet", "opus"], help="pin the AI model tier")
    ap.add_argument("--limit", type=int, default=12, help="how many records to preview")
    ap.add_argument("--version", action="version", version=f"terbium {__version__}")
    args = ap.parse_args(argv)

    ai = AI(force_tier=args.tier) if args.ai else None
    doc = parse(args.file, schema=args.schema, ai=ai)

    if args.json:
        payload = doc.to_json()
        if args.json == "-":
            print(payload)
        else:
            with open(args.json, "w", encoding="utf-8") as f:
                f.write(payload)
            print(f"wrote {len(doc.records)} records -> {args.json}", file=sys.stderr)
        return 0

    # A negative --limit would turn the slice below into "all but the last N"
    # and inflate the "... and N more" count, so treat it as zero.
    limit = max(args.limit, 0)

    print(f"terbium {__version__} · {doc.source_kind} · {len(doc.pages)} pages")
    print(f"records: {doc.stats.total} (confident {doc.stats.confident}, ambiguous {doc.stats.ambiguous})")
    if doc.used_ai:
        print("AI lane: engaged on hard tables")
    print("-" * 60)
    for r in doc.records[:limit]:
        flag = "" if r.confidence >= doc.stats.threshold else " [ambiguous]"
        print(f"{r.sku or '-':>8} {r.confidence:.2f} {r.fields}{flag}")
    if doc.stats.total > limit:
        print(f"... and {doc.stats.total - limit} more")
    return 0
50
+
51
+
52
+ if __name__ == "__main__":
53
+ raise SystemExit(main())
@@ -0,0 +1,8 @@
1
+ """Importing this package registers every adapter as a side effect."""
2
+ from .base import DocumentAdapter, get_adapter, register, supported_extensions
3
+ from . import pdf as _pdf
4
+ from . import pptx_adapter as _pptx
5
+ from . import xlsx_adapter as _xlsx
6
+ from . import csv_adapter as _csv
7
+
8
+ __all__ = ["DocumentAdapter", "get_adapter", "register", "supported_extensions"]
@@ -0,0 +1,46 @@
1
+ """Adapter interface + registry. One adapter per file format.
2
+
3
+ Adapters do exactly one job: turn bytes on disk into normalized ``Page`` objects
4
+ (words with positions, images, and - when the format exposes it natively - ready
5
+ made tables). Everything smart happens after, on that uniform representation.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from abc import ABC, abstractmethod
11
+ from typing import List
12
+
13
+ from ..model.elements import Page
14
+
15
+
16
class DocumentAdapter(ABC):
    """Abstract base for format adapters.

    Subclasses set ``extensions`` to the file extensions they handle and
    implement ``parse``.
    """

    # Empty on the base class; concrete adapters override it.
    extensions: tuple = ()

    @abstractmethod
    def parse(self, path: str) -> List[Page]:
        """Turn the file at ``path`` into a list of ``Page`` objects."""
22
+
23
+
24
+ _REGISTRY: dict = {}
25
+
26
+
27
def register(adapter_cls):
    """Class decorator that files one shared adapter instance in the registry.

    The class is instantiated once and that instance is stored under the
    lower-cased form of every entry in ``adapter_cls.extensions``. The class
    itself is returned unchanged so it can still be used normally.
    """
    adapter = adapter_cls()
    _REGISTRY.update((ext.lower(), adapter) for ext in adapter_cls.extensions)
    return adapter_cls
33
+
34
+
35
def get_adapter(path: str) -> DocumentAdapter:
    """Return the registered adapter for the extension of ``path``.

    The extension is compared lower-cased and without its leading dot.

    Raises:
        ValueError: no adapter is registered for that extension; the message
            lists the registered ones.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower().lstrip(".")
    if ext in _REGISTRY:
        return _REGISTRY[ext]
    known = ", ".join(sorted(_REGISTRY))
    raise ValueError(
        f"terbium has no adapter for '.{ext}'. Supported: "
        + known + "."
    )
43
+
44
+
45
def supported_extensions() -> List[str]:
    """Return every registered extension, sorted."""
    return sorted(_REGISTRY.keys())
@@ -0,0 +1,78 @@
1
+ """CSV adapter (stdlib).
2
+
3
+ Sniffs the delimiter and whether a header row exists, tolerates messy encodings,
4
+ and hands back a single native table. The easy case, handled honestly.
5
+ """
6
+ from __future__ import annotations
7
+
8
+ import csv as _csv
9
+ from typing import List, Optional
10
+
11
+ from ..model.elements import Page
12
+ from ..model.table import ExtractedTable
13
+ from .base import DocumentAdapter, register
14
+
15
+ _MAX_ROWS = 20000
16
+
17
+
18
+ def _read_text(path: str) -> str:
19
+ for enc in ("utf-8-sig", "utf-8", "latin-1"):
20
+ try:
21
+ with open(path, "r", encoding=enc, newline="") as f:
22
+ return f.read()
23
+ except UnicodeDecodeError:
24
+ continue
25
+ with open(path, "r", encoding="latin-1", errors="replace", newline="") as f:
26
+ return f.read()
27
+
28
+
29
@register
class CsvAdapter(DocumentAdapter):
    """Adapter for ``.csv`` / ``.tsv`` files: one file becomes one page."""

    extensions = ("csv", "tsv")

    def parse(self, path: str) -> List[Page]:
        """Read ``path`` and return a single page holding at most one table.

        The delimiter and the presence of a header row are sniffed from the
        first 8192 characters. If delimiter sniffing fails, an Excel-style
        dialect is used with a tab delimiter for ``.tsv`` paths and a comma
        otherwise; if header sniffing fails, a header is assumed. At most
        ``_MAX_ROWS`` rows are read, cells are stripped, and rows with no
        non-empty cell are dropped.
        """
        import io  # local import: only this method needs it

        text = _read_text(path)
        sample = text[:8192]
        sniffer = _csv.Sniffer()
        try:
            dialect = sniffer.sniff(sample, delimiters=",;\t|")
        except _csv.Error:
            fallback_delimiter = "\t" if path.lower().endswith(".tsv") else ","

            # Subclass instead of assigning onto ``csv.excel``: that class is
            # shared process-wide, so mutating it would change the delimiter
            # for every other user of the csv module.
            class _FallbackDialect(_csv.excel):
                delimiter = fallback_delimiter

            dialect = _FallbackDialect
        try:
            has_header = sniffer.has_header(sample)
        except _csv.Error:
            has_header = True

        # Feed the reader a file-like object rather than ``text.splitlines()``
        # so quoted fields containing line breaks stay in one record.
        reader = _csv.reader(io.StringIO(text, newline=""), dialect)
        rows: List[List[str]] = []
        for i, row in enumerate(reader):
            if i >= _MAX_ROWS:
                break
            rows.append([c.strip() for c in row])
        rows = [r for r in rows if any(r)]
        table = _rows_to_table(rows, has_header)
        page = Page(index=0, width=0, height=0, source_kind="csv", native_tables=[table] if table else [])
        return [page]
55
+
56
+
57
+ def _rows_to_table(rows: List[List[str]], has_header: bool) -> Optional[ExtractedTable]:
58
+ if not rows:
59
+ return None
60
+ ncol = max(len(r) for r in rows)
61
+ rows = [r + [""] * (ncol - len(r)) for r in rows]
62
+ if has_header:
63
+ header = rows[0]
64
+ body = rows[1:]
65
+ col_headers = header[1:] if ncol > 1 else header
66
+ else:
67
+ body = rows
68
+ col_headers = [f"col{i + 1}" for i in range(ncol - 1 if ncol > 1 else ncol)]
69
+ row_headers = [r[0] for r in body] if ncol > 1 else ["" for _ in body]
70
+ cells = [[(v or None) for v in (r[1:] if ncol > 1 else r)] for r in body]
71
+ return ExtractedTable(
72
+ title=None,
73
+ row_headers=row_headers,
74
+ col_headers=[h or f"col{i + 1}" for i, h in enumerate(col_headers)],
75
+ cells=cells,
76
+ source_page=0,
77
+ kind="grid",
78
+ )
@@ -0,0 +1,110 @@
1
+ """PDF adapter (PyMuPDF).
2
+
3
+ Produces token-level words with positions AND font sizes - the two inputs the
4
+ geometry engine needs to rebuild columns, rows, and titles. It also enumerates
5
+ embedded images with pixel dimensions so they can be classified, and can render
6
+ a page to PNG for the vision lane.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ from typing import List
11
+
12
+ import fitz # PyMuPDF
13
+
14
+ from ..layout.images import classify
15
+ from ..model.elements import ImageRef, Page, Word
16
+ from .base import DocumentAdapter, register
17
+
18
+ _BOLD_FLAG = 1 << 4
19
+
20
+
21
+ def _strip_pua(text: str) -> str:
22
+ """Drop private-use glyphs (U+E000..U+F8FF).
23
+
24
+ Catalogues encode material icons (FSC, oiled, varnished) as private-use font
25
+ glyphs that leak into extracted text as noise like 'FSC\\ue514'. The icons
26
+ themselves belong to the vision lane, not the text.
27
+ """
28
+ return "".join(c for c in text if not (0xE000 <= ord(c) <= 0xF8FF))
29
+
30
+
31
def _span_words(span: dict) -> List[Word]:
    """Break a text span into one ``Word`` per whitespace-separated token.

    Private-use glyphs are removed first. Each token inherits the span's
    vertical extent, font size and boldness; its horizontal extent is
    interpolated across the span box in proportion to the token's character
    offsets. Returns an empty list when nothing but whitespace remains.
    """
    text = _strip_pua(span.get("text", ""))
    if not text.strip():
        return []

    left, top, right, bottom = span["bbox"]
    size = float(span.get("size", 0.0))
    font_name = str(span.get("font", "")).lower()
    bold = bool(span.get("flags", 0) & _BOLD_FLAG) or "bold" in font_name

    span_width = right - left
    total = len(text)  # at least 1 here: the text has a non-space character
    words: List[Word] = []
    cursor = 0
    for token in text.split():
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        words.append(
            Word(
                text=token,
                x0=left + span_width * (begin / total),
                y0=top,
                x1=left + span_width * (cursor / total),
                y1=bottom,
                size=size,
                bold=bold,
            )
        )
    return words
57
+
58
+
59
@register
class PdfAdapter(DocumentAdapter):
    """Adapter for ``.pdf`` files, backed by PyMuPDF (``fitz``)."""

    extensions = ("pdf",)

    def parse(self, path: str) -> List[Page]:
        """Return one ``Page`` per PDF page with its words and image refs."""
        pages: List[Page] = []
        with fitz.open(path) as doc:
            for index, page in enumerate(doc):
                box = page.rect
                text_dict = page.get_text("dict")
                words: List[Word] = [
                    word
                    for block in text_dict.get("blocks", [])
                    for line in block.get("lines", [])
                    for span in line.get("spans", [])
                    for word in _span_words(span)
                ]
                images = self._page_images(page, index)
                pages.append(
                    Page(
                        index=index,
                        width=box.width,
                        height=box.height,
                        words=words,
                        images=images,
                        source_kind="pdf",
                    )
                )
        return pages

    @staticmethod
    def _page_images(page, index: int) -> List[ImageRef]:
        """List the images on ``page`` that report non-zero pixel dimensions.

        Enumeration is best effort: if anything raises part-way through, the
        references gathered so far are returned.
        """
        found: List[ImageRef] = []
        try:
            for info in page.get_image_info(xrefs=True):
                w = int(info.get("width", 0))
                h = int(info.get("height", 0))
                if not (w and h):
                    continue
                bbox = info.get("bbox")
                found.append(
                    ImageRef(
                        page=index,
                        width=w,
                        height=h,
                        kind=classify(w, h),
                        bbox=tuple(bbox) if bbox else None,
                        xref=info.get("xref"),
                    )
                )
        except Exception:
            pass
        return found
103
+
104
+
105
def render_page_png(path: str, index: int, dpi: int = 120) -> bytes:
    """Open the PDF at ``path`` and return page ``index`` rendered as PNG bytes.

    ``dpi`` is passed to the page's ``get_pixmap`` call. Used by the vision
    lane.
    """
    with fitz.open(path) as doc:
        return doc[index].get_pixmap(dpi=dpi).tobytes("png")
@@ -0,0 +1,129 @@
1
+ """PPTX adapter (python-pptx).
2
+
3
+ PowerPoint is a gift compared to PDF: shapes carry positions, tables are native
4
+ (rows and cells, no reconstruction), and pictures expose real pixels. So we hand
5
+ tables straight through as high-confidence ``native_tables`` and only fall back
6
+ to geometry for free-floating text boxes.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ from typing import List, Optional
12
+
13
+ from pptx import Presentation
14
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
15
+
16
+ from ..layout.images import classify
17
+ from ..model.elements import ImageRef, Page, Word
18
+ from ..model.table import ExtractedTable
19
+ from .base import DocumentAdapter, register
20
+
21
+ _EMU_PER_PT = 12700.0
22
+
23
+
24
+ def _pt(emu) -> float:
25
+ return float(emu) / _EMU_PER_PT if emu is not None else 0.0
26
+
27
+
28
+ def _tokens_in_box(text: str, x0: float, y: float, x1: float, size: float, bold: bool) -> List[Word]:
29
+ if not text.strip():
30
+ return []
31
+ width = max(1.0, x1 - x0)
32
+ n = len(text)
33
+ out: List[Word] = []
34
+ idx = 0
35
+ for tok in text.split():
36
+ start = text.index(tok, idx)
37
+ end = start + len(tok)
38
+ idx = end
39
+ wx0 = x0 + width * (start / n)
40
+ wx1 = x0 + width * (end / n)
41
+ out.append(Word(text=tok, x0=wx0, y0=y, x1=wx1, y1=y + max(size, 8), size=size, bold=bold))
42
+ return out
43
+
44
+
45
+ def _slide_title(slide) -> Optional[str]:
46
+ try:
47
+ if slide.shapes.title is not None and slide.shapes.title.text.strip():
48
+ return slide.shapes.title.text.strip()
49
+ except Exception:
50
+ pass
51
+ return None
52
+
53
+
54
def _table_to_extracted(shape, page_index: int, title: Optional[str]) -> ExtractedTable:
    """Convert the table held by ``shape`` into an ``ExtractedTable``.

    The first row supplies the column headers and the first column the row
    headers; that leading label column is only dropped from the headers and
    cells when a row has more than one entry. Cell text is stripped and empty
    cells become ``None``. A table without rows yields an empty table.
    """
    matrix = [[cell.text.strip() for cell in row.cells] for row in shape.table.rows]
    if not matrix:
        return ExtractedTable(title, [], [], [], page_index, kind="grid")

    def _data(columns):
        # Everything after the label column, unless it is the only column.
        return columns[1:] if len(columns) > 1 else columns

    header, body = matrix[0], matrix[1:]
    return ExtractedTable(
        title=title,
        row_headers=[row[0] if row else "" for row in body],
        col_headers=_data(header),
        cells=[[value or None for value in _data(row)] for row in body],
        source_page=page_index,
        kind="grid",
    )
73
+
74
+
75
@register
class PptxAdapter(DocumentAdapter):
    """Adapter for ``.pptx`` files, backed by python-pptx: one slide, one page."""

    extensions = ("pptx",)

    def parse(self, path: str) -> List[Page]:
        """Return one ``Page`` per slide.

        Table shapes become native tables, text-frame shapes become
        positioned words, and picture shapes become image references. Each
        shape is handled by the first of those cases that applies.
        """
        deck = Presentation(path)
        slide_w = _pt(deck.slide_width)
        slide_h = _pt(deck.slide_height)

        pages: List[Page] = []
        for index, slide in enumerate(deck.slides):
            words: List[Word] = []
            images: List[ImageRef] = []
            native: List[ExtractedTable] = []
            title = _slide_title(slide)
            for shape in slide.shapes:
                left, top = _pt(shape.left), _pt(shape.top)
                right = left + _pt(shape.width)
                if shape.has_table:
                    native.append(_table_to_extracted(shape, index, title))
                elif shape.has_text_frame:
                    words.extend(self._frame_words(shape, left, top, right))
                elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    picture = self._picture_ref(shape, index)
                    if picture is not None:
                        images.append(picture)
            pages.append(
                Page(
                    index=index,
                    width=slide_w,
                    height=slide_h,
                    words=words,
                    images=images,
                    source_kind="pptx",
                    native_tables=native,
                )
            )
        return pages

    @staticmethod
    def _frame_words(shape, left: float, top: float, right: float) -> List[Word]:
        """Tokenise the paragraphs of a text frame, stacking them downwards.

        A paragraph takes the size of its last run that sets one (else 0.0)
        and is bold if any run is bold. The vertical cursor advances by
        ``max(size, 14) * 1.2`` per paragraph.
        """
        words: List[Word] = []
        baseline = top
        for para in shape.text_frame.paragraphs:
            runs = para.runs
            text = "".join(run.text for run in runs) or para.text
            size = 0.0
            bold = False
            for run in runs:
                if run.font.size is not None:
                    size = run.font.size.pt
                if run.font.bold:
                    bold = True
            words.extend(_tokens_in_box(text, left, baseline, right, size, bold))
            baseline += max(size, 14) * 1.2
        return words

    @staticmethod
    def _picture_ref(shape, page_index: int) -> Optional[ImageRef]:
        """Measure a picture's pixel size; ``None`` if it cannot be read."""
        try:
            blob = shape.image.blob
            from PIL import Image

            with Image.open(io.BytesIO(blob)) as im:
                w, h = im.size
            return ImageRef(page=page_index, width=w, height=h, kind=classify(w, h))
        except Exception:
            return None
@@ -0,0 +1,88 @@
1
+ """XLSX adapter (openpyxl).
2
+
3
+ Spreadsheets are already a grid, so the work is mostly faithful transcription:
4
+ one sheet -> one page, merged header cells propagated across their span so
5
+ multi-column headers survive, values coerced to strings.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from typing import List, Optional
10
+
11
+ from openpyxl import load_workbook
12
+
13
+ from ..model.elements import Page
14
+ from ..model.table import ExtractedTable
15
+ from .base import DocumentAdapter, register
16
+
17
+ _MAX_ROWS = 5000
18
+ _MAX_COLS = 200
19
+
20
+
21
+ def _s(v) -> Optional[str]:
22
+ if v is None:
23
+ return None
24
+ s = str(v).strip()
25
+ return s or None
26
+
27
+
28
@register
class XlsxAdapter(DocumentAdapter):
    """Adapter for ``.xlsx`` / ``.xlsm`` workbooks: one worksheet, one page."""

    extensions = ("xlsx", "xlsm")

    def parse(self, path: str) -> List[Page]:
        """Return one ``Page`` per worksheet, each with at most one table.

        At most ``_MAX_ROWS`` rows and ``_MAX_COLS`` columns are read per
        sheet. The top-left value of every merged range is copied across the
        part of the range that lies inside that window. A sheet with no rows
        or no columns yields a page without tables.
        """
        workbook = load_workbook(path, data_only=True)
        pages: List[Page] = []
        for sheet_index, sheet in enumerate(workbook.worksheets):
            n_rows = min(sheet.max_row or 0, _MAX_ROWS)
            n_cols = min(sheet.max_column or 0, _MAX_COLS)
            if not (n_rows and n_cols):
                pages.append(Page(index=sheet_index, width=0, height=0, source_kind="xlsx"))
                continue

            grid = [
                [_s(sheet.cell(r, c).value) for c in range(1, n_cols + 1)]
                for r in range(1, n_rows + 1)
            ]
            # propagate merged header/label cells across their range
            for merged in sheet.merged_cells.ranges:
                anchor = _s(sheet.cell(merged.min_row, merged.min_col).value)
                last_row = min(merged.max_row, n_rows)
                last_col = min(merged.max_col, n_cols)
                for r in range(merged.min_row, last_row + 1):
                    for c in range(merged.min_col, last_col + 1):
                        grid[r - 1][c - 1] = anchor

            table = _grid_to_table(_trim(grid), sheet_index, title=sheet.title)
            pages.append(
                Page(
                    index=sheet_index,
                    width=0,
                    height=0,
                    source_kind="xlsx",
                    native_tables=[table] if table else [],
                )
            )
        return pages
54
+
55
+
56
+ def _trim(grid: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
57
+ while grid and all(v is None for v in grid[-1]):
58
+ grid.pop()
59
+ while grid and all(v is None for v in grid[0]):
60
+ grid.pop(0)
61
+ if not grid:
62
+ return grid
63
+ ncol = len(grid[0])
64
+ last = 0
65
+ for row in grid:
66
+ for c in range(ncol - 1, -1, -1):
67
+ if c < len(row) and row[c] is not None:
68
+ last = max(last, c)
69
+ break
70
+ return [row[: last + 1] for row in grid]
71
+
72
+
73
+ def _grid_to_table(grid: List[List[Optional[str]]], page_index: int, title: Optional[str]) -> Optional[ExtractedTable]:
74
+ if not grid:
75
+ return None
76
+ header = grid[0]
77
+ body = grid[1:]
78
+ col_headers = header[1:] if len(header) > 1 else header
79
+ row_headers = [(r[0] if r else "") or "" for r in body]
80
+ cells = [r[1:] if len(r) > 1 else r for r in body]
81
+ return ExtractedTable(
82
+ title=title,
83
+ row_headers=row_headers,
84
+ col_headers=[h or f"col{i + 1}" for i, h in enumerate(col_headers)],
85
+ cells=cells if cells else [[v for v in r] for r in body],
86
+ source_page=page_index,
87
+ kind="grid",
88
+ )
@@ -0,0 +1,7 @@
1
+ from .ai import AI, resolve
2
+ from .escalation import build_message
3
+ from .arrange import arrange_tables
4
+ from .vision import read_page
5
+ from . import router
6
+
7
+ __all__ = ["AI", "resolve", "build_message", "arrange_tables", "read_page", "router"]