terbium-parse 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- terbium/__init__.py +31 -0
- terbium/api.py +106 -0
- terbium/cli.py +53 -0
- terbium/documents/__init__.py +8 -0
- terbium/documents/base.py +46 -0
- terbium/documents/csv_adapter.py +78 -0
- terbium/documents/pdf.py +110 -0
- terbium/documents/pptx_adapter.py +129 -0
- terbium/documents/xlsx_adapter.py +88 -0
- terbium/harness/__init__.py +7 -0
- terbium/harness/ai.py +40 -0
- terbium/harness/arrange.py +120 -0
- terbium/harness/escalation.py +39 -0
- terbium/harness/providers/__init__.py +29 -0
- terbium/harness/providers/anthropic_provider.py +50 -0
- terbium/harness/providers/base.py +13 -0
- terbium/harness/providers/gemini_provider.py +48 -0
- terbium/harness/router.py +46 -0
- terbium/harness/vision.py +41 -0
- terbium/layout/__init__.py +3 -0
- terbium/layout/columns.py +32 -0
- terbium/layout/confidence.py +50 -0
- terbium/layout/dehead.py +64 -0
- terbium/layout/grid.py +214 -0
- terbium/layout/images.py +20 -0
- terbium/layout/lines.py +34 -0
- terbium/layout/signals.py +81 -0
- terbium/model/__init__.py +15 -0
- terbium/model/document.py +69 -0
- terbium/model/elements.py +88 -0
- terbium/model/record.py +43 -0
- terbium/model/table.py +45 -0
- terbium/py.typed +0 -0
- terbium/schema/__init__.py +5 -0
- terbium/schema/base.py +40 -0
- terbium/schema/furniture.py +51 -0
- terbium/schema/generic.py +76 -0
- terbium_parse-0.1.0.dist-info/METADATA +176 -0
- terbium_parse-0.1.0.dist-info/RECORD +43 -0
- terbium_parse-0.1.0.dist-info/WHEEL +5 -0
- terbium_parse-0.1.0.dist-info/entry_points.txt +2 -0
- terbium_parse-0.1.0.dist-info/licenses/LICENSE +21 -0
- terbium_parse-0.1.0.dist-info/top_level.txt +1 -0
terbium/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""terbium - a god-level algorithmic multi-file parser that scores its own
|
|
2
|
+
confidence and only reaches for AI when it is genuinely stuck.
|
|
3
|
+
|
|
4
|
+
import terbium
|
|
5
|
+
doc = terbium.parse("catalogue.pdf")
|
|
6
|
+
print(doc.stats)
|
|
7
|
+
for r in doc.records:
|
|
8
|
+
print(r.sku, r.fields)
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .api import parse, supported_extensions, DEFAULT_THRESHOLD
|
|
13
|
+
from .harness import AI
|
|
14
|
+
from .harness.vision import read_page as read_images
|
|
15
|
+
from .model.document import ParsedDocument, Stats
|
|
16
|
+
from .model.record import Record
|
|
17
|
+
from .model.table import ExtractedTable
|
|
18
|
+
|
|
19
|
+
__version__ = "0.1.0"
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"parse",
|
|
23
|
+
"AI",
|
|
24
|
+
"read_images",
|
|
25
|
+
"supported_extensions",
|
|
26
|
+
"ParsedDocument",
|
|
27
|
+
"Record",
|
|
28
|
+
"ExtractedTable",
|
|
29
|
+
"Stats",
|
|
30
|
+
"__version__",
|
|
31
|
+
]
|
terbium/api.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""``terbium.parse`` - the one function most users call.
|
|
2
|
+
|
|
3
|
+
Flow: adapt -> assemble tables (native, or reconstructed from PDF geometry) ->
|
|
4
|
+
score confidence -> (optionally) send only the hard tables to AI -> build typed
|
|
5
|
+
records -> if anything is still shaky and no key was given, attach and announce
|
|
6
|
+
an escalation message.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from .documents import get_adapter, supported_extensions
|
|
14
|
+
from .layout import confidence as _confidence
|
|
15
|
+
from .layout import dehead, grid
|
|
16
|
+
from .layout.columns import split_columns
|
|
17
|
+
from .layout.lines import cluster_lines
|
|
18
|
+
from .model.document import ParsedDocument, Stats
|
|
19
|
+
from .model.elements import Page
|
|
20
|
+
from .model.table import ExtractedTable
|
|
21
|
+
from .schema import get_schema
|
|
22
|
+
from .harness import arrange_tables, build_message, resolve
|
|
23
|
+
|
|
24
|
+
DEFAULT_THRESHOLD = 0.72
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _assemble_tables(pages: List[Page]) -> List[ExtractedTable]:
|
|
28
|
+
tables: List[ExtractedTable] = []
|
|
29
|
+
pdf_pages = [p for p in pages if p.source_kind == "pdf" and p.words]
|
|
30
|
+
stripper = dehead.build_stripper(pdf_pages) if pdf_pages else None
|
|
31
|
+
for p in pages:
|
|
32
|
+
if p.native_tables:
|
|
33
|
+
tables.extend(p.native_tables)
|
|
34
|
+
elif p.source_kind == "pdf" and p.words:
|
|
35
|
+
for word_group in split_columns(p):
|
|
36
|
+
lines = cluster_lines(word_group)
|
|
37
|
+
if stripper:
|
|
38
|
+
lines = [ln for ln in lines if not stripper(ln, p)]
|
|
39
|
+
tables.extend(grid.extract_tables(lines, p))
|
|
40
|
+
return tables
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def parse(
    path: str,
    schema=None,
    ai=None,
    threshold: float = DEFAULT_THRESHOLD,
    announce: bool = True,
) -> ParsedDocument:
    """Parse a PDF/PPTX/XLSX/CSV file into structured, confidence-scored records.

    ``schema``: "generic" (default) or "furniture", or a Schema instance.
    ``ai``: a ``terbium.AI(...)``, ``True`` (use env keys), or ``None`` (off).
    ``threshold``: confidence below which a record is "ambiguous".
    ``announce``: print the escalation message to stderr when AI could help but
    no key is set. This is terbium telling you it is stuck.

    Returns a ``ParsedDocument`` holding the pages, the records, the
    confident/ambiguous counts and whether the AI lane changed anything.
    ``doc.escalation`` is set whenever at least one table is still below
    ``threshold`` at the end, regardless of ``announce``.

    ``get_adapter`` raises ``ValueError`` when the file extension has no
    registered adapter.
    """
    adapter = get_adapter(path)
    pages = adapter.parse(path)
    # The kind of the first page stands in for the whole document.
    source_kind = pages[0].source_kind if pages else "unknown"

    tables = _assemble_tables(pages)
    for t in tables:
        # Return value unused; presumably sets ``t.confidence`` in place,
        # which is read just below - confirm in layout.confidence.
        _confidence.score_table(t)

    # ``None`` here means the AI lane is off.
    ai_cfg = resolve(ai)
    hard = [t for t in tables if t.confidence < threshold]
    used_ai = False
    if hard and ai_cfg is not None:
        # Only the low-confidence tables are sent; ``fixed`` is compared
        # against zero, so it is treated as a count of repaired tables.
        fixed = arrange_tables(path, pages, hard, ai_cfg)
        used_ai = fixed > 0
        # Re-evaluate: tables the AI lane improved may no longer be hard.
        hard = [t for t in tables if t.confidence < threshold]

    schema_obj = get_schema(schema)
    records = []
    for t in tables:
        # Records are built one table at a time so each batch can be tagged
        # with its table's origin.
        recs = schema_obj.build_records([t])
        if t.origin == "ai":
            for r in recs:
                r.origin = "ai"
        records.extend(recs)

    stats = Stats(
        total=len(records),
        confident=sum(1 for r in records if r.confidence >= threshold),
        ambiguous=sum(1 for r in records if r.confidence < threshold),
        threshold=threshold,
    )
    doc = ParsedDocument(
        path=path,
        source_kind=source_kind,
        pages=pages,
        records=records,
        stats=stats,
        used_ai=used_ai,
    )

    if hard:
        # The message is always attached; it is only printed when no AI
        # configuration was available.
        doc.escalation = build_message(records, hard, threshold)
        if announce and ai_cfg is None:
            print(doc.escalation, file=sys.stderr)

    return doc
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
__all__ = ["parse", "supported_extensions", "DEFAULT_THRESHOLD"]
|
terbium/cli.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""`terbium <file>` - parse from the command line."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
from . import __version__
|
|
8
|
+
from .api import parse, supported_extensions
|
|
9
|
+
from .harness import AI
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def main(argv=None) -> int:
    """Run the ``terbium`` command line and return the process exit code.

    ``argv`` is the argument list to parse; ``None`` lets argparse read
    ``sys.argv``. With ``--json OUT`` the records are written as JSON (to
    stdout when OUT is ``-``) and nothing else is printed to stdout;
    otherwise a summary and a preview of up to ``--limit`` records is
    printed. Returns 0 on both paths.
    """
    ap = argparse.ArgumentParser(
        prog="terbium",
        description="Algorithmic multi-file parser (PDF/PPTX/XLSX/CSV) that knows when it is stuck.",
    )
    ap.add_argument("file", help="path to a " + "/".join(supported_extensions()) + " file")
    ap.add_argument("--schema", default="generic", help="generic (default) or furniture")
    ap.add_argument("--json", metavar="OUT", help="write records as JSON to this path (or - for stdout)")
    ap.add_argument("--ai", action="store_true", help="enable the AI lane using env keys")
    ap.add_argument("--tier", choices=["haiku", "sonnet", "opus"], help="pin the AI model tier")
    ap.add_argument("--limit", type=int, default=12, help="how many records to preview")
    ap.add_argument("--version", action="version", version=f"terbium {__version__}")
    args = ap.parse_args(argv)

    # A negative limit would slice from the end of the record list and
    # inflate the "... and N more" count, so treat it as "preview nothing".
    limit = max(0, args.limit)

    ai = AI(force_tier=args.tier) if args.ai else None
    doc = parse(args.file, schema=args.schema, ai=ai)

    if args.json:
        payload = doc.to_json()
        if args.json == "-":
            print(payload)
        else:
            with open(args.json, "w", encoding="utf-8") as f:
                f.write(payload)
            # Status goes to stderr so stdout stays clean for piping.
            print(f"wrote {len(doc.records)} records -> {args.json}", file=sys.stderr)
        return 0

    print(f"terbium {__version__} · {doc.source_kind} · {len(doc.pages)} pages")
    print(f"records: {doc.stats.total} (confident {doc.stats.confident}, ambiguous {doc.stats.ambiguous})")
    if doc.used_ai:
        print("AI lane: engaged on hard tables")
    print("-" * 60)
    for r in doc.records[:limit]:
        flag = "" if r.confidence >= doc.stats.threshold else " [ambiguous]"
        print(f"{r.sku or '-':>8} {r.confidence:.2f} {r.fields}{flag}")
    if doc.stats.total > limit:
        print(f"... and {doc.stats.total - limit} more")
    return 0
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
if __name__ == "__main__":
|
|
53
|
+
raise SystemExit(main())
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Importing this package registers every adapter as a side effect."""
|
|
2
|
+
from .base import DocumentAdapter, get_adapter, register, supported_extensions
|
|
3
|
+
from . import pdf as _pdf
|
|
4
|
+
from . import pptx_adapter as _pptx
|
|
5
|
+
from . import xlsx_adapter as _xlsx
|
|
6
|
+
from . import csv_adapter as _csv
|
|
7
|
+
|
|
8
|
+
__all__ = ["DocumentAdapter", "get_adapter", "register", "supported_extensions"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Adapter interface + registry. One adapter per file format.
|
|
2
|
+
|
|
3
|
+
Adapters do exactly one job: turn bytes on disk into normalized ``Page`` objects
|
|
4
|
+
(words with positions, images, and - when the format exposes it natively - ready
|
|
5
|
+
made tables). Everything smart happens after, on that uniform representation.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from typing import List
|
|
12
|
+
|
|
13
|
+
from ..model.elements import Page
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DocumentAdapter(ABC):
    """Abstract base for format adapters.

    A subclass lists the file extensions it handles in ``extensions`` and
    implements :meth:`parse`.
    """

    # Extensions served by the adapter; empty on the base class.
    extensions: tuple = ()

    @abstractmethod
    def parse(self, path: str) -> List[Page]:
        """Read the file at ``path`` and return its pages."""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
_REGISTRY: dict = {}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def register(adapter_cls):
    """Class decorator that files one shared adapter instance per extension.

    The class is instantiated once; that instance is stored in ``_REGISTRY``
    under the lower-cased form of every entry in ``adapter_cls.extensions``.
    The class itself is returned unchanged.
    """
    adapter = adapter_cls()
    _REGISTRY.update((ext.lower(), adapter) for ext in adapter_cls.extensions)
    return adapter_cls
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_adapter(path: str) -> DocumentAdapter:
    """Return the registered adapter for the extension of ``path``.

    The extension is lower-cased and stripped of its leading dot before the
    lookup. Raises ``ValueError`` naming the supported extensions when
    nothing is registered for it.
    """
    _, suffix = os.path.splitext(path)
    ext = suffix.lower().lstrip(".")
    if ext in _REGISTRY:
        return _REGISTRY[ext]
    known = ", ".join(sorted(_REGISTRY))
    raise ValueError(f"terbium has no adapter for '.{ext}'. Supported: " + known + ".")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def supported_extensions() -> List[str]:
    """Return every registered extension key, sorted."""
    names = list(_REGISTRY)
    names.sort()
    return names
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""CSV adapter (stdlib).
|
|
2
|
+
|
|
3
|
+
Sniffs the delimiter and whether a header row exists, tolerates messy encodings,
|
|
4
|
+
and hands back a single native table. The easy case, handled honestly.
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import csv as _csv
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
from ..model.elements import Page
|
|
12
|
+
from ..model.table import ExtractedTable
|
|
13
|
+
from .base import DocumentAdapter, register
|
|
14
|
+
|
|
15
|
+
_MAX_ROWS = 20000
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _read_text(path: str) -> str:
|
|
19
|
+
for enc in ("utf-8-sig", "utf-8", "latin-1"):
|
|
20
|
+
try:
|
|
21
|
+
with open(path, "r", encoding=enc, newline="") as f:
|
|
22
|
+
return f.read()
|
|
23
|
+
except UnicodeDecodeError:
|
|
24
|
+
continue
|
|
25
|
+
with open(path, "r", encoding="latin-1", errors="replace", newline="") as f:
|
|
26
|
+
return f.read()
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@register
class CsvAdapter(DocumentAdapter):
    """Adapter for ``.csv`` / ``.tsv`` files built on the stdlib ``csv`` module."""

    extensions = ("csv", "tsv")

    def parse(self, path: str) -> List[Page]:
        """Read ``path`` and return a single page holding at most one table.

        The delimiter is sniffed from the first 8192 characters; when
        sniffing fails, tab is used for ``.tsv`` paths and comma otherwise.
        Header detection also falls back (to "has a header") on failure.
        At most ``_MAX_ROWS`` rows are read, cells are stripped, and rows
        with no non-empty cell are dropped.
        """
        import io  # local: the module header does not import io

        text = _read_text(path)
        sample = text[:8192]
        try:
            dialect = _csv.Sniffer().sniff(sample, delimiters=",;\t|")
        except _csv.Error:
            fallback_delimiter = "\t" if path.lower().endswith(".tsv") else ","

            # Subclass rather than assigning to ``csv.excel.delimiter``:
            # mutating the shared stdlib class would change the dialect for
            # every other user of ``csv.excel`` in the process.
            class _FallbackDialect(_csv.excel):
                delimiter = fallback_delimiter

            dialect = _FallbackDialect
        try:
            has_header = _csv.Sniffer().has_header(sample)
        except _csv.Error:
            has_header = True

        # Feed the reader a file-like object with newline="" so quoted
        # fields keep their embedded newlines. ``str.splitlines`` would drop
        # them and would also break rows on characters such as \x0c, \x85
        # or \u2028 that are not CSV record separators.
        reader = _csv.reader(io.StringIO(text, newline=""), dialect)
        rows: List[List[str]] = []
        for i, row in enumerate(reader):
            if i >= _MAX_ROWS:
                break
            rows.append([c.strip() for c in row])
        rows = [r for r in rows if any(c for c in r)]
        table = _rows_to_table(rows, has_header)
        page = Page(index=0, width=0, height=0, source_kind="csv", native_tables=[table] if table else [])
        return [page]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _rows_to_table(rows: List[List[str]], has_header: bool) -> Optional[ExtractedTable]:
|
|
58
|
+
if not rows:
|
|
59
|
+
return None
|
|
60
|
+
ncol = max(len(r) for r in rows)
|
|
61
|
+
rows = [r + [""] * (ncol - len(r)) for r in rows]
|
|
62
|
+
if has_header:
|
|
63
|
+
header = rows[0]
|
|
64
|
+
body = rows[1:]
|
|
65
|
+
col_headers = header[1:] if ncol > 1 else header
|
|
66
|
+
else:
|
|
67
|
+
body = rows
|
|
68
|
+
col_headers = [f"col{i + 1}" for i in range(ncol - 1 if ncol > 1 else ncol)]
|
|
69
|
+
row_headers = [r[0] for r in body] if ncol > 1 else ["" for _ in body]
|
|
70
|
+
cells = [[(v or None) for v in (r[1:] if ncol > 1 else r)] for r in body]
|
|
71
|
+
return ExtractedTable(
|
|
72
|
+
title=None,
|
|
73
|
+
row_headers=row_headers,
|
|
74
|
+
col_headers=[h or f"col{i + 1}" for i, h in enumerate(col_headers)],
|
|
75
|
+
cells=cells,
|
|
76
|
+
source_page=0,
|
|
77
|
+
kind="grid",
|
|
78
|
+
)
|
terbium/documents/pdf.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""PDF adapter (PyMuPDF).
|
|
2
|
+
|
|
3
|
+
Produces token-level words with positions AND font sizes - the two inputs the
|
|
4
|
+
geometry engine needs to rebuild columns, rows, and titles. It also enumerates
|
|
5
|
+
embedded images with pixel dimensions so they can be classified, and can render
|
|
6
|
+
a page to PNG for the vision lane.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import List
|
|
11
|
+
|
|
12
|
+
import fitz # PyMuPDF
|
|
13
|
+
|
|
14
|
+
from ..layout.images import classify
|
|
15
|
+
from ..model.elements import ImageRef, Page, Word
|
|
16
|
+
from .base import DocumentAdapter, register
|
|
17
|
+
|
|
18
|
+
_BOLD_FLAG = 1 << 4
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _strip_pua(text: str) -> str:
|
|
22
|
+
"""Drop private-use glyphs (U+E000..U+F8FF).
|
|
23
|
+
|
|
24
|
+
Catalogues encode material icons (FSC, oiled, varnished) as private-use font
|
|
25
|
+
glyphs that leak into extracted text as noise like 'FSC\\ue514'. The icons
|
|
26
|
+
themselves belong to the vision lane, not the text.
|
|
27
|
+
"""
|
|
28
|
+
return "".join(c for c in text if not (0xE000 <= ord(c) <= 0xF8FF))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _span_words(span: dict) -> List[Word]:
    """Break one text span into positioned whitespace-delimited ``Word``s.

    Private-use glyphs are removed first. Each token inherits the span's
    vertical extent, font size and boldness; its horizontal extent is
    interpolated across the span box in proportion to the token's character
    offsets within the cleaned text. A span is bold when its flags carry
    ``_BOLD_FLAG`` or its font name contains "bold". Returns an empty list
    for a blank span.
    """
    text = _strip_pua(span.get("text", ""))
    if not text.strip():
        return []

    left, top, right, bottom = span["bbox"]
    font_size = float(span.get("size", 0.0))
    font_name = str(span.get("font", "")).lower()
    is_bold = bool(span.get("flags", 0) & _BOLD_FLAG) or "bold" in font_name

    span_width = right - left
    total = len(text)  # non-zero here: the blank case returned above
    out: List[Word] = []
    cursor = 0
    for token in text.split():
        # Search from the previous token's end so repeated tokens map to
        # successive occurrences.
        begin = text.index(token, cursor)
        cursor = begin + len(token)
        out.append(
            Word(
                text=token,
                x0=left + span_width * (begin / total),
                y0=top,
                x1=left + span_width * (cursor / total),
                y1=bottom,
                size=font_size,
                bold=is_bold,
            )
        )
    return out
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@register
class PdfAdapter(DocumentAdapter):
    """Adapter for ``.pdf`` files, read through PyMuPDF (``fitz``)."""

    extensions = ("pdf",)

    def parse(self, path: str) -> List[Page]:
        """Return one ``Page`` per PDF page with its words and image refs."""
        result: List[Page] = []
        with fitz.open(path) as pdf:
            for number, pdf_page in enumerate(pdf):
                bounds = pdf_page.rect
                page_words = self._words_of(pdf_page)
                page_images = self._images_of(pdf_page, number)
                result.append(
                    Page(
                        index=number,
                        width=bounds.width,
                        height=bounds.height,
                        words=page_words,
                        images=page_images,
                        source_kind="pdf",
                    )
                )
        return result

    @staticmethod
    def _words_of(pdf_page) -> List[Word]:
        """Tokenise every span of the page's ``dict`` text extraction."""
        collected: List[Word] = []
        layout = pdf_page.get_text("dict")
        for block in layout.get("blocks", []):
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    collected.extend(_span_words(span))
        return collected

    @staticmethod
    def _images_of(pdf_page, number: int) -> List[ImageRef]:
        """List the page's images that report a non-zero width and height.

        Best effort: any exception stops the enumeration and whatever was
        gathered up to that point is returned.
        """
        found: List[ImageRef] = []
        try:
            for info in pdf_page.get_image_info(xrefs=True):
                px_w = int(info.get("width", 0))
                px_h = int(info.get("height", 0))
                if not (px_w and px_h):
                    continue
                found.append(
                    ImageRef(
                        page=number,
                        width=px_w,
                        height=px_h,
                        kind=classify(px_w, px_h),
                        bbox=tuple(info.get("bbox")) if info.get("bbox") else None,
                        xref=info.get("xref"),
                    )
                )
        except Exception:
            pass
        return found
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def render_page_png(path: str, index: int, dpi: int = 120) -> bytes:
    """Rasterise page ``index`` of the PDF at ``path`` and return PNG bytes.

    ``dpi`` is passed through to the pixmap renderer. Used by the vision
    lane.
    """
    with fitz.open(path) as pdf:
        pixmap = pdf[index].get_pixmap(dpi=dpi)
    # Encoding happens after the document is closed, as in the original.
    png = pixmap.tobytes("png")
    return png
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""PPTX adapter (python-pptx).
|
|
2
|
+
|
|
3
|
+
PowerPoint is a gift compared to PDF: shapes carry positions, tables are native
|
|
4
|
+
(rows and cells, no reconstruction), and pictures expose real pixels. So we hand
|
|
5
|
+
tables straight through as high-confidence ``native_tables`` and only fall back
|
|
6
|
+
to geometry for free-floating text boxes.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from pptx import Presentation
|
|
14
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
15
|
+
|
|
16
|
+
from ..layout.images import classify
|
|
17
|
+
from ..model.elements import ImageRef, Page, Word
|
|
18
|
+
from ..model.table import ExtractedTable
|
|
19
|
+
from .base import DocumentAdapter, register
|
|
20
|
+
|
|
21
|
+
_EMU_PER_PT = 12700.0
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _pt(emu) -> float:
|
|
25
|
+
return float(emu) / _EMU_PER_PT if emu is not None else 0.0
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _tokens_in_box(text: str, x0: float, y: float, x1: float, size: float, bold: bool) -> List[Word]:
|
|
29
|
+
if not text.strip():
|
|
30
|
+
return []
|
|
31
|
+
width = max(1.0, x1 - x0)
|
|
32
|
+
n = len(text)
|
|
33
|
+
out: List[Word] = []
|
|
34
|
+
idx = 0
|
|
35
|
+
for tok in text.split():
|
|
36
|
+
start = text.index(tok, idx)
|
|
37
|
+
end = start + len(tok)
|
|
38
|
+
idx = end
|
|
39
|
+
wx0 = x0 + width * (start / n)
|
|
40
|
+
wx1 = x0 + width * (end / n)
|
|
41
|
+
out.append(Word(text=tok, x0=wx0, y0=y, x1=wx1, y1=y + max(size, 8), size=size, bold=bold))
|
|
42
|
+
return out
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _slide_title(slide) -> Optional[str]:
|
|
46
|
+
try:
|
|
47
|
+
if slide.shapes.title is not None and slide.shapes.title.text.strip():
|
|
48
|
+
return slide.shapes.title.text.strip()
|
|
49
|
+
except Exception:
|
|
50
|
+
pass
|
|
51
|
+
return None
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _table_to_extracted(shape, page_index: int, title: Optional[str]) -> ExtractedTable:
    """Convert a table shape into an ``ExtractedTable`` of kind "grid".

    The first table row supplies the column headers and the remaining rows
    form the body. When a row has more than one cell, its first cell is the
    row header and the rest are data; a single-cell row is used for both.
    Empty cell text becomes ``None``. A table with no rows yields an empty
    ``ExtractedTable``.
    """
    matrix = []
    for row in list(shape.table.rows):
        matrix.append([cell.text.strip() for cell in row.cells])

    if not matrix:
        return ExtractedTable(title, [], [], [], page_index, kind="grid")

    head, body = matrix[0], matrix[1:]
    labels = [r[0] if r else "" for r in body]
    values = [[(v if v else None) for v in (r[1:] if len(r) > 1 else r)] for r in body]
    if len(head) > 1:
        head = head[1:]

    if values:
        cell_rows = values
    else:
        cell_rows = [[v or None for v in r] for r in body]

    return ExtractedTable(
        title=title,
        row_headers=labels,
        col_headers=head,
        cells=cell_rows,
        source_page=page_index,
        kind="grid",
    )
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@register
class PptxAdapter(DocumentAdapter):
    """Adapter for ``.pptx`` files: one ``Page`` per slide."""

    extensions = ("pptx",)

    def parse(self, path: str) -> List[Page]:
        """Read the presentation at ``path`` and return its slides as pages.

        For every top-level shape on a slide, exactly one of three things
        happens, checked in this order: a table shape becomes a native
        table, a shape with a text frame is tokenised into words, and a
        picture shape is recorded as an image reference. Page dimensions
        are the slide size converted to points.
        """
        prs = Presentation(path)
        sw, sh = _pt(prs.slide_width), _pt(prs.slide_height)
        pages: List[Page] = []
        for i, slide in enumerate(prs.slides):
            words: List[Word] = []
            images: List[ImageRef] = []
            native: List[ExtractedTable] = []
            # Every table on the slide is titled with the slide title.
            title = _slide_title(slide)
            for shape in slide.shapes:
                x0, y0 = _pt(shape.left), _pt(shape.top)
                x1 = x0 + _pt(shape.width)
                if shape.has_table:
                    native.append(_table_to_extracted(shape, i, title))
                    continue
                if shape.has_text_frame:
                    # Paragraph positions are synthesised: start at the
                    # shape's top and step down one estimated line per
                    # paragraph.
                    y = y0
                    for para in shape.text_frame.paragraphs:
                        text = "".join(run.text for run in para.runs) or para.text
                        size = 0.0
                        bold = False
                        # The last run with an explicit size wins; any bold
                        # run marks the whole paragraph bold.
                        for run in para.runs:
                            if run.font.size is not None:
                                size = run.font.size.pt
                            if run.font.bold:
                                bold = True
                        words.extend(_tokens_in_box(text, x0, y, x1, size, bold))
                        # Line advance: 1.2x the font size, with a 14pt floor
                        # when the size is unknown or small.
                        y += max(size, 14) * 1.2
                    continue
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    # Best effort: a picture that cannot be read or decoded
                    # is skipped silently.
                    try:
                        blob = shape.image.blob
                        from PIL import Image

                        with Image.open(io.BytesIO(blob)) as im:
                            w, h = im.size
                        images.append(ImageRef(page=i, width=w, height=h, kind=classify(w, h)))
                    except Exception:
                        pass
            pages.append(
                Page(
                    index=i,
                    width=sw,
                    height=sh,
                    words=words,
                    images=images,
                    source_kind="pptx",
                    native_tables=native,
                )
            )
        return pages
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""XLSX adapter (openpyxl).
|
|
2
|
+
|
|
3
|
+
Spreadsheets are already a grid, so the work is mostly faithful transcription:
|
|
4
|
+
one sheet -> one page, merged header cells propagated across their span so
|
|
5
|
+
multi-column headers survive, values coerced to strings.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
|
|
11
|
+
from openpyxl import load_workbook
|
|
12
|
+
|
|
13
|
+
from ..model.elements import Page
|
|
14
|
+
from ..model.table import ExtractedTable
|
|
15
|
+
from .base import DocumentAdapter, register
|
|
16
|
+
|
|
17
|
+
_MAX_ROWS = 5000
|
|
18
|
+
_MAX_COLS = 200
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _s(v) -> Optional[str]:
|
|
22
|
+
if v is None:
|
|
23
|
+
return None
|
|
24
|
+
s = str(v).strip()
|
|
25
|
+
return s or None
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@register
class XlsxAdapter(DocumentAdapter):
    """Adapter for ``.xlsx`` / ``.xlsm`` workbooks: one ``Page`` per sheet."""

    extensions = ("xlsx", "xlsm")

    def parse(self, path: str) -> List[Page]:
        """Load the workbook at ``path`` and return one page per worksheet.

        Each sheet is read up to ``_MAX_ROWS`` x ``_MAX_COLS`` into a grid
        of optional strings. The top-left value of every merged range is
        copied across the part of the range inside that window, the grid is
        trimmed, and the result becomes the page's single native table. A
        sheet with no rows or columns yields a page without tables.
        """
        workbook = load_workbook(path, data_only=True)
        out: List[Page] = []
        for sheet_no, sheet in enumerate(workbook.worksheets):
            n_rows = min(sheet.max_row or 0, _MAX_ROWS)
            n_cols = min(sheet.max_column or 0, _MAX_COLS)
            if n_rows == 0 or n_cols == 0:
                out.append(Page(index=sheet_no, width=0, height=0, source_kind="xlsx"))
                continue

            grid = []
            for r in range(1, n_rows + 1):
                grid.append([_s(sheet.cell(r, c).value) for c in range(1, n_cols + 1)])

            # Fill each merged range with the value of its top-left cell.
            for merged in sheet.merged_cells.ranges:
                anchor = _s(sheet.cell(merged.min_row, merged.min_col).value)
                last_r = min(merged.max_row, n_rows)
                last_c = min(merged.max_col, n_cols)
                for r in range(merged.min_row, last_r + 1):
                    for c in range(merged.min_col, last_c + 1):
                        grid[r - 1][c - 1] = anchor

            table = _grid_to_table(_trim(grid), sheet_no, title=sheet.title)
            tables = [table] if table else []
            out.append(Page(index=sheet_no, width=0, height=0, source_kind="xlsx", native_tables=tables))
        return out
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _trim(grid: List[List[Optional[str]]]) -> List[List[Optional[str]]]:
|
|
57
|
+
while grid and all(v is None for v in grid[-1]):
|
|
58
|
+
grid.pop()
|
|
59
|
+
while grid and all(v is None for v in grid[0]):
|
|
60
|
+
grid.pop(0)
|
|
61
|
+
if not grid:
|
|
62
|
+
return grid
|
|
63
|
+
ncol = len(grid[0])
|
|
64
|
+
last = 0
|
|
65
|
+
for row in grid:
|
|
66
|
+
for c in range(ncol - 1, -1, -1):
|
|
67
|
+
if c < len(row) and row[c] is not None:
|
|
68
|
+
last = max(last, c)
|
|
69
|
+
break
|
|
70
|
+
return [row[: last + 1] for row in grid]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _grid_to_table(grid: List[List[Optional[str]]], page_index: int, title: Optional[str]) -> Optional[ExtractedTable]:
|
|
74
|
+
if not grid:
|
|
75
|
+
return None
|
|
76
|
+
header = grid[0]
|
|
77
|
+
body = grid[1:]
|
|
78
|
+
col_headers = header[1:] if len(header) > 1 else header
|
|
79
|
+
row_headers = [(r[0] if r else "") or "" for r in body]
|
|
80
|
+
cells = [r[1:] if len(r) > 1 else r for r in body]
|
|
81
|
+
return ExtractedTable(
|
|
82
|
+
title=title,
|
|
83
|
+
row_headers=row_headers,
|
|
84
|
+
col_headers=[h or f"col{i + 1}" for i, h in enumerate(col_headers)],
|
|
85
|
+
cells=cells if cells else [[v for v in r] for r in body],
|
|
86
|
+
source_page=page_index,
|
|
87
|
+
kind="grid",
|
|
88
|
+
)
|