doctra 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +19 -0
- doctra/cli/__init__.py +27 -0
- doctra/cli/main.py +856 -0
- doctra/cli/utils.py +340 -0
- doctra/engines/__init__.py +0 -0
- doctra/engines/layout/__init__.py +0 -0
- doctra/engines/layout/layout_models.py +90 -0
- doctra/engines/layout/paddle_layout.py +225 -0
- doctra/engines/ocr/__init__.py +4 -0
- doctra/engines/ocr/api.py +36 -0
- doctra/engines/ocr/path_resolver.py +48 -0
- doctra/engines/ocr/pytesseract_engine.py +76 -0
- doctra/engines/vlm/__init__.py +0 -0
- doctra/engines/vlm/outlines_types.py +31 -0
- doctra/engines/vlm/provider.py +58 -0
- doctra/engines/vlm/service.py +117 -0
- doctra/exporters/__init__.py +0 -0
- doctra/exporters/excel_writer.py +197 -0
- doctra/exporters/image_saver.py +42 -0
- doctra/exporters/markdown_table.py +56 -0
- doctra/exporters/markdown_writer.py +29 -0
- doctra/parsers/__init__.py +6 -0
- doctra/parsers/layout_order.py +16 -0
- doctra/parsers/structured_pdf_parser.py +434 -0
- doctra/parsers/table_chart_extractor.py +283 -0
- doctra/utils/__init__.py +0 -0
- doctra/utils/bbox.py +18 -0
- doctra/utils/constants.py +8 -0
- doctra/utils/file_ops.py +26 -0
- doctra/utils/io_utils.py +10 -0
- doctra/utils/ocr_utils.py +20 -0
- doctra/utils/pdf_io.py +19 -0
- doctra/utils/quiet.py +13 -0
- doctra/utils/structured_utils.py +49 -0
- doctra/version.py +2 -0
- doctra-0.1.0.dist-info/METADATA +626 -0
- doctra-0.1.0.dist-info/RECORD +40 -0
- doctra-0.1.0.dist-info/WHEEL +5 -0
- doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
- doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,283 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
from typing import List, Dict, Any
|
5
|
+
from contextlib import ExitStack
|
6
|
+
from pathlib import Path
|
7
|
+
|
8
|
+
from PIL import Image
|
9
|
+
from tqdm import tqdm
|
10
|
+
|
11
|
+
from doctra.utils.pdf_io import render_pdf_to_images
|
12
|
+
from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
|
13
|
+
from doctra.engines.layout.layout_models import LayoutPage
|
14
|
+
|
15
|
+
from doctra.parsers.layout_order import reading_order_key
|
16
|
+
from doctra.exporters.image_saver import save_box_image
|
17
|
+
from doctra.utils.file_ops import ensure_output_dirs
|
18
|
+
|
19
|
+
from doctra.engines.vlm.service import VLMStructuredExtractor
|
20
|
+
from doctra.exporters.excel_writer import write_structured_excel
|
21
|
+
from doctra.utils.structured_utils import to_structured_dict
|
22
|
+
from doctra.exporters.markdown_table import render_markdown_table
|
23
|
+
from doctra.exporters.markdown_writer import write_markdown
|
24
|
+
|
25
|
+
|
26
|
+
class ChartTablePDFParser:
|
27
|
+
"""
|
28
|
+
Specialized PDF parser for extracting charts and tables.
|
29
|
+
|
30
|
+
Focuses specifically on chart and table extraction from PDF documents,
|
31
|
+
with optional VLM (Vision Language Model) processing to convert visual
|
32
|
+
elements into structured data.
|
33
|
+
|
34
|
+
:param extract_charts: Whether to extract charts from the document (default: True)
|
35
|
+
:param extract_tables: Whether to extract tables from the document (default: True)
|
36
|
+
:param use_vlm: Whether to use VLM for structured data extraction (default: False)
|
37
|
+
:param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
|
38
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
39
|
+
:param vlm_api_key: API key for VLM provider (required if use_vlm is True)
|
40
|
+
:param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
|
41
|
+
:param dpi: DPI for PDF rendering (default: 200)
|
42
|
+
:param min_score: Minimum confidence score for layout detection (default: 0.0)
|
43
|
+
"""
|
44
|
+
|
45
|
+
def __init__(
|
46
|
+
self,
|
47
|
+
*,
|
48
|
+
extract_charts: bool = True,
|
49
|
+
extract_tables: bool = True,
|
50
|
+
use_vlm: bool = False,
|
51
|
+
vlm_provider: str = "gemini",
|
52
|
+
vlm_model: str | None = None,
|
53
|
+
vlm_api_key: str | None = None,
|
54
|
+
layout_model_name: str = "PP-DocLayout_plus-L",
|
55
|
+
dpi: int = 200,
|
56
|
+
min_score: float = 0.0,
|
57
|
+
):
|
58
|
+
"""
|
59
|
+
Initialize the ChartTablePDFParser with extraction configuration.
|
60
|
+
|
61
|
+
Sets up the layout detection engine and optionally the VLM service
|
62
|
+
for structured data extraction.
|
63
|
+
|
64
|
+
:param extract_charts: Whether to extract charts from the document
|
65
|
+
:param extract_tables: Whether to extract tables from the document
|
66
|
+
:param use_vlm: Whether to use VLM for structured data extraction
|
67
|
+
:param vlm_provider: VLM provider to use ("gemini" or "openai")
|
68
|
+
:param vlm_model: Model name to use (defaults to provider-specific defaults)
|
69
|
+
:param vlm_api_key: API key for VLM provider
|
70
|
+
:param layout_model_name: Layout detection model name
|
71
|
+
:param dpi: DPI for PDF rendering
|
72
|
+
:param min_score: Minimum confidence score for layout detection
|
73
|
+
:raises ValueError: If neither extract_charts nor extract_tables is True
|
74
|
+
"""
|
75
|
+
# Validation
|
76
|
+
if not extract_charts and not extract_tables:
|
77
|
+
raise ValueError("At least one of extract_charts or extract_tables must be True")
|
78
|
+
|
79
|
+
self.extract_charts = extract_charts
|
80
|
+
self.extract_tables = extract_tables
|
81
|
+
self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
|
82
|
+
self.dpi = dpi
|
83
|
+
self.min_score = min_score
|
84
|
+
|
85
|
+
self.use_vlm = use_vlm
|
86
|
+
self.vlm = None
|
87
|
+
if self.use_vlm:
|
88
|
+
self.vlm = VLMStructuredExtractor(
|
89
|
+
vlm_provider=vlm_provider,
|
90
|
+
vlm_model=vlm_model,
|
91
|
+
api_key=vlm_api_key,
|
92
|
+
)
|
93
|
+
|
94
|
+
def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
|
95
|
+
"""
|
96
|
+
Parse a PDF document and extract charts and/or tables.
|
97
|
+
|
98
|
+
Processes the PDF through layout detection, extracts the specified
|
99
|
+
element types, saves cropped images, and optionally converts them
|
100
|
+
to structured data using VLM.
|
101
|
+
|
102
|
+
:param pdf_path: Path to the input PDF file
|
103
|
+
:param output_base_dir: Base directory for output files (default: "outputs")
|
104
|
+
:return: None
|
105
|
+
"""
|
106
|
+
# Create output directory structure: outputs/structured_doc/<filename>/
|
107
|
+
pdf_name = Path(pdf_path).stem
|
108
|
+
out_dir = os.path.join(output_base_dir, pdf_name)
|
109
|
+
os.makedirs(out_dir, exist_ok=True)
|
110
|
+
|
111
|
+
# Create subdirectories based on what we're extracting
|
112
|
+
charts_dir = None
|
113
|
+
tables_dir = None
|
114
|
+
|
115
|
+
if self.extract_charts:
|
116
|
+
charts_dir = os.path.join(out_dir, "charts")
|
117
|
+
os.makedirs(charts_dir, exist_ok=True)
|
118
|
+
|
119
|
+
if self.extract_tables:
|
120
|
+
tables_dir = os.path.join(out_dir, "tables")
|
121
|
+
os.makedirs(tables_dir, exist_ok=True)
|
122
|
+
|
123
|
+
pages: List[LayoutPage] = self.layout_engine.predict_pdf(
|
124
|
+
pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
|
125
|
+
)
|
126
|
+
pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]
|
127
|
+
|
128
|
+
# Determine which labels to extract
|
129
|
+
target_labels = []
|
130
|
+
if self.extract_charts:
|
131
|
+
target_labels.append("chart")
|
132
|
+
if self.extract_tables:
|
133
|
+
target_labels.append("table")
|
134
|
+
|
135
|
+
# Count items for progress bars
|
136
|
+
chart_count = sum(sum(1 for b in p.boxes if b.label == "chart") for p in pages) if self.extract_charts else 0
|
137
|
+
table_count = sum(sum(1 for b in p.boxes if b.label == "table") for p in pages) if self.extract_tables else 0
|
138
|
+
|
139
|
+
# Prepare output content
|
140
|
+
if self.use_vlm:
|
141
|
+
md_lines: List[str] = ["# Extracted Charts and Tables\n"]
|
142
|
+
structured_items: List[Dict[str, Any]] = []
|
143
|
+
|
144
|
+
# Progress bar descriptions
|
145
|
+
charts_desc = "Charts (VLM → table)" if self.use_vlm else "Charts (cropped)"
|
146
|
+
tables_desc = "Tables (VLM → table)" if self.use_vlm else "Tables (cropped)"
|
147
|
+
|
148
|
+
chart_counter = 1
|
149
|
+
table_counter = 1
|
150
|
+
|
151
|
+
with ExitStack() as stack:
|
152
|
+
charts_bar = stack.enter_context(
|
153
|
+
tqdm(total=chart_count, desc=charts_desc, leave=True)) if chart_count else None
|
154
|
+
tables_bar = stack.enter_context(
|
155
|
+
tqdm(total=table_count, desc=tables_desc, leave=True)) if table_count else None
|
156
|
+
|
157
|
+
for p in pages:
|
158
|
+
page_num = p.page_index
|
159
|
+
page_img: Image.Image = pil_pages[page_num - 1]
|
160
|
+
|
161
|
+
# Only process selected item types
|
162
|
+
target_items = [box for box in p.boxes if box.label in target_labels]
|
163
|
+
|
164
|
+
if target_items and self.use_vlm:
|
165
|
+
md_lines.append(f"\n## Page {page_num}\n")
|
166
|
+
|
167
|
+
for box in sorted(target_items, key=reading_order_key):
|
168
|
+
# Handle charts
|
169
|
+
if box.label == "chart" and self.extract_charts:
|
170
|
+
chart_filename = f"chart_{chart_counter:03d}.png"
|
171
|
+
chart_path = os.path.join(charts_dir, chart_filename)
|
172
|
+
|
173
|
+
# Save image
|
174
|
+
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
175
|
+
cropped_img.save(chart_path)
|
176
|
+
|
177
|
+
# Handle VLM processing if enabled
|
178
|
+
if self.use_vlm and self.vlm:
|
179
|
+
rel_path = os.path.join("charts", chart_filename)
|
180
|
+
wrote_table = False
|
181
|
+
|
182
|
+
try:
|
183
|
+
extracted_chart = self.vlm.extract_chart(chart_path)
|
184
|
+
structured_item = to_structured_dict(extracted_chart)
|
185
|
+
if structured_item:
|
186
|
+
structured_items.append(structured_item)
|
187
|
+
md_lines.append(
|
188
|
+
render_markdown_table(
|
189
|
+
structured_item.get("headers"),
|
190
|
+
structured_item.get("rows"),
|
191
|
+
title=structured_item.get(
|
192
|
+
"title") or f"Chart {chart_counter} — page {page_num}"
|
193
|
+
)
|
194
|
+
)
|
195
|
+
wrote_table = True
|
196
|
+
except Exception:
|
197
|
+
pass
|
198
|
+
|
199
|
+
if not wrote_table:
|
200
|
+
md_lines.append(f"\n")
|
201
|
+
|
202
|
+
chart_counter += 1
|
203
|
+
if charts_bar:
|
204
|
+
charts_bar.update(1)
|
205
|
+
|
206
|
+
# Handle tables
|
207
|
+
elif box.label == "table" and self.extract_tables:
|
208
|
+
table_filename = f"table_{table_counter:03d}.png"
|
209
|
+
table_path = os.path.join(tables_dir, table_filename)
|
210
|
+
|
211
|
+
# Save image
|
212
|
+
cropped_img = page_img.crop((box.x1, box.y1, box.x2, box.y2))
|
213
|
+
cropped_img.save(table_path)
|
214
|
+
|
215
|
+
# Handle VLM processing if enabled
|
216
|
+
if self.use_vlm and self.vlm:
|
217
|
+
rel_path = os.path.join("tables", table_filename)
|
218
|
+
wrote_table = False
|
219
|
+
|
220
|
+
try:
|
221
|
+
extracted_table = self.vlm.extract_table(table_path)
|
222
|
+
structured_item = to_structured_dict(extracted_table)
|
223
|
+
if structured_item:
|
224
|
+
structured_items.append(structured_item)
|
225
|
+
md_lines.append(
|
226
|
+
render_markdown_table(
|
227
|
+
structured_item.get("headers"),
|
228
|
+
structured_item.get("rows"),
|
229
|
+
title=structured_item.get(
|
230
|
+
"title") or f"Table {table_counter} — page {page_num}"
|
231
|
+
)
|
232
|
+
)
|
233
|
+
wrote_table = True
|
234
|
+
except Exception:
|
235
|
+
pass
|
236
|
+
|
237
|
+
if not wrote_table:
|
238
|
+
md_lines.append(f"\n")
|
239
|
+
|
240
|
+
table_counter += 1
|
241
|
+
if tables_bar:
|
242
|
+
tables_bar.update(1)
|
243
|
+
|
244
|
+
# Write outputs only if VLM is used
|
245
|
+
md_path = None
|
246
|
+
excel_path = None
|
247
|
+
|
248
|
+
if self.use_vlm:
|
249
|
+
# Write markdown file
|
250
|
+
md_path = os.path.join(out_dir, "charts.md")
|
251
|
+
with open(md_path, 'w', encoding='utf-8') as f:
|
252
|
+
f.write('\n'.join(md_lines))
|
253
|
+
|
254
|
+
# Write Excel file if we have structured data
|
255
|
+
if structured_items:
|
256
|
+
excel_path = os.path.join(out_dir, "charts.xlsx")
|
257
|
+
write_structured_excel(excel_path, structured_items)
|
258
|
+
|
259
|
+
# Print results
|
260
|
+
extraction_types = []
|
261
|
+
if self.extract_charts:
|
262
|
+
extraction_types.append("charts")
|
263
|
+
if self.extract_tables:
|
264
|
+
extraction_types.append("tables")
|
265
|
+
|
266
|
+
print(f"{' and '.join(extraction_types).title()} extraction completed successfully.")
|
267
|
+
print(f"- Output directory: {out_dir}")
|
268
|
+
|
269
|
+
if charts_dir and self.extract_charts:
|
270
|
+
print(f"- Charts directory: {charts_dir}")
|
271
|
+
print(f"- Charts extracted: {chart_counter - 1}")
|
272
|
+
|
273
|
+
if tables_dir and self.extract_tables:
|
274
|
+
print(f"- Tables directory: {tables_dir}")
|
275
|
+
print(f"- Tables extracted: {table_counter - 1}")
|
276
|
+
|
277
|
+
if md_path:
|
278
|
+
print(f"- Markdown file: {md_path}")
|
279
|
+
if excel_path:
|
280
|
+
print(f"- Excel file: {excel_path}")
|
281
|
+
|
282
|
+
if not self.use_vlm:
|
283
|
+
print("- Note: VLM disabled - only cropped images saved")
|
doctra/utils/__init__.py
ADDED
File without changes
|
doctra/utils/bbox.py
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
import math
|
3
|
+
from typing import Tuple
|
4
|
+
|
5
|
+
def clip_bbox_to_image(x1: float, y1: float, x2: float, y2: float, w: int, h: int) -> Tuple[int, int, int, int]:
    """
    Clip a float bbox to image bounds, return integer crop box (left, top, right, bottom).

    The left/top edges are floored and the right/bottom edges are ceiled, so the
    integer box always covers the float box before clipping.

    Guarantees a non-empty crop (``right > left`` and ``bottom > top``) whenever
    the image has non-zero width and height. This holds even if the input box is
    degenerate, inverted, or lies entirely outside the image; in that last case
    the crop is the single pixel row/column closest to the box.

    :param x1: Left edge of the box
    :param y1: Top edge of the box
    :param x2: Right edge of the box
    :param y2: Bottom edge of the box
    :param w: Image width in pixels
    :param h: Image height in pixels
    :return: Tuple ``(left, top, right, bottom)`` of ints within the image bounds
    """
    # The origin is clamped to size - 1 (not size) so at least one pixel is
    # always left between it and the image border.
    left = max(0, min(int(math.floor(x1)), w - 1))
    top = max(0, min(int(math.floor(y1)), h - 1))
    # The far edge is at least one pixel past the origin, capped at the border.
    right = min(w, max(int(math.ceil(x2)), left + 1))
    bottom = min(h, max(int(math.ceil(y2)), top + 1))
    return left, top, right, bottom
|
doctra/utils/file_ops.py
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import re
|
5
|
+
from typing import Dict
|
6
|
+
|
7
|
+
def ensure_output_dirs(base_out: str, image_subdirs: Dict[str, str]) -> Dict[str, str]:
    """
    Make sure the output tree exists on disk and report where everything lives.

    Creates ``<base_out>/images`` and, below it, one folder per entry of
    ``image_subdirs``. Directories that already exist are left untouched.

    :param base_out: Root output directory
    :param image_subdirs: Mapping of label -> sub-folder name under ``images``
    :return: Dict with ``"base"`` and ``"images"`` plus one ``label -> path``
             entry for every label in ``image_subdirs``
    """
    images_root = os.path.join(base_out, "images")
    os.makedirs(images_root, exist_ok=True)

    locations: Dict[str, str] = {"base": base_out, "images": images_root}
    for label in image_subdirs:
        target = os.path.join(images_root, image_subdirs[label])
        os.makedirs(target, exist_ok=True)
        # A label named "base" or "images" replaces the default entry.
        locations[label] = target
    return locations
|
20
|
+
|
21
|
+
def sanitize_filename(name: str) -> str:
    """
    Turn an arbitrary string into a filename-friendly one.

    Every run of characters other than word characters, ``-`` and ``.`` is
    collapsed into a single underscore; underscores at either end of the
    result are then removed.

    :param name: Raw name to clean up
    :return: Sanitized name (may be empty if nothing safe remains)
    """
    unsafe_runs = re.compile(r"[^\w\-.]+")
    cleaned = unsafe_runs.sub("_", name)
    return cleaned.strip("_")
|
doctra/utils/io_utils.py
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from PIL import Image
|
5
|
+
from doctra.engines.ocr import PytesseractOCREngine
|
6
|
+
from doctra.engines.layout.layout_models import LayoutBox
|
7
|
+
from doctra.utils.bbox import clip_bbox_to_image
|
8
|
+
|
9
|
+
def ocr_box_text(ocr_engine: PytesseractOCREngine, page_img: Image.Image, box: LayoutBox) -> str:
    """
    Run OCR on the region of a page covered by one layout box.

    The box is clipped to the page bounds before cropping. The recognized
    text keeps its line breaks, but spaces/tabs directly before a newline are
    dropped, runs of three or more newlines shrink to a single blank line,
    and surrounding whitespace is stripped.

    :param ocr_engine: Engine whose ``recognize`` method is given the crop
    :param page_img: Full page image
    :param box: Layout box providing ``x1``, ``y1``, ``x2``, ``y2``
    :return: Normalized text of the region
    """
    page_w, page_h = page_img.size
    crop_box = clip_bbox_to_image(box.x1, box.y1, box.x2, box.y2, page_w, page_h)
    raw = ocr_engine.recognize(page_img.crop(crop_box))

    # Remove trailing spaces/tabs on each line first so that lines holding
    # only whitespace count as blank for the collapse below.
    without_trailing_ws = re.sub(r"[ \t]+\n", "\n", raw)
    return re.sub(r"\n{3,}", "\n\n", without_trailing_ws).strip()
|
doctra/utils/pdf_io.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
from typing import List, Tuple
|
2
|
+
from pdf2image import convert_from_path # requires Poppler installed locally
|
3
|
+
from PIL import Image
|
4
|
+
|
5
|
+
def render_pdf_to_images(pdf_path: str, dpi: int = 200, fmt: str = "RGB") -> List[Tuple[Image.Image, int, int]]:
    """
    Render a PDF into PIL images.

    Every page produced by ``convert_from_path`` is converted to the ``fmt``
    mode unless it is already in that mode or ``fmt`` is empty.

    :param pdf_path: Path of the PDF to render
    :param dpi: Rendering resolution passed to ``convert_from_path``
    :param fmt: Target PIL image mode; a falsy value keeps each page's mode
    :return: List of tuples ``(pil_image, width, height)``, one per page, in
             the order returned by ``convert_from_path``
    """
    rendered: List[Tuple[Image.Image, int, int]] = []
    for page in convert_from_path(pdf_path, dpi=dpi):  # may raise if Poppler missing
        needs_conversion = bool(fmt) and page.mode != fmt
        image = page.convert(fmt) if needs_conversion else page
        width, height = image.size
        rendered.append((image, width, height))
    return rendered
|
doctra/utils/quiet.py
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
import os
|
3
|
+
import contextlib
|
4
|
+
|
5
|
+
@contextlib.contextmanager
def suppress_output():
    """
    Context manager that discards everything written to stdout and stderr.

    Both streams are redirected to ``os.devnull`` for the duration of the
    ``with`` body and restored afterwards; the devnull handle is closed on
    exit, whether or not the body raised.
    """
    with open(os.devnull, "w") as sink:
        with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(sink):
            yield
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
from typing import Any, Dict, Optional
|
3
|
+
import json
|
4
|
+
|
5
|
+
try:
    from pydantic import BaseModel  # type: ignore
except Exception:  # pydantic not strictly required for normalization
    class BaseModel:  # fallback stub
        pass


def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
    """
    Normalize a VLM result into a plain ``{title, headers, rows}`` dict.

    Accepts a VLM result that might be:
      - JSON string
      - dict
      - Pydantic BaseModel (v1 .dict() or v2 .model_dump())

    All three input kinds go through the same normalization: a missing or
    empty ``title`` becomes ``"Untitled"``, missing or empty ``headers`` /
    ``rows`` become ``[]``, and any other keys are dropped.

    :param obj: Raw VLM result
    :return: Dict with exactly the keys ``title``, ``headers``, ``rows``; or
             ``None`` if ``obj`` is ``None``, is not valid JSON, cannot be
             dumped to a dict, or has non-list ``headers`` / ``rows``
    """
    if obj is None:
        return None

    # JSON string from VLM
    if isinstance(obj, str):
        try:
            obj = json.loads(obj)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; malformed output is
            # treated as "no structured data" rather than an error.
            return None

    # Pydantic model: dump it to a plain dict, then fall through to the same
    # normalization as every other input so the returned shape is consistent.
    if isinstance(obj, BaseModel):
        dumped = None
        for method_name in ("model_dump", "dict"):  # pydantic v2 first, then v1
            dump = getattr(obj, method_name, None)
            if not callable(dump):
                continue
            try:
                dumped = dump()
            except Exception:
                # Best-effort: try the next API before giving up.
                continue
            break
        obj = dumped

    # Plain dict (given directly, decoded from JSON, or dumped from a model)
    if isinstance(obj, dict):
        title = obj.get("title") or "Untitled"
        headers = obj.get("headers") or []
        rows = obj.get("rows") or []
        # Basic shape checks
        if not isinstance(headers, list) or not isinstance(rows, list):
            return None
        return {"title": title, "headers": headers, "rows": rows}

    return None
|
doctra/version.py
ADDED