doctra-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,283 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import List, Dict, Any
5
+ from contextlib import ExitStack
6
+ from pathlib import Path
7
+
8
+ from PIL import Image
9
+ from tqdm import tqdm
10
+
11
+ from doctra.utils.pdf_io import render_pdf_to_images
12
+ from doctra.engines.layout.paddle_layout import PaddleLayoutEngine
13
+ from doctra.engines.layout.layout_models import LayoutPage
14
+
15
+ from doctra.parsers.layout_order import reading_order_key
16
+ from doctra.exporters.image_saver import save_box_image
17
+ from doctra.utils.file_ops import ensure_output_dirs
18
+
19
+ from doctra.engines.vlm.service import VLMStructuredExtractor
20
+ from doctra.exporters.excel_writer import write_structured_excel
21
+ from doctra.utils.structured_utils import to_structured_dict
22
+ from doctra.exporters.markdown_table import render_markdown_table
23
+ from doctra.exporters.markdown_writer import write_markdown
24
+
25
+
26
class ChartTablePDFParser:
    """
    Specialized PDF parser for extracting charts and tables.

    Focuses specifically on chart and table extraction from PDF documents,
    with optional VLM (Vision Language Model) processing to convert visual
    elements into structured data.

    :param extract_charts: Whether to extract charts from the document (default: True)
    :param extract_tables: Whether to extract tables from the document (default: True)
    :param use_vlm: Whether to use VLM for structured data extraction (default: False)
    :param vlm_provider: VLM provider to use ("gemini" or "openai", default: "gemini")
    :param vlm_model: Model name to use (defaults to provider-specific defaults)
    :param vlm_api_key: API key for VLM provider (required if use_vlm is True)
    :param layout_model_name: Layout detection model name (default: "PP-DocLayout_plus-L")
    :param dpi: DPI for PDF rendering (default: 200)
    :param min_score: Minimum confidence score for layout detection (default: 0.0)
    """

    # Layout label -> name of the output subdirectory holding its cropped images.
    # The title-cased subdirectory name doubles as the display name in progress
    # bars and the final summary ("Charts", "Tables").
    _SUBDIRS = {"chart": "charts", "table": "tables"}

    def __init__(
        self,
        *,
        extract_charts: bool = True,
        extract_tables: bool = True,
        use_vlm: bool = False,
        vlm_provider: str = "gemini",
        vlm_model: str | None = None,
        vlm_api_key: str | None = None,
        layout_model_name: str = "PP-DocLayout_plus-L",
        dpi: int = 200,
        min_score: float = 0.0,
    ):
        """
        Initialize the ChartTablePDFParser with extraction configuration.

        Sets up the layout detection engine and optionally the VLM service
        for structured data extraction.

        :param extract_charts: Whether to extract charts from the document
        :param extract_tables: Whether to extract tables from the document
        :param use_vlm: Whether to use VLM for structured data extraction
        :param vlm_provider: VLM provider to use ("gemini" or "openai")
        :param vlm_model: Model name to use (defaults to provider-specific defaults)
        :param vlm_api_key: API key for VLM provider
        :param layout_model_name: Layout detection model name
        :param dpi: DPI for PDF rendering
        :param min_score: Minimum confidence score for layout detection
        :raises ValueError: If neither extract_charts nor extract_tables is True
        """
        # Validate before constructing the (potentially heavy) layout engine.
        if not extract_charts and not extract_tables:
            raise ValueError("At least one of extract_charts or extract_tables must be True")

        self.extract_charts = extract_charts
        self.extract_tables = extract_tables
        self.layout_engine = PaddleLayoutEngine(model_name=layout_model_name)
        self.dpi = dpi
        self.min_score = min_score

        self.use_vlm = use_vlm
        self.vlm = None
        if self.use_vlm:
            self.vlm = VLMStructuredExtractor(
                vlm_provider=vlm_provider,
                vlm_model=vlm_model,
                api_key=vlm_api_key,
            )

    def parse(self, pdf_path: str, output_base_dir: str = "outputs") -> None:
        """
        Parse a PDF document and extract charts and/or tables.

        Processes the PDF through layout detection, extracts the specified
        element types, saves cropped images, and optionally converts them
        to structured data using VLM. When VLM is enabled, a ``charts.md``
        file (and ``charts.xlsx`` if any structured data was produced) is
        written next to the image subdirectories. A summary is printed at
        the end.

        :param pdf_path: Path to the input PDF file
        :param output_base_dir: Base directory for output files (default: "outputs")
        :return: None
        """
        # Output layout: <output_base_dir>/<pdf file stem>/{charts,tables}/
        pdf_name = Path(pdf_path).stem
        out_dir = os.path.join(output_base_dir, pdf_name)
        os.makedirs(out_dir, exist_ok=True)

        # Labels to extract, in a fixed order (charts first, then tables).
        target_labels = []
        if self.extract_charts:
            target_labels.append("chart")
        if self.extract_tables:
            target_labels.append("table")

        # One image subdirectory per enabled label.
        label_dirs: Dict[str, str] = {}
        for label in target_labels:
            label_dirs[label] = os.path.join(out_dir, self._SUBDIRS[label])
            os.makedirs(label_dirs[label], exist_ok=True)

        pages: List[LayoutPage] = self.layout_engine.predict_pdf(
            pdf_path, batch_size=1, layout_nms=True, dpi=self.dpi, min_score=self.min_score
        )
        pil_pages = [im for (im, _, _) in render_pdf_to_images(pdf_path, dpi=self.dpi)]

        # Totals are only needed to size the progress bars.
        totals = {
            label: sum(1 for p in pages for b in p.boxes if b.label == label)
            for label in target_labels
        }

        md_lines: List[str] = ["# Extracted Charts and Tables\n"]
        structured_items: List[Dict[str, Any]] = []
        extracted = {label: 0 for label in target_labels}
        bar_suffix = "(VLM → table)" if self.use_vlm else "(cropped)"

        with ExitStack() as stack:
            # A bar is only created for labels that actually occur in the document.
            bars = {
                label: stack.enter_context(
                    tqdm(
                        total=totals[label],
                        desc=f"{self._SUBDIRS[label].title()} {bar_suffix}",
                        leave=True,
                    )
                )
                for label in target_labels
                if totals[label]
            }

            for p in pages:
                page_num = p.page_index
                # NOTE(review): indexing with page_num - 1 assumes page_index is
                # 1-based; confirm against PaddleLayoutEngine.predict_pdf.
                page_img: Image.Image = pil_pages[page_num - 1]

                # Only process selected item types
                target_items = [box for box in p.boxes if box.label in target_labels]

                if target_items and self.use_vlm:
                    md_lines.append(f"\n## Page {page_num}\n")

                for box in sorted(target_items, key=reading_order_key):
                    label = box.label
                    number = extracted[label] + 1
                    filename = f"{label}_{number:03d}.png"
                    image_path = os.path.join(label_dirs[label], filename)

                    # Save the cropped region before any VLM work so the image
                    # exists even if structured extraction fails.
                    page_img.crop((box.x1, box.y1, box.x2, box.y2)).save(image_path)

                    if self.use_vlm and self.vlm:
                        self._append_vlm_output(
                            label,
                            image_path,
                            os.path.join(self._SUBDIRS[label], filename),
                            f"{label.title()} {number} — page {page_num}",
                            md_lines,
                            structured_items,
                        )

                    extracted[label] = number
                    bar = bars.get(label)
                    if bar is not None:
                        bar.update(1)

        # Write outputs only if VLM is used
        md_path = None
        excel_path = None

        if self.use_vlm:
            md_path = os.path.join(out_dir, "charts.md")
            with open(md_path, "w", encoding="utf-8") as f:
                f.write("\n".join(md_lines))

            # Write Excel file if we have structured data
            if structured_items:
                excel_path = os.path.join(out_dir, "charts.xlsx")
                write_structured_excel(excel_path, structured_items)

        # Print results
        extraction_types = [self._SUBDIRS[label] for label in target_labels]
        print(f"{' and '.join(extraction_types).title()} extraction completed successfully.")
        print(f"- Output directory: {out_dir}")

        for label in target_labels:
            display = self._SUBDIRS[label].title()
            print(f"- {display} directory: {label_dirs[label]}")
            print(f"- {display} extracted: {extracted[label]}")

        if md_path:
            print(f"- Markdown file: {md_path}")
        if excel_path:
            print(f"- Excel file: {excel_path}")

        if not self.use_vlm:
            print("- Note: VLM disabled - only cropped images saved")

    def _append_vlm_output(
        self,
        label: str,
        image_path: str,
        rel_path: str,
        caption: str,
        md_lines: List[str],
        structured_items: List[Dict[str, Any]],
    ) -> None:
        """
        Run the VLM on one cropped image and append the result to the outputs.

        On success the structured item is appended to ``structured_items`` and
        a rendered markdown table to ``md_lines``. If the VLM returns nothing
        usable or any step raises, a markdown image link is appended instead so
        the element still appears in the report.

        :param label: Layout label of the element ("chart" or "table")
        :param image_path: Path of the saved crop passed to the VLM
        :param rel_path: Image path relative to the markdown file
        :param caption: Fallback title / image alt text for this element
        :param md_lines: Markdown lines collected so far (mutated in place)
        :param structured_items: Structured items collected so far (mutated in place)
        :return: None
        """
        import logging  # local import: the module's import block is left untouched

        extract = self.vlm.extract_chart if label == "chart" else self.vlm.extract_table
        wrote_table = False

        try:
            raw_result = extract(image_path)
            structured_item = to_structured_dict(raw_result)
            if structured_item:
                structured_items.append(structured_item)
                md_lines.append(
                    render_markdown_table(
                        structured_item.get("headers"),
                        structured_item.get("rows"),
                        title=structured_item.get("title") or caption,
                    )
                )
                wrote_table = True
        except Exception as exc:
            # Best-effort: a failing VLM call must not abort the whole document,
            # but the failure is reported instead of being silently dropped.
            logging.getLogger(__name__).warning(
                "VLM extraction failed for %s (%s); falling back to image link",
                image_path,
                exc,
            )

        if not wrote_table:
            md_lines.append(f"![{caption}]({rel_path})\n")
File without changes
doctra/utils/bbox.py ADDED
@@ -0,0 +1,18 @@
1
+ from __future__ import annotations
2
+ import math
3
+ from typing import Tuple
4
+
5
def clip_bbox_to_image(x1: float, y1: float, x2: float, y2: float, w: int, h: int) -> Tuple[int, int, int, int]:
    """
    Clip a float bbox to image bounds, return integer crop box (left, top, right, bottom).

    The box is expanded outwards to whole pixels (floor on the start edge,
    ceil on the end edge) and then clamped to ``[0, w]`` / ``[0, h]``.
    Guarantees a non-empty crop (at least 1x1 pixel) whenever the image itself
    is non-empty, including for boxes lying entirely outside the image: such a
    box collapses onto the nearest edge pixel. For a zero-sized image
    dimension the corresponding span is necessarily empty.

    :param x1: Left edge of the box
    :param y1: Top edge of the box
    :param x2: Right edge of the box
    :param y2: Bottom edge of the box
    :param w: Image width in pixels
    :param h: Image height in pixels
    :return: Integer crop box ``(left, top, right, bottom)``
    """
    left, right = _clip_span(x1, x2, w)
    top, bottom = _clip_span(y1, y2, h)
    return left, top, right, bottom


def _clip_span(lo: float, hi: float, size: int) -> Tuple[int, int]:
    """Clamp one axis of a bbox to ``[0, size]`` and keep it at least one pixel wide."""
    start = max(0, min(int(math.floor(lo)), size))
    end = max(0, min(int(math.ceil(hi)), size))
    if end <= start:
        # Degenerate span. If start sits on the far edge (start == size) there is
        # no room for start + 1, so pull start back inside the image first.
        start = max(0, min(start, size - 1))
        end = min(size, start + 1)
    return start, end
@@ -0,0 +1,8 @@
1
# Layout labels treated as visual elements.
# NOTE(review): presumably these are excluded from text/OCR processing and saved
# as images instead — confirm against the callers of EXCLUDE_LABELS.
EXCLUDE_LABELS = {
    "figure",
    "chart",
    "table",
}

# Name of the subdirectory in which cropped images are stored, keyed by label.
IMAGE_SUBDIRS = {"figure": "figures", "chart": "charts", "table": "tables"}
@@ -0,0 +1,26 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import re
5
+ from typing import Dict
6
+
7
def ensure_output_dirs(base_out: str, image_subdirs: Dict[str, str]) -> Dict[str, str]:
    """
    Make sure the output tree ``<base_out>/images/<subdir>`` exists.

    Creates ``<base_out>/images`` and one folder below it for every value in
    ``image_subdirs``; already existing directories are left untouched.

    Returns a dict holding ``"base"`` (``base_out`` as given), ``"images"``
    (the images root) and, for every label in ``image_subdirs``, the path of
    that label's folder.
    """
    images_root = os.path.join(base_out, "images")
    os.makedirs(images_root, exist_ok=True)

    result = {"base": base_out, "images": images_root}
    for label, subdir in image_subdirs.items():
        label_dir = os.path.join(images_root, subdir)
        os.makedirs(label_dir, exist_ok=True)
        result[label] = label_dir
    return result
20
+
21
def sanitize_filename(name: str) -> str:
    """
    Make ``name`` safe to use as a filename.

    Every run of characters other than word characters, ``-`` and ``.`` is
    collapsed into a single underscore; underscores at either end of the
    result are then removed.
    """
    unsafe_run = re.compile(r"[^\w\-.]+")
    cleaned = unsafe_run.sub("_", name)
    return cleaned.strip("_")
@@ -0,0 +1,10 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+ from PIL import Image # <-- import Image explicitly
6
+ import PIL
7
+
8
+
9
def get_image_from_local(file_path):
    """Open the image file at ``file_path`` with Pillow and return the opened image."""
    opened = Image.open(file_path)
    return opened
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from PIL import Image
5
+ from doctra.engines.ocr import PytesseractOCREngine
6
+ from doctra.engines.layout.layout_models import LayoutBox
7
+ from doctra.utils.bbox import clip_bbox_to_image
8
+
9
def ocr_box_text(ocr_engine: PytesseractOCREngine, page_img: Image.Image, box: LayoutBox) -> str:
    """
    Run OCR on the region of ``page_img`` covered by ``box``.

    The box is clipped to the page bounds before cropping. In the recognized
    text, spaces/tabs directly before a newline are removed, runs of three or
    more newlines are reduced to two, and surrounding whitespace is stripped.
    """
    page_w, page_h = page_img.size
    left, top, right, bottom = clip_bbox_to_image(box.x1, box.y1, box.x2, box.y2, page_w, page_h)
    region = page_img.crop((left, top, right, bottom))

    recognized = ocr_engine.recognize(region)
    # Drop trailing spaces/tabs at line ends, then collapse excess blank lines.
    no_trailing_ws = re.sub(r"[ \t]+\n", "\n", recognized)
    collapsed = re.sub(r"\n{3,}", "\n\n", no_trailing_ws)
    return collapsed.strip()
doctra/utils/pdf_io.py ADDED
@@ -0,0 +1,19 @@
1
+ from typing import List, Tuple
2
+ from pdf2image import convert_from_path # requires Poppler installed locally
3
+ from PIL import Image
4
+
5
def render_pdf_to_images(pdf_path: str, dpi: int = 200, fmt: str = "RGB") -> List[Tuple[Image.Image, int, int]]:
    """
    Render every page of a PDF to a PIL image.

    Pages are rendered with ``convert_from_path`` at the given ``dpi``. When
    ``fmt`` is non-empty, any page whose mode differs from it is converted to
    that mode.

    Returns:
        One ``(pil_image, width, height)`` tuple per rendered page, in the
        order the pages were returned by the renderer.
    """
    rendered: List[Tuple[Image.Image, int, int]] = []
    # convert_from_path may raise if Poppler is missing.
    for page in convert_from_path(pdf_path, dpi=dpi):
        needs_conversion = bool(fmt) and page.mode != fmt
        if needs_conversion:
            page = page.convert(fmt)
        width, height = page.size
        rendered.append((page, width, height))
    return rendered
doctra/utils/quiet.py ADDED
@@ -0,0 +1,13 @@
1
+ from __future__ import annotations
2
+ import os
3
+ import contextlib
4
+
5
@contextlib.contextmanager
def suppress_output():
    """Context manager that sends stdout and stderr to the null device while active."""
    sink = open(os.devnull, "w")
    try:
        with contextlib.ExitStack() as redirects:
            redirects.enter_context(contextlib.redirect_stdout(sink))
            redirects.enter_context(contextlib.redirect_stderr(sink))
            yield
    finally:
        # Streams are restored by the ExitStack above before the sink is closed.
        sink.close()
@@ -0,0 +1,49 @@
1
+ from __future__ import annotations
2
+ from typing import Any, Dict, Optional
3
+ import json
4
+
5
try:
    from pydantic import BaseModel  # type: ignore
except Exception:  # pydantic not strictly required for normalization
    class BaseModel:  # fallback stub
        pass


def to_structured_dict(obj: Any) -> Optional[Dict[str, Any]]:
    """
    Normalize a VLM result into a plain dict.

    Accepts a VLM result that might be:
      - JSON string
      - dict
      - Pydantic BaseModel (v1 .dict() or v2 .model_dump())

    All three forms go through the same normalization: a missing or empty
    ``title`` becomes ``"Untitled"``, missing or empty ``headers`` / ``rows``
    become ``[]``, and any keys other than these three are dropped.

    Returns a normalized dict with keys: title, headers, rows — or None when
    the input is None, cannot be parsed/dumped, is not a mapping, or has
    ``headers`` / ``rows`` that are not lists.
    """
    if obj is None:
        return None

    # JSON string from VLM
    if isinstance(obj, str):
        try:
            obj = json.loads(obj)
        except (ValueError, RecursionError):
            # Not valid JSON (JSONDecodeError is a ValueError) or nested too deeply.
            return None

    # Pydantic model: dump to a dict, then validate it like any other dict so
    # the documented title/headers/rows contract holds for every input form.
    if isinstance(obj, BaseModel):
        obj = _dump_model(obj)

    # Plain dict
    if isinstance(obj, dict):
        title = obj.get("title") or "Untitled"
        headers = obj.get("headers") or []
        rows = obj.get("rows") or []
        # Basic shape checks
        if not isinstance(headers, list) or not isinstance(rows, list):
            return None
        return {"title": title, "headers": headers, "rows": rows}

    return None


def _dump_model(model: Any) -> Any:
    """Return the model's dict dump (pydantic v2 ``model_dump``, else v1 ``dict``), or None."""
    for method_name in ("model_dump", "dict"):
        dump = getattr(model, method_name, None)
        if not callable(dump):
            continue
        try:
            return dump()
        except Exception:
            # Best-effort: fall through to the next dump method, if any.
            continue
    return None
doctra/version.py ADDED
@@ -0,0 +1,2 @@
1
+ """Version information for Doctra."""
2
# Package version string.
__version__ = "0.1.0"