doctra 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctra/__init__.py +19 -0
- doctra/cli/__init__.py +27 -0
- doctra/cli/main.py +856 -0
- doctra/cli/utils.py +340 -0
- doctra/engines/__init__.py +0 -0
- doctra/engines/layout/__init__.py +0 -0
- doctra/engines/layout/layout_models.py +90 -0
- doctra/engines/layout/paddle_layout.py +225 -0
- doctra/engines/ocr/__init__.py +4 -0
- doctra/engines/ocr/api.py +36 -0
- doctra/engines/ocr/path_resolver.py +48 -0
- doctra/engines/ocr/pytesseract_engine.py +76 -0
- doctra/engines/vlm/__init__.py +0 -0
- doctra/engines/vlm/outlines_types.py +31 -0
- doctra/engines/vlm/provider.py +58 -0
- doctra/engines/vlm/service.py +117 -0
- doctra/exporters/__init__.py +0 -0
- doctra/exporters/excel_writer.py +197 -0
- doctra/exporters/image_saver.py +42 -0
- doctra/exporters/markdown_table.py +56 -0
- doctra/exporters/markdown_writer.py +29 -0
- doctra/parsers/__init__.py +6 -0
- doctra/parsers/layout_order.py +16 -0
- doctra/parsers/structured_pdf_parser.py +434 -0
- doctra/parsers/table_chart_extractor.py +283 -0
- doctra/utils/__init__.py +0 -0
- doctra/utils/bbox.py +18 -0
- doctra/utils/constants.py +8 -0
- doctra/utils/file_ops.py +26 -0
- doctra/utils/io_utils.py +10 -0
- doctra/utils/ocr_utils.py +20 -0
- doctra/utils/pdf_io.py +19 -0
- doctra/utils/quiet.py +13 -0
- doctra/utils/structured_utils.py +49 -0
- doctra/version.py +2 -0
- doctra-0.1.0.dist-info/METADATA +626 -0
- doctra-0.1.0.dist-info/RECORD +40 -0
- doctra-0.1.0.dist-info/WHEEL +5 -0
- doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
- doctra-0.1.0.dist-info/top_level.txt +1 -0
doctra/cli/utils.py
ADDED
@@ -0,0 +1,340 @@
|
|
1
|
+
"""
|
2
|
+
CLI utilities for the Doctra command line interface.
|
3
|
+
|
4
|
+
This module contains shared utilities and helper functions used across
|
5
|
+
different CLI commands.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import click
|
9
|
+
import sys
|
10
|
+
from typing import Optional, Dict, Any
|
11
|
+
from pathlib import Path
|
12
|
+
|
13
|
+
|
14
|
+
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
    """
    Abort the CLI when VLM processing is requested without an API key.

    Nothing happens when VLM is disabled or a non-empty key is supplied.
    Otherwise three guidance lines are written to stderr and the process
    exits with status 1.

    :param use_vlm: Whether VLM processing is enabled
    :param vlm_api_key: The VLM API key (None or an empty string counts as missing)
    :return: None
    :raises SystemExit: If VLM is enabled but no API key is provided
    """
    # Guard clause: the configuration is valid, nothing to report.
    if not use_vlm or vlm_api_key:
        return

    guidance = (
        "❌ Error: VLM API key is required when using --use-vlm",
        " Set the VLM_API_KEY environment variable or use --vlm-api-key",
        " Example: export VLM_API_KEY=your_api_key",
    )
    for line in guidance:
        click.echo(line, err=True)
    sys.exit(1)
|
31
|
+
|
32
|
+
|
33
|
+
def handle_keyboard_interrupt() -> None:
    """
    React to a Ctrl+C by informing the user and terminating.

    Writes a short notice to stderr, then exits with status 130.

    :return: None
    :raises SystemExit: Always exits with code 130
    """
    interrupt_exit_code = 130
    click.echo("\n⚠️ Operation interrupted by user", err=True)
    sys.exit(interrupt_exit_code)
|
45
|
+
|
46
|
+
|
47
|
+
def handle_exception(e: Exception, verbose: bool = False) -> None:
    """
    Report an exception on stderr and terminate the CLI.

    The exception message is always printed. In verbose mode the traceback
    attached to ``e`` is printed as well.

    :param e: The exception that occurred
    :param verbose: Whether to show the full traceback
    :return: None
    :raises SystemExit: Always exits with code 1
    """
    click.echo(f"❌ Error: {e}", err=True)
    if verbose:
        import traceback

        # Format the exception that was passed in rather than "whatever is
        # currently being handled": format_exc() yields "NoneType: None" when
        # this function is called outside an ``except`` block.
        details = "".join(traceback.format_exception(type(e), e, e.__traceback__))
        click.echo(details, err=True)
    sys.exit(1)
|
64
|
+
|
65
|
+
|
66
|
+
def validate_pdf_path(pdf_path: Path) -> None:
    """
    Ensure the given path points at an existing regular file.

    A missing path or a non-file path is reported on stderr and ends the
    process with status 1. A file whose suffix is not ``.pdf`` (compared
    case-insensitively) only produces a warning.

    :param pdf_path: Path to the PDF file to validate
    :return: None
    :raises SystemExit: If file doesn't exist or is not a file
    """
    fatal_message = None
    if not pdf_path.exists():
        fatal_message = f"❌ Error: PDF file not found: {pdf_path}"
    elif not pdf_path.is_file():
        fatal_message = f"❌ Error: Path is not a file: {pdf_path}"

    if fatal_message is not None:
        click.echo(fatal_message, err=True)
        sys.exit(1)

    has_pdf_suffix = pdf_path.suffix.lower() == '.pdf'
    if not has_pdf_suffix:
        click.echo(f"⚠️ Warning: File does not have .pdf extension: {pdf_path}")
|
87
|
+
|
88
|
+
|
89
|
+
def format_file_size(size_bytes: int) -> str:
    """
    Render a byte count as a short human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (GB) is reached, then printed with one decimal place.
    A size of exactly zero is rendered as ``"0 B"``.

    :param size_bytes: Size in bytes to format
    :return: Formatted size string (e.g., "1.5 MB", "2.3 GB")
    """
    if size_bytes == 0:
        return "0 B"

    value = float(size_bytes)
    # Try each unit below the cap in turn; GB is the fallback for anything larger.
    for suffix in ("B", "KB", "MB"):
        if value < 1024:
            return f"{value:.1f} {suffix}"
        value /= 1024
    return f"{value:.1f} GB"
|
111
|
+
|
112
|
+
|
113
|
+
def get_file_info(file_path: Path) -> Dict[str, Any]:
    """
    Collect basic metadata about a filesystem path.

    :param file_path: Path to the file to get information for
    :return: Empty dict if the path doesn't exist, otherwise a dictionary
             with keys:
             - name: File name
             - size: Size in bytes
             - size_formatted: Human-readable size
             - modified: Modification timestamp (``st_mtime``)
             - is_file: Whether it's a file
             - is_dir: Whether it's a directory
             - extension: File extension (lowercase)
    """
    if not file_path.exists():
        return {}

    stats = file_path.stat()
    info: Dict[str, Any] = {}
    info['name'] = file_path.name
    info['size'] = stats.st_size
    info['size_formatted'] = format_file_size(stats.st_size)
    info['modified'] = stats.st_mtime
    info['is_file'] = file_path.is_file()
    info['is_dir'] = file_path.is_dir()
    info['extension'] = file_path.suffix.lower()
    return info
|
144
|
+
|
145
|
+
|
146
|
+
def print_processing_summary(
    input_file: Path,
    output_dir: Path,
    processing_time: Optional[float] = None,
    elements_processed: Optional[int] = None,
    use_vlm: bool = False
) -> None:
    """
    Echo a short report describing a finished processing run.

    The report is framed by a 50-character rule. Input file name and size
    are shown only when the input exists, the output directory only when
    it exists, and the element count / elapsed time only when supplied.
    The VLM status line is always printed.

    :param input_file: Input PDF file path
    :param output_dir: Output directory path
    :param processing_time: Time taken for processing in seconds
    :param elements_processed: Number of elements processed
    :param use_vlm: Whether VLM was used during processing
    :return: None
    """
    rule = "=" * 50
    click.echo("\n" + rule)
    click.echo("📊 Processing Summary")
    click.echo(rule)

    # Input section (skipped when the file is gone: get_file_info returns {})
    details = get_file_info(input_file)
    if details:
        click.echo(f"Input file: {details['name']}")
        click.echo(f"File size: {details['size_formatted']}")

    # Output section
    if output_dir.exists():
        click.echo(f"Output: {output_dir}")

    # Optional run statistics
    if elements_processed is not None:
        click.echo(f"Elements: {elements_processed} processed")
    if processing_time is not None:
        click.echo(f"Time: {processing_time:.1f} seconds")

    click.echo("VLM: ✅ Enabled" if use_vlm else "VLM: ❌ Disabled")
|
192
|
+
|
193
|
+
|
194
|
+
def check_dependencies() -> Dict[str, bool]:
    """
    Check if required dependencies are available.

    Attempts to import each core and optional dependency used by the
    Doctra library and records whether the import succeeded.

    :return: Dictionary mapping dependency names to availability status:
             - PIL: Pillow for image processing
             - paddle: PaddlePaddle for layout detection
             - pytesseract: Tesseract OCR wrapper
             - tqdm: Progress bar library
             - click: CLI framework
             - google.generativeai: Gemini VLM support
             - openai: OpenAI VLM support
    """
    module_names = (
        'PIL',
        'paddle',
        'pytesseract',
        'tqdm',
        'click',
        'google.generativeai',
        'openai',
    )

    dependencies: Dict[str, bool] = {}
    for module_name in module_names:
        try:
            __import__(module_name)
        except Exception:
            # Deliberately broad: this is a probe, and a broken installation
            # may fail at import time with something other than ImportError.
            # Such a package is reported as unavailable instead of crashing
            # the whole check.
            dependencies[module_name] = False
        else:
            dependencies[module_name] = True

    return dependencies
|
228
|
+
|
229
|
+
|
230
|
+
def estimate_processing_time(
    num_pages: int,
    num_charts: int = 0,
    num_tables: int = 0,
    use_vlm: bool = False
) -> int:
    """
    Produce a rough processing-time estimate for a document.

    The estimate is 2 units per page plus 1 unit per chart/table, with a
    further 3 units per chart/table when VLM processing is enabled.

    :param num_pages: Number of pages in the document
    :param num_charts: Number of charts detected in the document
    :param num_tables: Number of tables detected in the document
    :param use_vlm: Whether VLM processing will be used
    :return: Estimated processing time in seconds
    """
    visual_elements = num_charts + num_tables

    # Per-page cost (layout detection + OCR) plus per-element cost.
    estimate = num_pages * 2 + visual_elements * 1

    # VLM adds an extra cost for every chart and table.
    if use_vlm:
        estimate += visual_elements * 3

    return estimate
|
260
|
+
|
261
|
+
|
262
|
+
def create_progress_callback(description: str, total: int):
    """
    Build a callback that drives a tqdm progress bar.

    A tqdm bar is created immediately. The returned function sets the bar
    to an absolute completed count, redraws it, and closes it once the
    count reaches ``total``.

    :param description: Description text for the progress bar
    :param total: Total number of items to process
    :return: Callable progress callback function that takes an integer
             representing the number of completed items
    """
    from tqdm import tqdm

    progress_bar = tqdm(total=total, desc=description, leave=True)

    def callback(completed: int):
        # The caller reports an absolute position, not an increment.
        progress_bar.n = completed
        progress_bar.refresh()
        finished = completed >= total
        if finished:
            progress_bar.close()

    return callback
|
285
|
+
|
286
|
+
|
287
|
+
def safe_create_directory(path: Path, parents: bool = True) -> bool:
    """
    Create a directory, reporting failures instead of raising.

    An already existing directory counts as success. Any failure is
    echoed to stderr, with a dedicated message for permission errors.

    :param path: Directory path to create
    :param parents: Whether to create parent directories if they don't exist
    :return: True if directory was created successfully, False otherwise
    """
    try:
        path.mkdir(parents=parents, exist_ok=True)
    except PermissionError:
        click.echo(f"❌ Permission denied creating directory: {path}", err=True)
        return False
    except Exception as exc:
        click.echo(f"❌ Error creating directory {path}: {exc}", err=True)
        return False
    return True
|
307
|
+
|
308
|
+
|
309
|
+
def get_output_recommendations(element_counts: Dict[str, int]) -> str:
    """
    Suggest Doctra commands that fit the detected elements.

    Looks at the 'chart', 'table', 'text' and 'figure' counts (missing
    keys count as zero) and assembles one suggestion per applicable
    command, followed by a VLM hint when charts or tables are present.

    :param element_counts: Dictionary mapping element types to their counts
                           (e.g., {'chart': 5, 'table': 3, 'text': 100})
    :return: Formatted string with command recommendations for the user
    """
    n_charts = element_counts.get('chart', 0)
    n_tables = element_counts.get('table', 0)
    n_text = element_counts.get('text', 0)
    n_figures = element_counts.get('figure', 0)

    has_charts = n_charts > 0
    has_tables = n_tables > 0

    lines = []

    # At most one "extract" suggestion, picking the most specific command.
    if has_charts and has_tables:
        lines.append(f"📊📋 doctra extract both document.pdf # {n_charts} charts, {n_tables} tables")
    elif has_charts:
        lines.append(f"📊 doctra extract charts document.pdf # {n_charts} charts")
    elif has_tables:
        lines.append(f"📋 doctra extract tables document.pdf # {n_tables} tables")

    if n_text > 0 or n_figures > 0:
        lines.append("📄 doctra parse document.pdf # Full document with text")

    if has_charts or has_tables:
        lines.append("💡 Add --use-vlm for structured data extraction")

    if not lines:
        return "No specific recommendations"
    return "\n ".join(lines)
|
File without changes
|
File without changes
|
@@ -0,0 +1,90 @@
|
|
1
|
+
from dataclasses import dataclass, asdict
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
|
5
|
+
@dataclass
class LayoutBox:
    """
    Single detected block on a page.

    Represents a detected layout element (text, table, chart, figure, etc.)
    with both absolute and normalized coordinates for flexibility in processing.

    :param label: Type of layout element (e.g., 'text', 'table', 'chart', 'figure')
    :param score: Confidence score of the detection (0.0 to 1.0)
    :param x1: Left coordinate in absolute pixels
    :param y1: Top coordinate in absolute pixels
    :param x2: Right coordinate in absolute pixels
    :param y2: Bottom coordinate in absolute pixels
    :param nx1: Left coordinate normalized to [0,1] range
    :param ny1: Top coordinate normalized to [0,1] range
    :param nx2: Right coordinate normalized to [0,1] range
    :param ny2: Bottom coordinate normalized to [0,1] range
    """
    label: str
    score: float
    x1: float
    y1: float
    x2: float
    y2: float
    nx1: float  # normalized [0,1]
    ny1: float
    nx2: float
    ny2: float

    @staticmethod
    def from_absolute(label: str, score: float, coord: List[float], img_w: int, img_h: int) -> "LayoutBox":
        """
        Create a LayoutBox from absolute coordinates.

        Each coordinate is converted to a built-in ``float`` before it is
        stored, and the normalized values are obtained by dividing by the
        image width (x) or height (y).

        :param label: Type of layout element (e.g., 'text', 'table', 'chart')
        :param score: Confidence score of the detection (0.0 to 1.0)
        :param coord: Four coordinates [x1, y1, x2, y2] in absolute pixels
        :param img_w: Width of the source image in pixels
        :param img_h: Height of the source image in pixels
        :return: LayoutBox instance with both absolute and normalized coordinates
        :raises ValueError: If ``coord`` does not contain exactly four values
        :raises ZeroDivisionError: If ``img_w`` or ``img_h`` is zero
        """
        # Coerce to plain floats so the fields honour their declared type and
        # asdict()/json.dumps() work even when the detector hands back other
        # numeric types (ints, Fractions, array scalars, ...).
        x1, y1, x2, y2 = (float(value) for value in coord)
        return LayoutBox(
            label=label,
            score=score,
            x1=x1, y1=y1, x2=x2, y2=y2,
            nx1=x1 / img_w, ny1=y1 / img_h, nx2=x2 / img_w, ny2=y2 / img_h,
        )
|
57
|
+
|
58
|
+
|
59
|
+
@dataclass
class LayoutPage:
    """
    Detections for a single page.

    Bundles the page's position in the document, its pixel dimensions and
    every layout box detected on it.

    :param page_index: 1-based page index within the document
    :param width: Width of the page in pixels
    :param height: Height of the page in pixels
    :param boxes: List of detected layout elements on this page
    """
    page_index: int  # 1-based
    width: int
    height: int
    boxes: List[LayoutBox]

    def to_dict(self) -> dict:
        """
        Convert the LayoutPage to a dictionary representation.

        Every box is expanded with ``dataclasses.asdict`` so the result
        contains plain containers only.

        :return: Dictionary representation of the page with all boxes serialized
        """
        serialized_boxes = list(map(asdict, self.boxes))
        return dict(
            page_index=self.page_index,
            width=self.width,
            height=self.height,
            boxes=serialized_boxes,
        )
|
@@ -0,0 +1,225 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import os
|
4
|
+
import sys
|
5
|
+
import json
|
6
|
+
import tempfile
|
7
|
+
import logging
|
8
|
+
from dataclasses import dataclass, asdict
|
9
|
+
from typing import Dict, List, Any, Tuple, Optional
|
10
|
+
from tqdm import tqdm
|
11
|
+
|
12
|
+
from PIL import Image
|
13
|
+
from paddleocr import LayoutDetection # pip install paddleocr>=2.7.0.3
|
14
|
+
from doctra.utils.pdf_io import render_pdf_to_images
|
15
|
+
from doctra.engines.layout.layout_models import LayoutBox, LayoutPage
|
16
|
+
from doctra.utils.quiet import suppress_output
|
17
|
+
|
18
|
+
|
19
|
+
class PaddleLayoutEngine:
    """
    Thin wrapper around PaddleOCR LayoutDetection to support:
      - Multi-page PDF inputs
      - Batch prediction on page images
      - Clean, page-indexed output with absolute and normalized coords

    Provides a high-level interface for document layout detection using
    PaddleOCR's layout detection models with enhanced output formatting
    and multi-page PDF support.
    """

    def __init__(self, model_name: str = "PP-DocLayout_plus-L"):
        """
        Initialize the PaddleLayoutEngine with a specific model.

        The model is loaded lazily on first use to avoid unnecessary
        initialization overhead.

        :param model_name: Name of the PaddleOCR layout detection model to use
                           (default: "PP-DocLayout_plus-L")
        """
        self.model_name = model_name
        self.model: Optional[LayoutDetection] = None

    def _ensure_model(self) -> None:
        """
        Ensure the PaddleOCR model is loaded and ready for inference.

        Loads the model on first call. While loading, tqdm is patched to be
        silent, several loggers are raised to CRITICAL, and a set of
        environment variables is overridden; all of these are restored
        afterwards, even if loading fails.

        NOTE: the tqdm class and the root logger are patched process-wide for
        the duration of the load, so other threads are affected meanwhile.

        :return: None
        """
        if self.model is not None:
            return

        # Our own progress line. It is created before tqdm gets patched and
        # updated after the patch is reverted, so it still renders.
        with tqdm(total=1, desc=f'Loading PaddleOCR layout model: "{self.model_name}"', leave=True) as bar:
            # Keep the real tqdm methods so they can be restored later.
            original_tqdm_init = tqdm.__init__
            original_tqdm_update = tqdm.update
            original_tqdm_close = tqdm.close

            def silent_init(instance, *args, **kwargs):
                # Force every tqdm instance created during loading to be disabled.
                kwargs['disable'] = True
                original_tqdm_init(instance, *args, **kwargs)

            def silent_update(instance, *args, **kwargs):
                pass  # Do nothing

            def silent_close(instance, *args, **kwargs):
                pass  # Do nothing

            # State captured for restoration in the ``finally`` block.
            original_levels = {}
            loggers_to_silence = ['ppocr', 'paddle', 'PIL', 'urllib3', 'requests']
            root_logger = logging.getLogger()
            original_root_level = root_logger.level

            old_env = {}
            env_vars_to_set = {
                'FLAGS_print_model_stats': '0',
                'FLAGS_enable_parallel_graph': '0',
                # In glog, GLOG_v is the VLOG verbosity (higher means MORE
                # output) while GLOG_minloglevel=3 limits output to FATAL.
                # NOTE(review): these only help if the native library reads
                # them after this point -- confirm.
                'GLOG_v': '0',
                'GLOG_minloglevel': '3',
                'GLOG_logtostderr': '0',
                'GLOG_alsologtostderr': '0',
            }

            try:
                # Silence the named loggers and, temporarily, the root logger.
                for logger_name in loggers_to_silence:
                    logger = logging.getLogger(logger_name)
                    original_levels[logger_name] = logger.level
                    logger.setLevel(logging.CRITICAL)
                root_logger.setLevel(logging.CRITICAL)

                # Override environment variables, remembering previous values.
                for key, value in env_vars_to_set.items():
                    old_env[key] = os.environ.get(key)
                    os.environ[key] = value

                # Monkey patch tqdm
                tqdm.__init__ = silent_init
                tqdm.update = silent_update
                tqdm.close = silent_close

                # Silence Paddle's download/init noise
                with suppress_output():
                    self.model = LayoutDetection(model_name=self.model_name)

            finally:
                # Restore tqdm methods
                tqdm.__init__ = original_tqdm_init
                tqdm.update = original_tqdm_update
                tqdm.close = original_tqdm_close

                # Restore logging levels
                for logger_name, level in original_levels.items():
                    logging.getLogger(logger_name).setLevel(level)
                root_logger.setLevel(original_root_level)

                # Restore environment variables (unset those that were absent)
                for key, old_value in old_env.items():
                    if old_value is None:
                        os.environ.pop(key, None)
                    else:
                        os.environ[key] = old_value

            bar.update(1)

    def predict_pdf(
            self,
            pdf_path: str,
            batch_size: int = 1,
            layout_nms: bool = True,
            dpi: int = 200,
            min_score: float = 0.0,
            keep_temp_files: bool = False,
    ) -> List[LayoutPage]:
        """
        Run layout detection on every page of a PDF.

        Pages are rendered to images, written as JPEGs into a temporary
        directory, and passed to the layout model. Each detection is turned
        into a LayoutBox carrying absolute and normalized coordinates.

        :param pdf_path: Path to the input PDF file
        :param batch_size: Batch size for Paddle inference (default: 1)
        :param layout_nms: Whether to apply layout NMS in Paddle (default: True)
        :param dpi: Rendering DPI passed to render_pdf_to_images (default: 200)
        :param min_score: Filter out detections below this confidence threshold (default: 0.0)
        :param keep_temp_files: If True, copy the intermediate JPGs into a
                                ``_doctra_layout_<pid>`` directory next to the
                                PDF for debugging (default: False)
        :return: List of LayoutPage objects in 1-based page_index order
        """
        self._ensure_model()
        pil_pages: List[Tuple[Image.Image, int, int]] = render_pdf_to_images(pdf_path, dpi=dpi)
        if not pil_pages:
            return []

        # Write pages to a temp dir because LayoutDetection expects image paths.
        with tempfile.TemporaryDirectory(prefix="doctra_layout_") as tmpdir:
            img_paths: List[str] = []
            sizes: List[Tuple[int, int]] = []
            for i, (im, w, h) in enumerate(pil_pages, start=1):
                out_path = os.path.join(tmpdir, f"page_{i:04d}.jpg")
                im.save(out_path, format="JPEG", quality=95)
                img_paths.append(out_path)
                sizes.append((w, h))

            # PaddleOCR allows list input; this relies on results coming back
            # in img_paths order (per the original author's note).
            raw_outputs: List[Dict[str, Any]] = self.model.predict(
                img_paths, batch_size=batch_size, layout_nms=layout_nms
            )

            pages: List[LayoutPage] = []
            for idx, raw in enumerate(raw_outputs, start=1):
                w, h = sizes[idx - 1]
                boxes: List[LayoutBox] = []
                for det in raw.get("boxes", []):
                    score = float(det.get("score", 0.0))
                    if score < min_score:
                        continue
                    label = str(det.get("label", "unknown"))
                    coord = det.get("coordinate", [0, 0, 0, 0])
                    boxes.append(LayoutBox.from_absolute(label=label, score=score, coord=coord, img_w=w, img_h=h))
                pages.append(LayoutPage(page_index=idx, width=w, height=h, boxes=boxes))

            # Optionally keep rendered images for inspection
            if keep_temp_files:
                import shutil

                debug_dir = os.path.join(os.path.dirname(pdf_path), f"_doctra_layout_{os.getpid()}")
                os.makedirs(debug_dir, exist_ok=True)
                for p in img_paths:
                    # Copy instead of rename: the temp directory may live on a
                    # different filesystem than the PDF, where os.replace
                    # fails. The originals are removed with the temp dir.
                    shutil.copy2(p, os.path.join(debug_dir, os.path.basename(p)))

        return pages

    # Convenience helpers
    def predict_pdf_as_dicts(self, pdf_path: str, **kwargs) -> List[Dict[str, Any]]:
        """
        Same as predict_pdf, but returns plain dicts for easy JSON serialization.

        Convenience method that converts LayoutPage objects to dictionaries,
        making it easy to serialize results to JSON or other formats.

        :param pdf_path: Path to the input PDF file
        :param kwargs: Additional arguments passed to predict_pdf
        :return: List of dictionaries representing the layout pages
        """
        return [p.to_dict() for p in self.predict_pdf(pdf_path, **kwargs)]

    def save_jsonl(self, pages: List[LayoutPage], out_path: str) -> None:
        """
        Save detections to a JSONL file (one page per line).

        Each page is serialized with ``to_dict()`` and written as one UTF-8
        JSON line; an existing file at ``out_path`` is overwritten.

        :param pages: List of LayoutPage objects to save
        :param out_path: Output file path for the JSONL file
        :return: None
        """
        with open(out_path, "w", encoding="utf-8") as f:
            for p in pages:
                f.write(json.dumps(p.to_dict(), ensure_ascii=False) + "\n")
|
@@ -0,0 +1,36 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import Optional
|
4
|
+
from PIL import Image
|
5
|
+
|
6
|
+
from .pytesseract_engine import PytesseractOCREngine
|
7
|
+
|
8
|
+
|
9
|
+
def ocr_image(
        cropped_pil: Image.Image,
        *,
        lang: str = "eng",
        psm: int = 4,
        oem: int = 3,
        extra_config: str = "",
        tesseract_cmd: Optional[str] = None,
) -> str:
    """
    One-shot OCR: run pytesseract on a cropped PIL image and return text.

    Builds a throwaway PytesseractOCREngine from the keyword options and
    returns the result of its ``recognize`` call on the image, so callers
    don't have to manage an engine instance themselves.

    :param cropped_pil: PIL Image object to perform OCR on
    :param lang: OCR language code (default: "eng")
    :param psm: Tesseract page segmentation mode (default: 4)
    :param oem: Tesseract OCR engine mode (default: 3)
    :param extra_config: Additional Tesseract configuration string (default: "")
    :param tesseract_cmd: Optional path to tesseract executable (default: None)
    :return: Extracted text string from the image
    """
    return PytesseractOCREngine(
        tesseract_cmd=tesseract_cmd,
        lang=lang,
        psm=psm,
        oem=oem,
        extra_config=extra_config,
    ).recognize(cropped_pil)
|