doctra 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. doctra/__init__.py +19 -0
  2. doctra/cli/__init__.py +27 -0
  3. doctra/cli/main.py +856 -0
  4. doctra/cli/utils.py +340 -0
  5. doctra/engines/__init__.py +0 -0
  6. doctra/engines/layout/__init__.py +0 -0
  7. doctra/engines/layout/layout_models.py +90 -0
  8. doctra/engines/layout/paddle_layout.py +225 -0
  9. doctra/engines/ocr/__init__.py +4 -0
  10. doctra/engines/ocr/api.py +36 -0
  11. doctra/engines/ocr/path_resolver.py +48 -0
  12. doctra/engines/ocr/pytesseract_engine.py +76 -0
  13. doctra/engines/vlm/__init__.py +0 -0
  14. doctra/engines/vlm/outlines_types.py +31 -0
  15. doctra/engines/vlm/provider.py +58 -0
  16. doctra/engines/vlm/service.py +117 -0
  17. doctra/exporters/__init__.py +0 -0
  18. doctra/exporters/excel_writer.py +197 -0
  19. doctra/exporters/image_saver.py +42 -0
  20. doctra/exporters/markdown_table.py +56 -0
  21. doctra/exporters/markdown_writer.py +29 -0
  22. doctra/parsers/__init__.py +6 -0
  23. doctra/parsers/layout_order.py +16 -0
  24. doctra/parsers/structured_pdf_parser.py +434 -0
  25. doctra/parsers/table_chart_extractor.py +283 -0
  26. doctra/utils/__init__.py +0 -0
  27. doctra/utils/bbox.py +18 -0
  28. doctra/utils/constants.py +8 -0
  29. doctra/utils/file_ops.py +26 -0
  30. doctra/utils/io_utils.py +10 -0
  31. doctra/utils/ocr_utils.py +20 -0
  32. doctra/utils/pdf_io.py +19 -0
  33. doctra/utils/quiet.py +13 -0
  34. doctra/utils/structured_utils.py +49 -0
  35. doctra/version.py +2 -0
  36. doctra-0.1.0.dist-info/METADATA +626 -0
  37. doctra-0.1.0.dist-info/RECORD +40 -0
  38. doctra-0.1.0.dist-info/WHEEL +5 -0
  39. doctra-0.1.0.dist-info/licenses/LICENSE +201 -0
  40. doctra-0.1.0.dist-info/top_level.txt +1 -0
doctra/cli/utils.py ADDED
@@ -0,0 +1,340 @@
1
+ """
2
+ CLI utilities for the Doctra command line interface.
3
+
4
+ This module contains shared utilities and helper functions used across
5
+ different CLI commands.
6
+ """
7
+
8
+ import click
9
+ import sys
10
+ from typing import Optional, Dict, Any
11
+ from pathlib import Path
12
+
13
+
14
def validate_vlm_config(use_vlm: bool, vlm_api_key: Optional[str]) -> None:
    """
    Abort the CLI when VLM processing is requested without an API key.

    Nothing happens when VLM is disabled or a non-empty key is supplied.
    Otherwise three hint lines are written to stderr and the process
    exits with status 1.

    :param use_vlm: Whether VLM processing is enabled
    :param vlm_api_key: The VLM API key (may be None when VLM is disabled)
    :return: None
    :raises SystemExit: If VLM is enabled but the key is missing or empty
    """
    # Guard clause: the configuration is valid, nothing to report.
    if not use_vlm or vlm_api_key:
        return

    hints = (
        "❌ Error: VLM API key is required when using --use-vlm",
        " Set the VLM_API_KEY environment variable or use --vlm-api-key",
        " Example: export VLM_API_KEY=your_api_key",
    )
    for line in hints:
        click.echo(line, err=True)
    sys.exit(1)
31
+
32
+
33
def handle_keyboard_interrupt() -> None:
    """
    React to a Ctrl+C by informing the user and terminating.

    A short notice is written to stderr, then the process exits with
    status 130, the exit code conventionally used for an interrupt.

    :return: None
    :raises SystemExit: Always, with code 130
    """
    notice = "\n⚠️ Operation interrupted by user"
    click.echo(notice, err=True)
    raise SystemExit(130)
45
+
46
+
47
def handle_exception(e: Exception, verbose: bool = False) -> None:
    """
    Report an exception to the user and terminate the CLI.

    The exception message is always written to stderr. In verbose mode
    the full traceback recorded on the exception object is printed too.

    :param e: The exception that occurred
    :param verbose: Whether to show the full traceback
    :return: None
    :raises SystemExit: Always exits with code 1
    """
    click.echo(f"❌ Error: {e}", err=True)
    if verbose:
        import traceback

        # Format the traceback attached to *e* itself. traceback.format_exc()
        # only works while an exception is actively being handled; when this
        # helper is called outside the ``except`` block it would print
        # "NoneType: None" instead of the real stack.
        details = "".join(traceback.format_exception(type(e), e, e.__traceback__))
        click.echo(details, err=True)
    sys.exit(1)
64
+
65
+
66
def validate_pdf_path(pdf_path: Path) -> None:
    """
    Validate that the PDF path exists and points at a regular file.

    A missing path or a non-file path (e.g. a directory) is fatal. A file
    whose extension is not ``.pdf`` (compared case-insensitively) only
    produces a warning and validation still succeeds.

    :param pdf_path: Path to the PDF file to validate
    :return: None
    :raises SystemExit: If the path doesn't exist or is not a file (code 1)
    """
    if not pdf_path.exists():
        click.echo(f"❌ Error: PDF file not found: {pdf_path}", err=True)
        sys.exit(1)

    if not pdf_path.is_file():
        click.echo(f"❌ Error: Path is not a file: {pdf_path}", err=True)
        sys.exit(1)

    if pdf_path.suffix.lower() != '.pdf':
        # Diagnostics go to stderr like every other message in this module,
        # so the warning never ends up in piped/redirected stdout output.
        click.echo(f"⚠️ Warning: File does not have .pdf extension: {pdf_path}", err=True)
87
+
88
+
89
def format_file_size(size_bytes: int) -> str:
    """
    Format a byte count in human readable form.

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit is reached, then rendered with one decimal place. Zero is
    special-cased as ``"0 B"``.

    :param size_bytes: Size in bytes to format (negative values keep their sign)
    :return: Formatted size string (e.g., "1.5 MB", "2.3 GB", "1.0 TB")
    """
    if size_bytes == 0:
        return "0 B"

    # TB/PB included so multi-terabyte values don't render as "2048.0 GB".
    units = ("B", "KB", "MB", "GB", "TB", "PB")
    unit_index = 0
    size = float(size_bytes)

    # Compare on magnitude so negative sizes (e.g. deltas) are scaled too.
    while abs(size) >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1

    return f"{size:.1f} {units[unit_index]}"
111
+
112
+
113
def get_file_info(file_path: Path) -> Dict[str, Any]:
    """
    Get basic file information.

    Reads the path's stat record once and derives name, size,
    modification time and type information from it.

    :param file_path: Path to the file to get information for
    :return: Dictionary containing file information with keys:
             - name: File name
             - size: Size in bytes
             - size_formatted: Human-readable size
             - modified: Modification timestamp (``st_mtime``)
             - is_file: Whether it's a file
             - is_dir: Whether it's a directory
             - extension: File extension (lowercase)
             Returns empty dict if file doesn't exist
    """
    # Ask for the stat record directly instead of exists()-then-stat():
    # the file may disappear between the two calls, which would turn the
    # documented "empty dict" result into an unexpected exception.
    try:
        stat = file_path.stat()
    except (FileNotFoundError, NotADirectoryError):
        return {}

    return {
        'name': file_path.name,
        'size': stat.st_size,
        'size_formatted': format_file_size(stat.st_size),
        'modified': stat.st_mtime,
        'is_file': file_path.is_file(),
        'is_dir': file_path.is_dir(),
        'extension': file_path.suffix.lower(),
    }
144
+
145
+
146
def print_processing_summary(
    input_file: Path,
    output_dir: Path,
    processing_time: Optional[float] = None,
    elements_processed: Optional[int] = None,
    use_vlm: bool = False
) -> None:
    """
    Echo a short report describing a finished processing run.

    The report starts with a banner, then lists (when available) the
    input file name and size, the output directory, the number of
    processed elements and the elapsed time, and always ends with the
    VLM status line.

    :param input_file: Input PDF file path
    :param output_dir: Output directory path (only shown if it exists)
    :param processing_time: Time taken for processing in seconds
    :param elements_processed: Number of elements processed
    :param use_vlm: Whether VLM was used during processing
    :return: None
    """
    rule = "=" * 50
    click.echo("\n" + rule)
    click.echo("📊 Processing Summary")
    click.echo(rule)

    # Input section: skipped entirely when the file cannot be described.
    info = get_file_info(input_file)
    if info:
        click.echo(f"Input file: {info['name']}")
        click.echo(f"File size: {info['size_formatted']}")

    # Output section
    if output_dir.exists():
        click.echo(f"Output: {output_dir}")

    # Optional run statistics
    if elements_processed is not None:
        click.echo(f"Elements: {elements_processed} processed")
    if processing_time is not None:
        click.echo(f"Time: {processing_time:.1f} seconds")

    vlm_status = "✅ Enabled" if use_vlm else "❌ Disabled"
    click.echo(f"VLM: {vlm_status}")
192
+
193
+
194
def check_dependencies() -> Dict[str, bool]:
    """
    Check if required dependencies are available.

    Tries to import each core and optional dependency used by the Doctra
    library and records whether the import succeeded.

    :return: Dictionary mapping dependency names to availability status:
             - PIL: Pillow for image processing
             - paddle: PaddlePaddle for layout detection
             - pytesseract: Tesseract OCR wrapper
             - tqdm: Progress bar library
             - click: CLI framework
             - google.generativeai: Gemini VLM support
             - openai: OpenAI VLM support
    """
    module_names = (
        'PIL',
        'paddle',
        'pytesseract',
        'tqdm',
        'click',
        'google.generativeai',
        'openai',
    )

    dependencies: Dict[str, bool] = {}
    for dep in module_names:
        try:
            __import__(dep)
        except Exception:
            # Deliberately broad: a half-installed package can fail at import
            # time with OSError/RuntimeError/AttributeError (e.g. a missing
            # shared library), not just ImportError. For a probe that simply
            # means "not usable" and must not crash the check itself.
            dependencies[dep] = False
        else:
            dependencies[dep] = True

    return dependencies
228
+
229
+
230
def estimate_processing_time(
    num_pages: int,
    num_charts: int = 0,
    num_tables: int = 0,
    use_vlm: bool = False
) -> int:
    """
    Give a rough processing-time estimate for a document.

    The estimate charges 2 seconds per page, 1 second per chart or table,
    and a further 3 seconds per chart or table when VLM processing is on.

    :param num_pages: Number of pages in the document
    :param num_charts: Number of charts detected in the document
    :param num_tables: Number of tables detected in the document
    :param use_vlm: Whether VLM processing will be used
    :return: Estimated processing time in seconds
    """
    visual_elements = num_charts + num_tables

    # Layout detection + OCR for every page
    estimate = num_pages * 2
    # Extraction overhead for each chart/table
    estimate += visual_elements * 1
    # Extra round-trip per chart/table when a VLM is consulted
    estimate += visual_elements * 3 if use_vlm else 0

    return estimate
260
+
261
+
262
def create_progress_callback(description: str, total: int):
    """
    Build a progress-reporting callback backed by a tqdm bar.

    A tqdm progress bar is created immediately. The returned function
    sets the bar to an absolute completed count on each call and closes
    the bar once that count reaches ``total``.

    :param description: Description text for the progress bar
    :param total: Total number of items to process
    :return: Callable progress callback function that takes an integer
             representing the number of completed items
    """
    from tqdm import tqdm

    bar = tqdm(desc=description, total=total, leave=True)

    def callback(completed: int):
        # The caller reports an absolute count, so position the bar
        # directly instead of incrementing it.
        bar.n = completed
        bar.refresh()
        if completed >= total:
            bar.close()

    return callback
285
+
286
+
287
def safe_create_directory(path: Path, parents: bool = True) -> bool:
    """
    Create a directory without letting failures propagate.

    An already existing directory counts as success. Any failure is
    reported on stderr (with a dedicated message for permission
    problems) and signalled through the return value.

    :param path: Directory path to create
    :param parents: Whether to create parent directories if they don't exist
    :return: True if directory was created successfully, False otherwise
    """
    try:
        path.mkdir(parents=parents, exist_ok=True)
    except Exception as e:
        if isinstance(e, PermissionError):
            message = f"❌ Permission denied creating directory: {path}"
        else:
            message = f"❌ Error creating directory {path}: {e}"
        click.echo(message, err=True)
        return False
    return True
307
+
308
+
309
def get_output_recommendations(element_counts: Dict[str, int]) -> str:
    """
    Suggest Doctra commands that fit the detected document elements.

    Charts and/or tables lead to an ``extract`` suggestion plus a hint
    about ``--use-vlm``; text or figures lead to a ``parse`` suggestion.
    Missing element types are treated as a count of zero.

    :param element_counts: Dictionary mapping element types to their counts
                           (e.g., {'chart': 5, 'table': 3, 'text': 100})
    :return: Formatted string with command recommendations for the user
    """
    charts, tables, text, figures = (
        element_counts.get(kind, 0) for kind in ('chart', 'table', 'text', 'figure')
    )
    has_charts = charts > 0
    has_tables = tables > 0

    lines = []

    # At most one "extract" suggestion, picking the most specific command.
    if has_charts and has_tables:
        lines.append(f"📊📋 doctra extract both document.pdf # {charts} charts, {tables} tables")
    elif has_charts:
        lines.append(f"📊 doctra extract charts document.pdf # {charts} charts")
    elif has_tables:
        lines.append(f"📋 doctra extract tables document.pdf # {tables} tables")

    if text > 0 or figures > 0:
        lines.append("📄 doctra parse document.pdf # Full document with text")

    if has_charts or has_tables:
        lines.append("💡 Add --use-vlm for structured data extraction")

    if not lines:
        return "No specific recommendations"
    return "\n ".join(lines)
File without changes
File without changes
@@ -0,0 +1,90 @@
1
+ from dataclasses import dataclass, asdict
2
+ from typing import List
3
+
4
+
5
@dataclass
class LayoutBox:
    """
    Single detected block on a page.

    Represents a detected layout element (text, table, chart, figure, etc.)
    with both absolute and normalized coordinates for flexibility in processing.

    :param label: Type of layout element (e.g., 'text', 'table', 'chart', 'figure')
    :param score: Confidence score of the detection (0.0 to 1.0)
    :param x1: Left coordinate in absolute pixels
    :param y1: Top coordinate in absolute pixels
    :param x2: Right coordinate in absolute pixels
    :param y2: Bottom coordinate in absolute pixels
    :param nx1: Left coordinate normalized to [0,1] range
    :param ny1: Top coordinate normalized to [0,1] range
    :param nx2: Right coordinate normalized to [0,1] range
    :param ny2: Bottom coordinate normalized to [0,1] range
    """
    label: str
    score: float
    x1: float
    y1: float
    x2: float
    y2: float
    nx1: float  # normalized [0,1]
    ny1: float
    nx2: float
    ny2: float

    @classmethod
    def from_absolute(cls, label: str, score: float, coord: List[float], img_w: int, img_h: int) -> "LayoutBox":
        """
        Create a LayoutBox from absolute coordinates.

        Stores the absolute pixel coordinates and derives the normalized
        ones by dividing x values by ``img_w`` and y values by ``img_h``.

        :param label: Type of layout element (e.g., 'text', 'table', 'chart')
        :param score: Confidence score of the detection (0.0 to 1.0)
        :param coord: List of coordinates [x1, y1, x2, y2] in absolute pixels
        :param img_w: Width of the source image in pixels
        :param img_h: Height of the source image in pixels
        :return: Instance of the class the method is called on, with both
                 absolute and normalized coordinates
        :raises ValueError: If ``coord`` does not contain exactly four values
        :raises ZeroDivisionError: If ``img_w`` or ``img_h`` is zero
        """
        # Coerce to built-in floats: the fields are annotated as float, and
        # int or NumPy-scalar inputs (presumably what the detector emits —
        # verify against caller) would otherwise leak into asdict()/JSON output.
        x1, y1, x2, y2 = (float(c) for c in coord)
        # ``cls`` rather than a hard-coded class so subclasses get their own type.
        return cls(
            label=label,
            score=score,
            x1=x1, y1=y1, x2=x2, y2=y2,
            nx1=x1 / img_w, ny1=y1 / img_h, nx2=x2 / img_w, ny2=y2 / img_h,
        )
57
+
58
+
59
@dataclass
class LayoutPage:
    """
    Layout detection result for one page of a document.

    Bundles the page's position in the document, its pixel dimensions and
    every layout box detected on it.

    :param page_index: 1-based page index within the document
    :param width: Width of the page in pixels
    :param height: Height of the page in pixels
    :param boxes: List of detected layout elements on this page
    """
    page_index: int  # 1-based
    width: int
    height: int
    boxes: List[LayoutBox]

    def to_dict(self) -> dict:
        """
        Return a plain-dict view of this page.

        Each box is converted with ``dataclasses.asdict`` so the result
        contains only dicts and lists, e.g. for JSON serialization.

        :return: Dictionary representation of the page with all boxes serialized
        """
        serialized_boxes = list(map(asdict, self.boxes))
        return dict(
            page_index=self.page_index,
            width=self.width,
            height=self.height,
            boxes=serialized_boxes,
        )
@@ -0,0 +1,225 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import sys
5
+ import json
6
+ import tempfile
7
+ import logging
8
+ from dataclasses import dataclass, asdict
9
+ from typing import Dict, List, Any, Tuple, Optional
10
+ from tqdm import tqdm
11
+
12
+ from PIL import Image
13
+ from paddleocr import LayoutDetection # pip install paddleocr>=2.7.0.3
14
+ from doctra.utils.pdf_io import render_pdf_to_images
15
+ from doctra.engines.layout.layout_models import LayoutBox, LayoutPage
16
+ from doctra.utils.quiet import suppress_output
17
+
18
+
19
class PaddleLayoutEngine:
    """
    Thin wrapper around PaddleOCR LayoutDetection to support:
      - Multi-page PDF inputs
      - Batch prediction on page images
      - Clean, page-indexed output with absolute and normalized coords

    Provides a high-level interface for document layout detection using
    PaddleOCR's layout detection models with enhanced output formatting
    and multi-page PDF support.
    """

    def __init__(self, model_name: str = "PP-DocLayout_plus-L"):
        """
        Initialize the PaddleLayoutEngine with a specific model.

        The model is loaded lazily on first use to avoid unnecessary
        initialization overhead.

        :param model_name: Name of the PaddleOCR layout detection model to use
                           (default: "PP-DocLayout_plus-L")
        """
        self.model_name = model_name
        self.model: Optional[LayoutDetection] = None

    def _ensure_model(self) -> None:
        """
        Ensure the PaddleOCR model is loaded and ready for inference.

        Loads the model on first call. While loading, tqdm is patched to be
        silent, several loggers are raised to CRITICAL and a few environment
        variables are set to reduce console noise; all of that is restored
        afterwards, even if loading fails.

        :return: None
        """
        if self.model is not None:
            return

        # Single friendly progress line shown instead of the library's own output
        with tqdm(total=1, desc=f'Loading PaddleOCR layout model: "{self.model_name}"', leave=True) as bar:
            # Keep the real tqdm methods so they can be restored afterwards
            original_tqdm_init = tqdm.__init__
            original_tqdm_update = tqdm.update
            original_tqdm_close = tqdm.close

            def silent_init(instance, *args, **kwargs):
                # Make every tqdm instance created during loading silent
                kwargs['disable'] = True
                original_tqdm_init(instance, *args, **kwargs)

            def silent_update(instance, *args, **kwargs):
                pass  # Do nothing

            def silent_close(instance, *args, **kwargs):
                pass  # Do nothing

            loggers_to_silence = ['ppocr', 'paddle', 'PIL', 'urllib3', 'requests']
            original_levels = {}

            root_logger = logging.getLogger()
            original_root_level = root_logger.level

            # Environment variables that might help silence PaddlePaddle.
            # glog's severity filter is GLOG_minloglevel (3 = FATAL only);
            # GLOG_v is the *verbose* level, where higher means more output.
            env_vars_to_set = {
                'FLAGS_print_model_stats': '0',
                'FLAGS_enable_parallel_graph': '0',
                'GLOG_minloglevel': '3',
                'GLOG_v': '0',
                'GLOG_logtostderr': '0',
                'GLOG_alsologtostderr': '0',
            }
            old_env = {}

            try:
                # All global mutations happen inside the try block so the
                # finally clause undoes whatever was changed, whichever step fails.
                for logger_name in loggers_to_silence:
                    logger = logging.getLogger(logger_name)
                    original_levels[logger_name] = logger.level
                    logger.setLevel(logging.CRITICAL)
                root_logger.setLevel(logging.CRITICAL)

                for key, value in env_vars_to_set.items():
                    old_env[key] = os.environ.get(key)
                    os.environ[key] = value

                # Monkey patch tqdm
                tqdm.__init__ = silent_init
                tqdm.update = silent_update
                tqdm.close = silent_close

                # Silence Paddle's download/init noise
                with suppress_output():
                    self.model = LayoutDetection(model_name=self.model_name)

            finally:
                # Restore tqdm methods
                tqdm.__init__ = original_tqdm_init
                tqdm.update = original_tqdm_update
                tqdm.close = original_tqdm_close

                # Restore logging levels
                for logger_name, level in original_levels.items():
                    logging.getLogger(logger_name).setLevel(level)
                root_logger.setLevel(original_root_level)

                # Restore environment variables (remove the ones we introduced)
                for key, old_value in old_env.items():
                    if old_value is None:
                        os.environ.pop(key, None)
                    else:
                        os.environ[key] = old_value

            bar.update(1)

    def predict_pdf(
        self,
        pdf_path: str,
        batch_size: int = 1,
        layout_nms: bool = True,
        dpi: int = 200,
        min_score: float = 0.0,
        keep_temp_files: bool = False,
    ) -> List[LayoutPage]:
        """
        Run layout detection on every page of a PDF.

        Renders the PDF to page images, writes them to a temporary
        directory as JPEGs, runs the layout model on those files and
        converts the detections into LayoutPage/LayoutBox objects.

        :param pdf_path: Path to the input PDF file
        :param batch_size: Batch size for Paddle inference (default: 1)
        :param layout_nms: Whether to apply layout NMS in Paddle (default: True)
        :param dpi: Rendering DPI passed to the PDF renderer (default: 200)
        :param min_score: Filter out detections below this confidence threshold (default: 0.0)
        :param keep_temp_files: If True, move the intermediate JPGs into a
                                ``_doctra_layout_<pid>`` directory next to the PDF (default: False)
        :return: List of LayoutPage objects in 1-based page_index order
                 (empty if the PDF rendered no pages)
        """
        import shutil

        self._ensure_model()
        pil_pages: List[Tuple[Image.Image, int, int]] = render_pdf_to_images(pdf_path, dpi=dpi)
        if not pil_pages:
            return []

        # Write pages to a temp dir because LayoutDetection expects image paths.
        with tempfile.TemporaryDirectory(prefix="doctra_layout_") as tmpdir:
            img_paths: List[str] = []
            sizes: List[Tuple[int, int]] = []
            for i, (im, w, h) in enumerate(pil_pages, start=1):
                out_path = os.path.join(tmpdir, f"page_{i:04d}.jpg")
                im.save(out_path, format="JPEG", quality=95)
                img_paths.append(out_path)
                sizes.append((w, h))

            # PaddleOCR allows list input; results align with img_paths order.
            raw_outputs = self.model.predict(
                img_paths, batch_size=batch_size, layout_nms=layout_nms
            )

            pages: List[LayoutPage] = []
            for idx, (raw, (w, h)) in enumerate(zip(raw_outputs, sizes), start=1):
                boxes: List[LayoutBox] = []
                for det in raw.get("boxes", []):
                    score = float(det.get("score", 0.0))
                    if score < min_score:
                        continue
                    label = str(det.get("label", "unknown"))
                    # Coerce to built-in floats so the boxes stay JSON
                    # serializable even if the detector returns NumPy
                    # scalars (presumably the case — verify against PaddleOCR).
                    coord = [float(c) for c in det.get("coordinate", [0, 0, 0, 0])]
                    boxes.append(LayoutBox.from_absolute(label=label, score=score, coord=coord, img_w=w, img_h=h))
                pages.append(LayoutPage(page_index=idx, width=w, height=h, boxes=boxes))

            # Optionally keep rendered images for inspection
            if keep_temp_files:
                debug_dir = os.path.join(os.path.dirname(pdf_path), f"_doctra_layout_{os.getpid()}")
                os.makedirs(debug_dir, exist_ok=True)
                for p in img_paths:
                    # shutil.move falls back to copy + delete when the temp
                    # dir and the PDF live on different filesystems, where a
                    # plain os.replace would raise OSError.
                    shutil.move(p, os.path.join(debug_dir, os.path.basename(p)))

        return pages

    # Convenience helpers
    def predict_pdf_as_dicts(self, pdf_path: str, **kwargs) -> List[Dict[str, Any]]:
        """
        Same as predict_pdf, but returns plain dicts for easy JSON serialization.

        Each LayoutPage returned by predict_pdf is converted with its
        ``to_dict`` method.

        :param pdf_path: Path to the input PDF file
        :param kwargs: Additional arguments passed to predict_pdf
        :return: List of dictionaries representing the layout pages
        """
        return [p.to_dict() for p in self.predict_pdf(pdf_path, **kwargs)]

    def save_jsonl(self, pages: List[LayoutPage], out_path: str) -> None:
        """
        Save detections to a JSONL file (one page per line).

        The file is opened in write mode (overwriting existing content)
        with UTF-8 encoding; non-ASCII characters are written unescaped.

        :param pages: List of LayoutPage objects to save
        :param out_path: Output file path for the JSONL file
        :return: None
        """
        with open(out_path, "w", encoding="utf-8") as f:
            for p in pages:
                f.write(json.dumps(p.to_dict(), ensure_ascii=False) + "\n")
@@ -0,0 +1,4 @@
1
+ from .pytesseract_engine import PytesseractOCREngine
2
+ from .api import ocr_image
3
+
4
+ __all__ = ["PytesseractOCREngine", "ocr_image"]
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional
4
+ from PIL import Image
5
+
6
+ from .pytesseract_engine import PytesseractOCREngine
7
+
8
+
9
def ocr_image(
    cropped_pil: Image.Image,
    *,
    lang: str = "eng",
    psm: int = 4,
    oem: int = 3,
    extra_config: str = "",
    tesseract_cmd: Optional[str] = None,
) -> str:
    """
    Run OCR on a single cropped PIL image and return the recognized text.

    Shortcut for callers that do not want to manage an engine object: a
    fresh PytesseractOCREngine is built from the keyword options and its
    ``recognize`` method is applied to the image.

    :param cropped_pil: PIL Image object to perform OCR on
    :param lang: OCR language code (default: "eng")
    :param psm: Tesseract page segmentation mode (default: 4)
    :param oem: Tesseract OCR engine mode (default: 3)
    :param extra_config: Additional Tesseract configuration string (default: "")
    :param tesseract_cmd: Optional path to tesseract executable (default: None)
    :return: Extracted text string from the image
    """
    return PytesseractOCREngine(
        tesseract_cmd=tesseract_cmd,
        lang=lang,
        psm=psm,
        oem=oem,
        extra_config=extra_config,
    ).recognize(cropped_pil)