natural-pdf 0.1.5__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. docs/finetuning/index.md +176 -0
  2. docs/ocr/index.md +34 -47
  3. docs/tutorials/01-loading-and-extraction.ipynb +34 -1536
  4. docs/tutorials/02-finding-elements.ipynb +42 -42
  5. docs/tutorials/03-extracting-blocks.ipynb +17 -17
  6. docs/tutorials/04-table-extraction.ipynb +12 -12
  7. docs/tutorials/05-excluding-content.ipynb +30 -30
  8. docs/tutorials/06-document-qa.ipynb +28 -28
  9. docs/tutorials/07-layout-analysis.ipynb +63 -35
  10. docs/tutorials/07-working-with-regions.ipynb +55 -51
  11. docs/tutorials/07-working-with-regions.md +2 -2
  12. docs/tutorials/08-spatial-navigation.ipynb +60 -60
  13. docs/tutorials/09-section-extraction.ipynb +113 -113
  14. docs/tutorials/10-form-field-extraction.ipynb +78 -50
  15. docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. docs/tutorials/12-ocr-integration.ipynb +149 -131
  17. docs/tutorials/12-ocr-integration.md +0 -13
  18. docs/tutorials/13-semantic-search.ipynb +313 -873
  19. natural_pdf/__init__.py +21 -22
  20. natural_pdf/analyzers/layout/gemini.py +280 -0
  21. natural_pdf/analyzers/layout/layout_manager.py +28 -1
  22. natural_pdf/analyzers/layout/layout_options.py +11 -0
  23. natural_pdf/analyzers/layout/yolo.py +6 -2
  24. natural_pdf/collections/pdf_collection.py +24 -0
  25. natural_pdf/core/element_manager.py +18 -13
  26. natural_pdf/core/page.py +174 -36
  27. natural_pdf/core/pdf.py +156 -42
  28. natural_pdf/elements/base.py +9 -17
  29. natural_pdf/elements/collections.py +99 -38
  30. natural_pdf/elements/region.py +77 -37
  31. natural_pdf/elements/text.py +5 -0
  32. natural_pdf/exporters/__init__.py +4 -0
  33. natural_pdf/exporters/base.py +61 -0
  34. natural_pdf/exporters/paddleocr.py +345 -0
  35. natural_pdf/ocr/__init__.py +57 -36
  36. natural_pdf/ocr/engine.py +160 -49
  37. natural_pdf/ocr/engine_easyocr.py +178 -157
  38. natural_pdf/ocr/engine_paddle.py +114 -189
  39. natural_pdf/ocr/engine_surya.py +87 -144
  40. natural_pdf/ocr/ocr_factory.py +125 -0
  41. natural_pdf/ocr/ocr_manager.py +65 -89
  42. natural_pdf/ocr/ocr_options.py +8 -13
  43. natural_pdf/ocr/utils.py +113 -0
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
  45. natural_pdf/templates/spa/css/style.css +334 -0
  46. natural_pdf/templates/spa/index.html +31 -0
  47. natural_pdf/templates/spa/js/app.js +472 -0
  48. natural_pdf/templates/spa/words.txt +235976 -0
  49. natural_pdf/utils/debug.py +34 -0
  50. natural_pdf/utils/identifiers.py +33 -0
  51. natural_pdf/utils/packaging.py +485 -0
  52. natural_pdf/utils/text_extraction.py +44 -64
  53. natural_pdf/utils/visualization.py +1 -1
  54. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +44 -20
  55. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +58 -47
  56. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +1 -1
  57. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -1
  58. natural_pdf/templates/ocr_debug.html +0 -517
  59. tests/test_loading.py +0 -50
  60. tests/test_optional_deps.py +0 -298
  61. {natural_pdf-0.1.5.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,345 @@
1
+ import os
2
+ import logging
3
+ import random
4
+ import shutil
5
+ from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
6
+ from tqdm import tqdm
7
+
8
+ from natural_pdf.exporters.base import FinetuneExporter
9
+
10
+ # Need to import this utility
11
+ from natural_pdf.utils.identifiers import generate_short_path_hash
12
+
13
+ if TYPE_CHECKING:
14
+ from natural_pdf.core.pdf import PDF
15
+ from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.elements.text import TextElement
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]" # Selector applied when corrected_only=True; intended to match sources such as manifest-import.
21
+
22
+
23
class PaddleOCRRecognitionExporter(FinetuneExporter):
    """
    Exports data for fine-tuning a PaddleOCR text recognition model.

    Creates a directory structure with cropped text images and label files
    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.

    Files written below ``output_dir``:
        images/<hash>_p<page>_e<n>.png   one cropped image per exported element
        dict.txt                         one character per line
        train.txt + val.txt              when ``split_ratio`` is strictly between 0 and 1
        label.txt                        otherwise
        fine_tune_paddleocr.ipynb        when ``include_guide`` is True and the
                                         optional conversion libraries are importable
    """

    def __init__(
        self,
        resolution: int = 150,
        padding: int = 2,
        selector: Optional[str] = None,
        corrected_only: bool = False,
        split_ratio: Optional[float] = 0.9,
        include_guide: bool = True,
        random_seed: Optional[int] = 42,
    ):
        """
        Initialize the PaddleOCR Recognition Exporter.

        Args:
            resolution: DPI resolution for rendering text region images (default: 150).
            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
            selector: CSS-like selector to filter which TextElements to export.
                      If None and corrected_only is False, all 'text' elements are considered.
            corrected_only: If True, overrides selector and exports only elements likely
                            originating from a correction manifest (selector="text[source^=manifest]").
                            (default: False).
            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
                         If None (or not strictly between 0 and 1), a single `label.txt`
                         file is created instead (default: 0.9).
            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
                           in the output directory (default: True).
            random_seed: Seed for the random number generator used for train/val split shuffling,
                         ensuring reproducibility (default: 42). If None, the module-level
                         `random` generator is used as-is.
        """
        if corrected_only and selector:
            logger.warning(
                f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
                f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
            )
            self.selector = DEFAULT_SELECTOR_CORRECTED
        elif corrected_only:
            self.selector = DEFAULT_SELECTOR_CORRECTED
        elif selector:
            self.selector = selector
        else:
            self.selector = "text"  # Default to all text elements if nothing else specified

        self.resolution = resolution
        self.padding = padding
        self.split_ratio = split_ratio
        self.include_guide = include_guide
        self.random_seed = random_seed

        logger.info(
            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
        )

    def export(
        self,
        source: Union["PDF", "PDFCollection", List["PDF"]],
        output_dir: str,
        **kwargs,  # Allow for potential future args
    ):
        """
        Exports text elements from the source PDF(s) to the specified output directory
        in PaddleOCR text recognition format.

        Elements are skipped (and counted as skipped) when their text is empty,
        is not a string, or contains a newline, carriage return or tab: the label
        files are line-oriented and tab-delimited, so such text cannot be stored
        as a single label.

        Args:
            source: The PDF object, PDFCollection, or list of PDF objects to process.
            output_dir: The path to the directory where the exported files will be saved.
                        The directory will be created if it doesn't exist.
            **kwargs: Optional keyword arguments (currently unused).

        Raises:
            OSError: If the output directories cannot be created.
            Exception: Any error raised while writing `dict.txt` or the label files
                is logged and re-raised.
        """
        # --- 1. Setup and Validation ---
        pdfs_to_process = self._resolve_source_pdfs(source)
        if not pdfs_to_process:
            logger.error("No valid PDF sources found. Aborting export.")
            return

        try:
            os.makedirs(output_dir, exist_ok=True)
            images_dir = os.path.join(output_dir, "images")
            os.makedirs(images_dir, exist_ok=True)
        except OSError as e:
            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
            raise

        # --- 2. Collect Elements and Render Images ---
        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
        char_set: Set[str] = set()
        elements_processed = 0
        elements_skipped = 0

        logger.info(
            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
        )

        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
            # The path is needed both for the filename hash and for log messages
            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
                continue
            pdf_hash = generate_short_path_hash(pdf.path)
            try:
                if not hasattr(pdf, "find_all"):
                    logger.warning(
                        f"PDF object {pdf.path} does not have find_all method. Skipping."
                    )
                    continue

                elements = pdf.find_all(
                    self.selector, apply_exclusions=False
                )  # Usually want all text, even if excluded
                if not elements:
                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
                    continue

                for i, element in enumerate(
                    tqdm(
                        elements,
                        desc=f"Exporting '{os.path.basename(pdf.path)}'",
                        leave=False,
                        position=1,
                    )
                ):
                    # Ensure the element exposes what is needed below; to_image is
                    # not checked because it is called on the result of expand().
                    if not (
                        hasattr(element, "page")
                        and hasattr(element, "text")
                        and hasattr(element, "expand")
                    ):
                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
                        elements_skipped += 1
                        continue

                    element_text = element.text
                    reason = self._label_skip_reason(element_text)
                    if reason is not None:
                        logger.debug(
                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
                        )
                        elements_skipped += 1
                        continue

                    # Use page index if available, otherwise fall back to 0.
                    page_index = getattr(element.page, "index", 0)
                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
                    relative_image_path = os.path.join("images", image_filename)
                    absolute_image_path = os.path.join(output_dir, relative_image_path)

                    try:
                        # Expand region, render, and save image
                        region = element.expand(self.padding)
                        img = region.to_image(
                            resolution=self.resolution, crop_only=True, include_highlights=False
                        )
                        img.save(absolute_image_path, "PNG")

                        # Labels always use forward slashes, regardless of platform
                        labels.append(
                            (relative_image_path.replace(os.path.sep, "/"), element_text)
                        )
                        char_set.update(element_text)
                        elements_processed += 1

                    except Exception as e:
                        page_num_str = getattr(
                            element.page, "number", "N/A"
                        )  # Get page number safely
                        logger.error(
                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
                            exc_info=False,  # Keep log cleaner
                        )
                        elements_skipped += 1

            except Exception as e:
                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
                # Continue with other PDFs if possible

        if elements_processed == 0:
            logger.error(
                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
            )
            # Directories created above are intentionally left in place.
            return

        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")

        # --- 3. Generate Dictionary File (`dict.txt`) ---
        self._write_dictionary(output_dir, char_set)

        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
        self._write_label_files(output_dir, labels)

        # --- 5. Include Guide Notebook ---
        if self.include_guide:
            self._copy_guide_notebook(output_dir)

        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")

    @staticmethod
    def _label_skip_reason(text) -> Optional[str]:
        """Return why `text` cannot be used as a label, or None if it is usable."""
        if not text:
            return "empty text"
        if not isinstance(text, str):
            return "invalid text type"
        if "\n" in text:
            return "contains newline"
        # A tab would be read back as the path/label delimiter and a carriage
        # return as a line break, so either would corrupt the label file.
        if "\t" in text or "\r" in text:
            return "contains tab or carriage return"
        return None

    @staticmethod
    def _write_dictionary(output_dir: str, char_set: Set[str]) -> None:
        """Write `dict.txt` (one character per line, reverse-sorted); log and re-raise on failure."""
        dict_path = os.path.join(output_dir, "dict.txt")
        try:
            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
            # Only the characters actually found are written; no special tokens
            # such as <UNK> or <BLK> are added.
            sorted_chars = sorted(char_set, reverse=True)
            with open(dict_path, "w", encoding="utf-8") as f_dict:
                for char in sorted_chars:
                    # Never write empty strings or bare newlines as dictionary entries
                    if char and char != "\n":
                        f_dict.write(char + "\n")
            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
        except Exception as e:
            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
            raise  # Re-raise as this is critical

    @staticmethod
    def _write_label_file(path: str, entries: List[Tuple[str, str]]) -> None:
        """Write `entries` to `path` as `<image path><TAB><text>` lines."""
        with open(path, "w", encoding="utf-8") as f_label:
            f_label.writelines(f"{img_path}\t{text}\n" for img_path, text in entries)

    def _write_label_files(self, output_dir: str, labels: List[Tuple[str, str]]) -> None:
        """Write `train.txt`/`val.txt` (split mode) or a single `label.txt`.

        In split mode `labels` is shuffled in place before being divided.
        Write failures are logged and re-raised.
        """
        ratio = self.split_ratio
        if ratio is not None and 0 < ratio < 1:
            if self.random_seed is None:
                random.shuffle(labels)
            else:
                # A private generator yields the same order as seeding the
                # module-level one, without reseeding it for the whole process.
                random.Random(self.random_seed).shuffle(labels)
            split_index = int(len(labels) * ratio)
            train_labels = labels[:split_index]
            val_labels = labels[split_index:]
            if not train_labels or not val_labels:
                logger.warning(
                    f"split_ratio={ratio} with {len(labels)} label(s) produced an empty "
                    f"{'training' if not train_labels else 'validation'} set."
                )

            try:
                train_path = os.path.join(output_dir, "train.txt")
                self._write_label_file(train_path, train_labels)
                logger.info(
                    f"Created training label file with {len(train_labels)} entries: {train_path}"
                )

                val_path = os.path.join(output_dir, "val.txt")
                self._write_label_file(val_path, val_labels)
                logger.info(
                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
                )
            except Exception as e:
                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
                raise
        else:
            if ratio is not None:
                logger.warning(
                    f"split_ratio={ratio} is not strictly between 0 and 1; "
                    "writing a single label.txt instead of train/val files."
                )
            label_path = os.path.join(output_dir, "label.txt")
            try:
                self._write_label_file(label_path, labels)
                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
            except Exception as e:
                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
                raise

    def _copy_guide_notebook(self, output_dir: str):
        """Locates, converts (md->ipynb), and copies the guide notebook.

        Best-effort: if the optional conversion libraries are missing, the
        template cannot be found, or conversion fails, the problem is logged and
        the method returns without raising.
        """
        try:
            # Optional dependencies; imported lazily so the exporter works without them
            import jupytext
            from nbformat import write as write_notebook
        except ImportError:
            logger.warning(
                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
            )
            return

        try:
            # Locate the template .md file relative to this module
            exporter_dir = os.path.dirname(os.path.abspath(__file__))
            # Go up one level (exporters -> natural_pdf), then down to templates/finetune
            template_dir = os.path.abspath(
                os.path.join(exporter_dir, "..", "templates", "finetune")
            )
            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")

            if not os.path.exists(template_md_path):
                logger.error(
                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
                )
                # Fallback: path relative to the current working directory, which
                # works when running from the project root
                alt_template_path = os.path.abspath(
                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
                )
                if os.path.exists(alt_template_path):
                    template_md_path = alt_template_path
                    logger.info(f"Found guide template at alternate path: {template_md_path}")
                else:
                    logger.error(
                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
                    )
                    return

            # Convert Markdown to a notebook object using jupytext
            logger.debug(f"Reading guide template from: {template_md_path}")
            notebook = jupytext.read(template_md_path)

            # Write the notebook object to the output .ipynb file
            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
                write_notebook(notebook, f_nb)

            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")

        except Exception as e:
            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)
@@ -8,58 +8,79 @@ import logging
8
8
 
9
9
  # Set up module logger
10
10
  logger = logging.getLogger("natural_pdf.ocr")
11
+
12
+ # Import the base classes that are always available
11
13
  from .engine import OCREngine
12
- from .engine_paddle import PaddleOCREngine
13
- from .engine_surya import SuryaOCREngine
14
+ from .ocr_options import (
15
+ OCROptions,
16
+ BaseOCROptions,
17
+ EasyOCROptions,
18
+ PaddleOCROptions,
19
+ SuryaOCROptions,
20
+ )
14
21
  from .ocr_manager import OCRManager
15
- from .ocr_options import OCROptions
22
+ from .ocr_factory import OCRFactory
16
23
 
24
+ # Add all public symbols that should be available when importing this module
17
25
  __all__ = [
18
26
  "OCRManager",
19
27
  "OCREngine",
20
28
  "OCROptions",
21
- "EasyOCREngine",
22
- "PaddleOCREngine",
23
- "SuryaOCREngine",
29
+ "BaseOCROptions",
30
+ "EasyOCROptions",
31
+ "PaddleOCROptions",
32
+ "SuryaOCROptions",
33
+ "OCRFactory",
34
+ "get_engine",
35
+ "list_available_engines",
24
36
  ]
25
37
 
26
- DEFAULT_ENGINE = SuryaOCREngine
27
-
28
38
 
29
39
def get_engine(engine_name=None, **kwargs):
    """
    Get OCR engine by name with graceful handling of missing dependencies.

    Args:
        engine_name: Name of the engine to use ('easyocr', 'paddle', 'surya'),
                    matched case-insensitively. The legacy name 'paddleocr' is
                    still accepted as an alias for 'paddle'.
                    If None or "default", the engine recommended by OCRFactory is used.
        **kwargs: Additional arguments to pass to the engine constructor

    Returns:
        OCREngine instance

    Raises:
        ImportError: If the requested engine's dependencies aren't installed
        ValueError: If the engine_name is unknown (including non-string values)
    """
    logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")

    # Names accepted by earlier releases, mapped to the names the factory uses.
    legacy_aliases = {"paddleocr": "paddle"}

    try:
        if engine_name is None or engine_name == "default":
            # Use the factory to get the best available engine
            engine = OCRFactory.get_recommended_engine(**kwargs)
            logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
            return engine

        # Report a non-string name as an unknown engine instead of letting
        # .lower() fail with AttributeError.
        if not isinstance(engine_name, str):
            raise ValueError(f"Unknown OCR engine: {engine_name}")

        # Use the factory to create a specific engine
        normalized_name = engine_name.lower()
        normalized_name = legacy_aliases.get(normalized_name, normalized_name)
        if normalized_name in ("easyocr", "paddle", "surya"):
            return OCRFactory.create_engine(normalized_name, **kwargs)
        raise ValueError(f"Unknown OCR engine: {engine_name}")

    except ImportError as e:
        logger.error(f"OCR engine dependency error: {e}")
        raise
    except Exception as e:
        logger.error(f"Error initializing OCR engine: {e}")
        raise
77
+
78
+
79
+ def list_available_engines():
80
+ """
81
+ List the OCR engines known to OCRFactory and whether each is available.
82
+
83
+ Returns:
84
+ Dict[str, bool]: Dictionary mapping engine names to availability status
40
85
  """
41
- logger.debug(f"Initializing OCR engine: {engine_name or 'default'}")
42
-
43
- if engine_name is None or engine_name == "default":
44
- engine = DEFAULT_ENGINE(**kwargs)
45
- logger.info(f"Using default OCR engine: {engine.__class__.__name__}")
46
- return engine
47
-
48
- if engine_name.lower() == "easyocr":
49
- logger.info("Initializing EasyOCR engine")
50
- return EasyOCREngine(**kwargs)
51
-
52
- if engine_name.lower() == "paddleocr":
53
- try:
54
- from .engine_paddle import PaddleOCREngine
55
-
56
- logger.info("Initializing PaddleOCR engine")
57
- return PaddleOCREngine(**kwargs)
58
- except ImportError:
59
- logger.error("PaddleOCR is not installed")
60
- raise ImportError(
61
- "PaddleOCR is not installed. Please install it with: pip install paddlepaddle paddleocr"
62
- )
63
-
64
- logger.error(f"Unknown OCR engine: {engine_name}")
65
- raise ValueError(f"Unknown OCR engine: {engine_name}")
86
+ return OCRFactory.list_available_engines()