natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/finetuning/index.md +176 -0
  6. docs/index.md +19 -0
  7. docs/ocr/index.md +63 -16
  8. docs/tutorials/01-loading-and-extraction.ipynb +411 -248
  9. docs/tutorials/02-finding-elements.ipynb +123 -46
  10. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  11. docs/tutorials/04-table-extraction.ipynb +17 -12
  12. docs/tutorials/05-excluding-content.ipynb +37 -32
  13. docs/tutorials/06-document-qa.ipynb +36 -31
  14. docs/tutorials/07-layout-analysis.ipynb +45 -40
  15. docs/tutorials/07-working-with-regions.ipynb +61 -60
  16. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  17. docs/tutorials/09-section-extraction.ipynb +160 -155
  18. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  19. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  20. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  21. docs/tutorials/12-ocr-integration.md +68 -106
  22. docs/tutorials/13-semantic-search.ipynb +641 -251
  23. natural_pdf/__init__.py +3 -0
  24. natural_pdf/analyzers/layout/gemini.py +63 -47
  25. natural_pdf/classification/manager.py +343 -0
  26. natural_pdf/classification/mixin.py +149 -0
  27. natural_pdf/classification/results.py +62 -0
  28. natural_pdf/collections/mixins.py +63 -0
  29. natural_pdf/collections/pdf_collection.py +326 -17
  30. natural_pdf/core/element_manager.py +73 -4
  31. natural_pdf/core/page.py +255 -83
  32. natural_pdf/core/pdf.py +385 -367
  33. natural_pdf/elements/base.py +1 -3
  34. natural_pdf/elements/collections.py +279 -49
  35. natural_pdf/elements/region.py +106 -21
  36. natural_pdf/elements/text.py +5 -2
  37. natural_pdf/exporters/__init__.py +4 -0
  38. natural_pdf/exporters/base.py +61 -0
  39. natural_pdf/exporters/paddleocr.py +345 -0
  40. natural_pdf/extraction/manager.py +134 -0
  41. natural_pdf/extraction/mixin.py +246 -0
  42. natural_pdf/extraction/result.py +37 -0
  43. natural_pdf/ocr/__init__.py +16 -8
  44. natural_pdf/ocr/engine.py +46 -30
  45. natural_pdf/ocr/engine_easyocr.py +86 -42
  46. natural_pdf/ocr/engine_paddle.py +39 -28
  47. natural_pdf/ocr/engine_surya.py +32 -16
  48. natural_pdf/ocr/ocr_factory.py +34 -23
  49. natural_pdf/ocr/ocr_manager.py +98 -34
  50. natural_pdf/ocr/ocr_options.py +38 -10
  51. natural_pdf/ocr/utils.py +59 -33
  52. natural_pdf/qa/document_qa.py +0 -4
  53. natural_pdf/selectors/parser.py +363 -238
  54. natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
  55. natural_pdf/utils/debug.py +4 -2
  56. natural_pdf/utils/identifiers.py +9 -5
  57. natural_pdf/utils/locks.py +8 -0
  58. natural_pdf/utils/packaging.py +172 -105
  59. natural_pdf/utils/text_extraction.py +96 -65
  60. natural_pdf/utils/tqdm_utils.py +43 -0
  61. natural_pdf/utils/visualization.py +1 -1
  62. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
  63. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
  64. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  65. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  66. {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -11,7 +11,15 @@ from natural_pdf.elements.base import DirectionalMixin
11
11
  # Import new utils
12
12
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
13
13
 
14
- from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
14
+ from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
15
+
16
+ # --- Classification Imports --- #
17
+ from natural_pdf.classification.mixin import ClassificationMixin
18
+ from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
19
+ # --- End Classification Imports --- #
20
+
21
+ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
22
+ from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
15
23
 
16
24
  if TYPE_CHECKING:
17
25
  from natural_pdf.core.page import Page
@@ -27,7 +35,7 @@ except ImportError:
27
35
  logger = logging.getLogger(__name__)
28
36
 
29
37
 
30
- class Region(DirectionalMixin):
38
+ class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
31
39
  """
32
40
  Represents a rectangular region on a page.
33
41
  """
@@ -57,6 +65,12 @@ class Region(DirectionalMixin):
57
65
  self.start_element = None
58
66
  self.end_element = None
59
67
 
68
+ # --- ADDED --- Metadata store for mixins
69
+ self.metadata: Dict[str, Any] = {}
70
+ # --- NEW --- Central registry for analysis results
71
+ self.analyses: Dict[str, Any] = {}
72
+ # --- END ADDED ---
73
+
60
74
  # Standard attributes for all elements
61
75
  self.object_type = "region" # For selector compatibility
62
76
 
@@ -600,6 +614,18 @@ class Region(DirectionalMixin):
600
614
  x1 = int(self.x1 * scale_factor)
601
615
  bottom = int(self.bottom * scale_factor)
602
616
 
617
+ # Ensure coords are valid for cropping (left < right, top < bottom)
618
+ if x0 >= x1:
619
+ logger.warning(
620
+ f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
621
+ )
622
+ return None
623
+ if top >= bottom:
624
+ logger.warning(
625
+ f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
626
+ )
627
+ return None
628
+
603
629
  # Crop the image to just this region
604
630
  region_image = page_image.crop((x0, top, x1, bottom))
605
631
 
@@ -776,11 +802,6 @@ class Region(DirectionalMixin):
776
802
  debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
777
803
  logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
778
804
 
779
- # --- Handle Docling source (priority) --- DEPRECATED or Adapt?
780
- # For now, let's bypass this and always use the standard extraction flow
781
- # based on contained elements to ensure consistency.
782
- # if self.model == 'docling' or hasattr(self, 'text_content'): ...
783
-
784
805
  # 1. Get Word Elements potentially within this region (initial broad phase)
785
806
  # Optimization: Could use spatial query if page elements were indexed
786
807
  page_words = self.page.words # Get all words from the page
@@ -829,7 +850,7 @@ class Region(DirectionalMixin):
829
850
  result = generate_text_layout(
830
851
  char_dicts=filtered_chars,
831
852
  layout_context_bbox=self.bbox, # Use region's bbox for context
832
- user_kwargs=kwargs,
853
+ user_kwargs=kwargs, # Pass original kwargs to layout generator
833
854
  )
834
855
 
835
856
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
@@ -1084,11 +1105,14 @@ class Region(DirectionalMixin):
1084
1105
  filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
1085
1106
  return ElementCollection(filtered_elements)
1086
1107
 
1087
- def apply_ocr(self, **ocr_params) -> "Region":
1108
+ def apply_ocr(self, replace=True, **ocr_params) -> "Region":
1088
1109
  """
1089
1110
  Apply OCR to this region and return the created text elements.
1090
1111
 
1091
1112
  Args:
1113
+ replace: If True (default), removes existing OCR elements in the region
1114
+ before adding new ones. If False, adds new OCR elements without
1115
+ removing existing ones.
1092
1116
  **ocr_params: Keyword arguments passed to the OCR Manager.
1093
1117
  Common parameters like `engine`, `languages`, `min_confidence`,
1094
1118
  `device`, and `resolution` (for image rendering) should be
@@ -1098,17 +1122,33 @@ class Region(DirectionalMixin):
1098
1122
  an `options` object (e.g., `options=EasyOCROptions(...)`).
1099
1123
 
1100
1124
  Returns:
1101
- List of created TextElement objects representing OCR words/lines.
1125
+ Self for method chaining.
1102
1126
  """
1103
1127
  # Ensure OCRManager is available
1104
1128
  if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
1105
1129
  logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
1106
- return []
1130
+ return self
1131
+
1132
+ # If replace is True, find and remove existing OCR elements in this region
1133
+ if replace:
1134
+ logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
1135
+ # Find all OCR elements in this region
1136
+ ocr_selector = "text[source=ocr]"
1137
+ ocr_elements = self.find_all(ocr_selector)
1138
+
1139
+ if ocr_elements:
1140
+ logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
1141
+ # Remove these elements from their page
1142
+ removed_count = ocr_elements.remove()
1143
+ logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
1144
+ else:
1145
+ logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
1146
+
1107
1147
  ocr_mgr = self.page._parent._ocr_manager
1108
1148
 
1109
1149
  # Determine rendering resolution from parameters
1110
1150
  final_resolution = ocr_params.get("resolution")
1111
- if final_resolution is None and hasattr(self.page, '_parent') and self.page._parent:
1151
+ if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
1112
1152
  final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
1113
1153
  elif final_resolution is None:
1114
1154
  final_resolution = 150
@@ -1123,11 +1163,11 @@ class Region(DirectionalMixin):
1123
1163
  )
1124
1164
  if not region_image:
1125
1165
  logger.error("Failed to render region to image for OCR.")
1126
- return []
1166
+ return self
1127
1167
  logger.debug(f"Region rendered to image size: {region_image.size}")
1128
1168
  except Exception as e:
1129
1169
  logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
1130
- return []
1170
+ return self
1131
1171
 
1132
1172
  # Prepare args for the OCR Manager
1133
1173
  manager_args = {
@@ -1148,11 +1188,11 @@ class Region(DirectionalMixin):
1148
1188
  logger.error(
1149
1189
  f"OCRManager returned unexpected type for single region image: {type(results)}"
1150
1190
  )
1151
- return []
1191
+ return self
1152
1192
  logger.debug(f"Region OCR processing returned {len(results)} results.")
1153
1193
  except Exception as e:
1154
1194
  logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
1155
- return []
1195
+ return self
1156
1196
 
1157
1197
  # Convert results to TextElements
1158
1198
  scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
@@ -1191,6 +1231,7 @@ class Region(DirectionalMixin):
1191
1231
  ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
1192
1232
  element_data["_char_dicts"] = [ocr_char_dict]
1193
1233
  from natural_pdf.elements.text import TextElement
1234
+
1194
1235
  elem = TextElement(element_data, self.page)
1195
1236
  created_elements.append(elem)
1196
1237
  self.page._element_mgr.add_element(elem, element_type="words")
@@ -1692,7 +1733,7 @@ class Region(DirectionalMixin):
1692
1733
  def correct_ocr(
1693
1734
  self,
1694
1735
  correction_callback: Callable[[Any], Optional[str]],
1695
- ) -> "Region": # Return self for chaining
1736
+ ) -> "Region": # Return self for chaining
1696
1737
  """
1697
1738
  Applies corrections to OCR-generated text elements within this region
1698
1739
  using a user-provided callback function.
@@ -1718,13 +1759,57 @@ class Region(DirectionalMixin):
1718
1759
  """
1719
1760
  # Find OCR elements specifically within this region
1720
1761
  # Note: We typically want to correct even if the element falls in an excluded area
1721
- target_elements = self.find_all(selector="text[source^=ocr]", apply_exclusions=False)
1762
+ target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
1722
1763
 
1723
1764
  # Delegate to the utility function
1724
1765
  _apply_ocr_correction_to_elements(
1725
- elements=target_elements, # Pass the ElementCollection directly
1766
+ elements=target_elements, # Pass the ElementCollection directly
1726
1767
  correction_callback=correction_callback,
1727
- caller_info=f"Region({self.bbox})", # Pass caller info
1768
+ caller_info=f"Region({self.bbox})", # Pass caller info
1728
1769
  )
1729
1770
 
1730
- return self # Return self for chaining
1771
+ return self # Return self for chaining
1772
+
1773
+ # --- Classification Mixin Implementation --- #
1774
+ def _get_classification_manager(self) -> "ClassificationManager":
1775
+ if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
1776
+ raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
1777
+ try:
1778
+ # Use the PDF's manager registry accessor via page
1779
+ return self.page.pdf.get_manager('classification')
1780
+ except (ValueError, RuntimeError, AttributeError) as e:
1781
+ # Wrap potential errors from get_manager for clarity
1782
+ raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
1783
+
1784
+ def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
1785
+ if model_type == 'text':
1786
+ text_content = self.extract_text(layout=False) # Simple join for classification
1787
+ if not text_content or text_content.isspace():
1788
+ raise ValueError("Cannot classify region with 'text' model: No text content found.")
1789
+ return text_content
1790
+ elif model_type == 'vision':
1791
+ # Get resolution from manager/kwargs if possible, else default
1792
+ # We access manager via the method to ensure it's available
1793
+ manager = self._get_classification_manager()
1794
+ default_resolution = 150 # Manager doesn't store default res, set here
1795
+ # Note: classify() passes resolution via **kwargs if user specifies
1796
+ resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
1797
+
1798
+ img = self.to_image(
1799
+ resolution=resolution,
1800
+ include_highlights=False, # No highlights for classification input
1801
+ crop_only=True # Just the region content
1802
+ )
1803
+ if img is None:
1804
+ raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
1805
+ return img
1806
+ else:
1807
+ raise ValueError(f"Unsupported model_type for classification: {model_type}")
1808
+
1809
+ def _get_metadata_storage(self) -> Dict[str, Any]:
1810
+ # Ensure metadata exists
1811
+ if not hasattr(self, 'metadata') or self.metadata is None:
1812
+ self.metadata = {}
1813
+ return self.metadata
1814
+
1815
+ # --- End Classification Mixin Implementation --- #
@@ -274,9 +274,12 @@ class TextElement(Element):
274
274
 
275
275
  return False
276
276
 
277
- def __repr__(self) -> str:
277
+ def __repr__(self) -> str:
278
278
  """String representation of the text element."""
279
- preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
279
+ if self.text:
280
+ preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
281
+ else:
282
+ preview = "..."
280
283
  font_style = []
281
284
  if self.bold:
282
285
  font_style.append("bold")
@@ -0,0 +1,4 @@
1
+ from .base import FinetuneExporter
2
+ from .paddleocr import PaddleOCRRecognitionExporter
3
+
4
+ __all__ = ["FinetuneExporter", "PaddleOCRRecognitionExporter"]
@@ -0,0 +1,61 @@
1
+ import abc
2
+ import logging
3
+ from typing import Union, List, TYPE_CHECKING
4
+
5
+ if TYPE_CHECKING:
6
+ from natural_pdf.core.pdf import PDF
7
+ from natural_pdf.collections.pdf_collection import PDFCollection
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class FinetuneExporter(abc.ABC):
13
+ """
14
+ Abstract base class for exporting data suitable for fine-tuning models.
15
+ """
16
+
17
+ @abc.abstractmethod
18
+ def __init__(self, **kwargs):
19
+ """
20
+ Initialize the exporter with format-specific options.
21
+ """
22
+ pass
23
+
24
+ @abc.abstractmethod
25
+ def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
26
+ """
27
+ Exports the data from the source PDF(s) to the specified output directory
28
+ in a format suitable for fine-tuning a specific model type.
29
+
30
+ Args:
31
+ source: The PDF object, PDFCollection, or list of PDF objects to process.
32
+ output_dir: The path to the directory where the exported files will be saved.
33
+ **kwargs: Additional export-time arguments.
34
+ """
35
+ pass
36
+
37
+ def _resolve_source_pdfs(
38
+ self, source: Union["PDF", "PDFCollection", List["PDF"]]
39
+ ) -> List["PDF"]:
40
+ """
41
+ Helper to consistently resolve the input source to a list of PDF objects.
42
+ """
43
+ from natural_pdf.core.pdf import PDF # Avoid circular import at module level
44
+ from natural_pdf.collections.pdf_collection import PDFCollection # Avoid circular import
45
+
46
+ pdfs_to_process: List["PDF"] = []
47
+ if isinstance(source, PDF):
48
+ pdfs_to_process = [source]
49
+ elif isinstance(source, PDFCollection):
50
+ pdfs_to_process = source.pdfs
51
+ elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
52
+ pdfs_to_process = source
53
+ else:
54
+ raise TypeError(
55
+ f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
56
+ )
57
+
58
+ if not pdfs_to_process:
59
+ logger.warning("No PDF documents provided in the source.")
60
+
61
+ return pdfs_to_process
@@ -0,0 +1,345 @@
1
+ import os
2
+ import logging
3
+ import random
4
+ import shutil
5
+ from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
6
+ from tqdm import tqdm
7
+
8
+ from natural_pdf.exporters.base import FinetuneExporter
9
+
10
+ # Need to import this utility
11
+ from natural_pdf.utils.identifiers import generate_short_path_hash
12
+
13
+ if TYPE_CHECKING:
14
+ from natural_pdf.core.pdf import PDF
15
+ from natural_pdf.collections.pdf_collection import PDFCollection
16
+ from natural_pdf.elements.text import TextElement
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]" # Match manifest-import etc.
21
+
22
+
23
+ class PaddleOCRRecognitionExporter(FinetuneExporter):
24
+ """
25
+ Exports data for fine-tuning a PaddleOCR text recognition model.
26
+
27
+ Creates a directory structure with cropped text images and label files
28
+ (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
29
+ Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
30
+ """
31
+
32
+ def __init__(
33
+ self,
34
+ resolution: int = 150,
35
+ padding: int = 2,
36
+ selector: Optional[str] = None,
37
+ corrected_only: bool = False,
38
+ split_ratio: Optional[float] = 0.9,
39
+ include_guide: bool = True,
40
+ random_seed: Optional[int] = 42,
41
+ ):
42
+ """
43
+ Initialize the PaddleOCR Recognition Exporter.
44
+
45
+ Args:
46
+ resolution: DPI resolution for rendering text region images (default: 150).
47
+ padding: Padding (in points) to add around text element bbox before cropping (default: 2).
48
+ selector: CSS-like selector to filter which TextElements to export.
49
+ If None and corrected_only is False, all 'text' elements are considered.
50
+ corrected_only: If True, overrides selector and exports only elements likely
51
+ originating from a correction manifest (selector="text[source=manifest]").
52
+ (default: False).
53
+ split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
54
+ If None, creates a single `label.txt` file (default: 0.9).
55
+ include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
56
+ in the output directory (default: True).
57
+ random_seed: Seed for the random number generator used for train/val split shuffling,
58
+ ensuring reproducibility (default: 42).
59
+ """
60
+ if corrected_only and selector:
61
+ logger.warning(
62
+ f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
63
+ f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
64
+ )
65
+ self.selector = DEFAULT_SELECTOR_CORRECTED
66
+ elif corrected_only:
67
+ self.selector = DEFAULT_SELECTOR_CORRECTED
68
+ elif selector:
69
+ self.selector = selector
70
+ else:
71
+ self.selector = "text" # Default to all text elements if nothing else specified
72
+
73
+ self.resolution = resolution
74
+ self.padding = padding
75
+ self.split_ratio = split_ratio
76
+ self.include_guide = include_guide
77
+ self.random_seed = random_seed
78
+
79
+ logger.info(
80
+ f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
81
+ f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
82
+ )
83
+
84
+ def export(
85
+ self,
86
+ source: Union["PDF", "PDFCollection", List["PDF"]],
87
+ output_dir: str,
88
+ **kwargs, # Allow for potential future args
89
+ ):
90
+ """
91
+ Exports text elements from the source PDF(s) to the specified output directory
92
+ in PaddleOCR text recognition format.
93
+
94
+ Args:
95
+ source: The PDF object, PDFCollection, or list of PDF objects to process.
96
+ output_dir: The path to the directory where the exported files will be saved.
97
+ The directory will be created if it doesn't exist.
98
+ **kwargs: Optional keyword arguments (currently unused).
99
+ """
100
+ # --- 1. Setup and Validation ---
101
+ pdfs_to_process = self._resolve_source_pdfs(source)
102
+ if not pdfs_to_process:
103
+ logger.error("No valid PDF sources found. Aborting export.")
104
+ return
105
+
106
+ try:
107
+ os.makedirs(output_dir, exist_ok=True)
108
+ images_dir = os.path.join(output_dir, "images")
109
+ os.makedirs(images_dir, exist_ok=True)
110
+ except OSError as e:
111
+ logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
112
+ raise
113
+
114
+ # --- 2. Collect Elements and Render Images ---
115
+ labels: List[Tuple[str, str]] = [] # List of (relative_image_path, text_label)
116
+ char_set: Set[str] = set()
117
+ elements_processed = 0
118
+ elements_skipped = 0
119
+
120
+ logger.info(
121
+ f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
122
+ )
123
+
124
+ for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
125
+ # Need to ensure pdf.path exists and is string
126
+ if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
127
+ logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
128
+ continue
129
+ pdf_hash = generate_short_path_hash(pdf.path)
130
+ try:
131
+ # Find elements using the specified selector
132
+ # Need to check if pdf has find_all method
133
+ if not hasattr(pdf, "find_all"):
134
+ logger.warning(
135
+ f"PDF object {pdf.path} does not have find_all method. Skipping."
136
+ )
137
+ continue
138
+
139
+ elements = pdf.find_all(
140
+ self.selector, apply_exclusions=False
141
+ ) # Usually want all text, even if excluded
142
+ if not elements:
143
+ logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
144
+ continue
145
+
146
+ for i, element in enumerate(
147
+ tqdm(
148
+ elements,
149
+ desc=f"Exporting '{os.path.basename(pdf.path)}'",
150
+ leave=False,
151
+ position=1,
152
+ )
153
+ ):
154
+ # Ensure it's a TextElement with necessary methods/attributes
155
+ # Removed check for to_image as it's called after expand()
156
+ if not (
157
+ hasattr(element, "page")
158
+ and hasattr(element, "text")
159
+ and hasattr(element, "expand")
160
+ ):
161
+ logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
162
+ elements_skipped += 1
163
+ continue
164
+
165
+ element_text = element.text
166
+ # Skip elements with no text, non-string text, or newlines
167
+ if (
168
+ not element_text
169
+ or not isinstance(element_text, str)
170
+ or "\n" in element_text
171
+ ):
172
+ if "\n" in str(element_text):
173
+ reason = "contains newline"
174
+ elif not element_text:
175
+ reason = "empty text"
176
+ else:
177
+ reason = "invalid text type"
178
+ logger.debug(
179
+ f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
180
+ )
181
+ elements_skipped += 1
182
+ continue
183
+
184
+ # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
185
+ page_index = getattr(element.page, "index", 0)
186
+ image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
187
+ relative_image_path = os.path.join("images", image_filename)
188
+ absolute_image_path = os.path.join(output_dir, relative_image_path)
189
+
190
+ try:
191
+ # Expand region, render, and save image
192
+ region = element.expand(self.padding)
193
+ img = region.to_image(
194
+ resolution=self.resolution, crop_only=True, include_highlights=False
195
+ )
196
+ img.save(absolute_image_path, "PNG")
197
+
198
+ # Add to labels and character set
199
+ labels.append(
200
+ (relative_image_path.replace(os.path.sep, "/"), element_text)
201
+ ) # Use forward slashes for labels
202
+ char_set.update(element_text)
203
+ elements_processed += 1
204
+
205
+ except Exception as e:
206
+ page_num_str = getattr(
207
+ element.page, "number", "N/A"
208
+ ) # Get page number safely
209
+ logger.error(
210
+ f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
211
+ exc_info=False, # Keep log cleaner
212
+ )
213
+ elements_skipped += 1
214
+
215
+ except Exception as e:
216
+ logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
217
+ # Continue with other PDFs if possible
218
+
219
+ if elements_processed == 0:
220
+ logger.error(
221
+ f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
222
+ )
223
+ # Clean up potentially created directories? Or leave them empty? Let's leave them.
224
+ return
225
+
226
+ logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
227
+
228
+ # --- 3. Generate Dictionary File (`dict.txt`) ---
229
+ dict_path = os.path.join(output_dir, "dict.txt")
230
+ try:
231
+ # Log the character set before sorting/writing
232
+ logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
233
+ # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
234
+ # for recognition models, but this might depend on the specific base model.
235
+ # Start with just the characters found.
236
+ sorted_chars = sorted(list(char_set), reverse=True)
237
+ with open(dict_path, "w", encoding="utf-8") as f_dict:
238
+ for char in sorted_chars:
239
+ # Ensure we don't write empty strings or just newlines as dictionary entries
240
+ if char and char != "\n":
241
+ f_dict.write(char + "\n")
242
+ logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
243
+ except Exception as e:
244
+ logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
245
+ raise # Re-raise as this is critical
246
+
247
+ # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
248
+ if self.split_ratio is not None and 0 < self.split_ratio < 1:
249
+ if self.random_seed is not None:
250
+ random.seed(self.random_seed)
251
+ random.shuffle(labels)
252
+ split_index = int(len(labels) * self.split_ratio)
253
+ train_labels = labels[:split_index]
254
+ val_labels = labels[split_index:]
255
+
256
+ try:
257
+ train_path = os.path.join(output_dir, "train.txt")
258
+ with open(train_path, "w", encoding="utf-8") as f_train:
259
+ for img_path, text in train_labels:
260
+ f_train.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
261
+ logger.info(
262
+ f"Created training label file with {len(train_labels)} entries: {train_path}"
263
+ )
264
+
265
+ val_path = os.path.join(output_dir, "val.txt")
266
+ with open(val_path, "w", encoding="utf-8") as f_val:
267
+ for img_path, text in val_labels:
268
+ f_val.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
269
+ logger.info(
270
+ f"Created validation label file with {len(val_labels)} entries: {val_path}"
271
+ )
272
+ except Exception as e:
273
+ logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
274
+ raise
275
+ else:
276
+ # Create a single label file
277
+ label_path = os.path.join(output_dir, "label.txt")
278
+ try:
279
+ with open(label_path, "w", encoding="utf-8") as f_label:
280
+ for img_path, text in labels:
281
+ f_label.write(f"{img_path}\t{text}\n") # Use literal tabs and newlines
282
+ logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
283
+ except Exception as e:
284
+ logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
285
+ raise
286
+
287
+ # --- 5. Include Guide Notebook ---
288
+ if self.include_guide:
289
+ self._copy_guide_notebook(output_dir)
290
+
291
+ logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")
292
+
293
+ def _copy_guide_notebook(self, output_dir: str):
294
+ """Locates, converts (md->ipynb), and copies the guide notebook."""
295
+ try:
296
+ # Try importing conversion library
297
+ import jupytext
298
+ from nbformat import write as write_notebook
299
+ except ImportError:
300
+ logger.warning(
301
+ "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
302
+ "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
303
+ )
304
+ return
305
+
306
+ try:
307
+ # Locate the template .md file relative to this script
308
+ exporter_dir = os.path.dirname(os.path.abspath(__file__))
309
+ # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
310
+ template_dir = os.path.abspath(
311
+ os.path.join(exporter_dir, "..", "templates", "finetune")
312
+ )
313
+ template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
314
+ output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")
315
+
316
+ if not os.path.exists(template_md_path):
317
+ logger.error(
318
+ f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
319
+ )
320
+ # Try path relative to workspace root as fallback if run from project root
321
+ alt_template_path = os.path.abspath(
322
+ os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
323
+ )
324
+ if os.path.exists(alt_template_path):
325
+ template_md_path = alt_template_path
326
+ logger.info(f"Found guide template at alternate path: {template_md_path}")
327
+ else:
328
+ logger.error(
329
+ f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
330
+ )
331
+ return
332
+
333
+ # Convert Markdown to Notebook object using jupytext
334
+ logger.debug(f"Reading guide template from: {template_md_path}")
335
+ notebook = jupytext.read(template_md_path) # Reads md and returns NotebookNode
336
+
337
+ # Write the Notebook object to the output .ipynb file
338
+ logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
339
+ with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
340
+ write_notebook(notebook, f_nb)
341
+
342
+ logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")
343
+
344
+ except Exception as e:
345
+ logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)