natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -11,7 +11,15 @@ from natural_pdf.elements.base import DirectionalMixin
|
|
11
11
|
# Import new utils
|
12
12
|
from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
|
13
13
|
|
14
|
-
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
|
14
|
+
from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements # Import utility
|
15
|
+
|
16
|
+
# --- Classification Imports --- #
|
17
|
+
from natural_pdf.classification.mixin import ClassificationMixin
|
18
|
+
from natural_pdf.classification.manager import ClassificationManager # Keep for type hint
|
19
|
+
# --- End Classification Imports --- #
|
20
|
+
|
21
|
+
from natural_pdf.utils.locks import pdf_render_lock # Import the lock
|
22
|
+
from natural_pdf.extraction.mixin import ExtractionMixin # Import extraction mixin
|
15
23
|
|
16
24
|
if TYPE_CHECKING:
|
17
25
|
from natural_pdf.core.page import Page
|
@@ -27,7 +35,7 @@ except ImportError:
|
|
27
35
|
logger = logging.getLogger(__name__)
|
28
36
|
|
29
37
|
|
30
|
-
class Region(DirectionalMixin):
|
38
|
+
class Region(DirectionalMixin, ClassificationMixin, ExtractionMixin):
|
31
39
|
"""
|
32
40
|
Represents a rectangular region on a page.
|
33
41
|
"""
|
@@ -57,6 +65,12 @@ class Region(DirectionalMixin):
|
|
57
65
|
self.start_element = None
|
58
66
|
self.end_element = None
|
59
67
|
|
68
|
+
# --- ADDED --- Metadata store for mixins
|
69
|
+
self.metadata: Dict[str, Any] = {}
|
70
|
+
# --- NEW --- Central registry for analysis results
|
71
|
+
self.analyses: Dict[str, Any] = {}
|
72
|
+
# --- END ADDED ---
|
73
|
+
|
60
74
|
# Standard attributes for all elements
|
61
75
|
self.object_type = "region" # For selector compatibility
|
62
76
|
|
@@ -600,6 +614,18 @@ class Region(DirectionalMixin):
|
|
600
614
|
x1 = int(self.x1 * scale_factor)
|
601
615
|
bottom = int(self.bottom * scale_factor)
|
602
616
|
|
617
|
+
# Ensure coords are valid for cropping (left < right, top < bottom)
|
618
|
+
if x0 >= x1:
|
619
|
+
logger.warning(
|
620
|
+
f"Region {self.bbox} resulted in non-positive width after scaling ({x0} >= {x1}). Cannot create image."
|
621
|
+
)
|
622
|
+
return None
|
623
|
+
if top >= bottom:
|
624
|
+
logger.warning(
|
625
|
+
f"Region {self.bbox} resulted in non-positive height after scaling ({top} >= {bottom}). Cannot create image."
|
626
|
+
)
|
627
|
+
return None
|
628
|
+
|
603
629
|
# Crop the image to just this region
|
604
630
|
region_image = page_image.crop((x0, top, x1, bottom))
|
605
631
|
|
@@ -776,11 +802,6 @@ class Region(DirectionalMixin):
|
|
776
802
|
debug = kwargs.get("debug", debug or kwargs.get("debug_exclusions", False))
|
777
803
|
logger.debug(f"Region {self.bbox}: extract_text called with kwargs: {kwargs}")
|
778
804
|
|
779
|
-
# --- Handle Docling source (priority) --- DEPRECATED or Adapt?
|
780
|
-
# For now, let's bypass this and always use the standard extraction flow
|
781
|
-
# based on contained elements to ensure consistency.
|
782
|
-
# if self.model == 'docling' or hasattr(self, 'text_content'): ...
|
783
|
-
|
784
805
|
# 1. Get Word Elements potentially within this region (initial broad phase)
|
785
806
|
# Optimization: Could use spatial query if page elements were indexed
|
786
807
|
page_words = self.page.words # Get all words from the page
|
@@ -829,7 +850,7 @@ class Region(DirectionalMixin):
|
|
829
850
|
result = generate_text_layout(
|
830
851
|
char_dicts=filtered_chars,
|
831
852
|
layout_context_bbox=self.bbox, # Use region's bbox for context
|
832
|
-
user_kwargs=kwargs,
|
853
|
+
user_kwargs=kwargs, # Pass original kwargs to layout generator
|
833
854
|
)
|
834
855
|
|
835
856
|
logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
|
@@ -1084,11 +1105,14 @@ class Region(DirectionalMixin):
|
|
1084
1105
|
filtered_elements = [e for e in page_elements if self._is_element_in_region(e)]
|
1085
1106
|
return ElementCollection(filtered_elements)
|
1086
1107
|
|
1087
|
-
def apply_ocr(self, **ocr_params) -> "Region":
|
1108
|
+
def apply_ocr(self, replace=True, **ocr_params) -> "Region":
|
1088
1109
|
"""
|
1089
1110
|
Apply OCR to this region and return the created text elements.
|
1090
1111
|
|
1091
1112
|
Args:
|
1113
|
+
replace: If True (default), removes existing OCR elements in the region
|
1114
|
+
before adding new ones. If False, adds new OCR elements without
|
1115
|
+
removing existing ones.
|
1092
1116
|
**ocr_params: Keyword arguments passed to the OCR Manager.
|
1093
1117
|
Common parameters like `engine`, `languages`, `min_confidence`,
|
1094
1118
|
`device`, and `resolution` (for image rendering) should be
|
@@ -1098,17 +1122,33 @@ class Region(DirectionalMixin):
|
|
1098
1122
|
an `options` object (e.g., `options=EasyOCROptions(...)`).
|
1099
1123
|
|
1100
1124
|
Returns:
|
1101
|
-
|
1125
|
+
Self for method chaining.
|
1102
1126
|
"""
|
1103
1127
|
# Ensure OCRManager is available
|
1104
1128
|
if not hasattr(self.page._parent, "_ocr_manager") or self.page._parent._ocr_manager is None:
|
1105
1129
|
logger.error("OCRManager not available on parent PDF. Cannot apply OCR to region.")
|
1106
|
-
return
|
1130
|
+
return self
|
1131
|
+
|
1132
|
+
# If replace is True, find and remove existing OCR elements in this region
|
1133
|
+
if replace:
|
1134
|
+
logger.info(f"Region {self.bbox}: Removing existing OCR elements before applying new OCR.")
|
1135
|
+
# Find all OCR elements in this region
|
1136
|
+
ocr_selector = "text[source=ocr]"
|
1137
|
+
ocr_elements = self.find_all(ocr_selector)
|
1138
|
+
|
1139
|
+
if ocr_elements:
|
1140
|
+
logger.info(f"Region {self.bbox}: Found {len(ocr_elements)} existing OCR elements to remove.")
|
1141
|
+
# Remove these elements from their page
|
1142
|
+
removed_count = ocr_elements.remove()
|
1143
|
+
logger.info(f"Region {self.bbox}: Removed {removed_count} OCR elements.")
|
1144
|
+
else:
|
1145
|
+
logger.info(f"Region {self.bbox}: No existing OCR elements found to remove.")
|
1146
|
+
|
1107
1147
|
ocr_mgr = self.page._parent._ocr_manager
|
1108
1148
|
|
1109
1149
|
# Determine rendering resolution from parameters
|
1110
1150
|
final_resolution = ocr_params.get("resolution")
|
1111
|
-
if final_resolution is None and hasattr(self.page,
|
1151
|
+
if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
|
1112
1152
|
final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
|
1113
1153
|
elif final_resolution is None:
|
1114
1154
|
final_resolution = 150
|
@@ -1123,11 +1163,11 @@ class Region(DirectionalMixin):
|
|
1123
1163
|
)
|
1124
1164
|
if not region_image:
|
1125
1165
|
logger.error("Failed to render region to image for OCR.")
|
1126
|
-
return
|
1166
|
+
return self
|
1127
1167
|
logger.debug(f"Region rendered to image size: {region_image.size}")
|
1128
1168
|
except Exception as e:
|
1129
1169
|
logger.error(f"Error rendering region to image for OCR: {e}", exc_info=True)
|
1130
|
-
return
|
1170
|
+
return self
|
1131
1171
|
|
1132
1172
|
# Prepare args for the OCR Manager
|
1133
1173
|
manager_args = {
|
@@ -1148,11 +1188,11 @@ class Region(DirectionalMixin):
|
|
1148
1188
|
logger.error(
|
1149
1189
|
f"OCRManager returned unexpected type for single region image: {type(results)}"
|
1150
1190
|
)
|
1151
|
-
return
|
1191
|
+
return self
|
1152
1192
|
logger.debug(f"Region OCR processing returned {len(results)} results.")
|
1153
1193
|
except Exception as e:
|
1154
1194
|
logger.error(f"Error during OCRManager processing for region: {e}", exc_info=True)
|
1155
|
-
return
|
1195
|
+
return self
|
1156
1196
|
|
1157
1197
|
# Convert results to TextElements
|
1158
1198
|
scale_x = self.width / region_image.width if region_image.width > 0 else 1.0
|
@@ -1191,6 +1231,7 @@ class Region(DirectionalMixin):
|
|
1191
1231
|
ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
|
1192
1232
|
element_data["_char_dicts"] = [ocr_char_dict]
|
1193
1233
|
from natural_pdf.elements.text import TextElement
|
1234
|
+
|
1194
1235
|
elem = TextElement(element_data, self.page)
|
1195
1236
|
created_elements.append(elem)
|
1196
1237
|
self.page._element_mgr.add_element(elem, element_type="words")
|
@@ -1692,7 +1733,7 @@ class Region(DirectionalMixin):
|
|
1692
1733
|
def correct_ocr(
|
1693
1734
|
self,
|
1694
1735
|
correction_callback: Callable[[Any], Optional[str]],
|
1695
|
-
) -> "Region":
|
1736
|
+
) -> "Region": # Return self for chaining
|
1696
1737
|
"""
|
1697
1738
|
Applies corrections to OCR-generated text elements within this region
|
1698
1739
|
using a user-provided callback function.
|
@@ -1718,13 +1759,57 @@ class Region(DirectionalMixin):
|
|
1718
1759
|
"""
|
1719
1760
|
# Find OCR elements specifically within this region
|
1720
1761
|
# Note: We typically want to correct even if the element falls in an excluded area
|
1721
|
-
target_elements = self.find_all(selector="text[source
|
1762
|
+
target_elements = self.find_all(selector="text[source=ocr]", apply_exclusions=False)
|
1722
1763
|
|
1723
1764
|
# Delegate to the utility function
|
1724
1765
|
_apply_ocr_correction_to_elements(
|
1725
|
-
elements=target_elements,
|
1766
|
+
elements=target_elements, # Pass the ElementCollection directly
|
1726
1767
|
correction_callback=correction_callback,
|
1727
|
-
caller_info=f"Region({self.bbox})",
|
1768
|
+
caller_info=f"Region({self.bbox})", # Pass caller info
|
1728
1769
|
)
|
1729
1770
|
|
1730
|
-
return self
|
1771
|
+
return self # Return self for chaining
|
1772
|
+
|
1773
|
+
# --- Classification Mixin Implementation --- #
|
1774
|
+
def _get_classification_manager(self) -> "ClassificationManager":
|
1775
|
+
if not hasattr(self, 'page') or not hasattr(self.page, 'pdf') or not hasattr(self.page.pdf, 'get_manager'):
|
1776
|
+
raise AttributeError("ClassificationManager cannot be accessed: Parent Page, PDF, or get_manager method missing.")
|
1777
|
+
try:
|
1778
|
+
# Use the PDF's manager registry accessor via page
|
1779
|
+
return self.page.pdf.get_manager('classification')
|
1780
|
+
except (ValueError, RuntimeError, AttributeError) as e:
|
1781
|
+
# Wrap potential errors from get_manager for clarity
|
1782
|
+
raise AttributeError(f"Failed to get ClassificationManager from PDF via Page: {e}") from e
|
1783
|
+
|
1784
|
+
def _get_classification_content(self, model_type: str, **kwargs) -> Union[str, "Image"]: # Use "Image" for lazy import
|
1785
|
+
if model_type == 'text':
|
1786
|
+
text_content = self.extract_text(layout=False) # Simple join for classification
|
1787
|
+
if not text_content or text_content.isspace():
|
1788
|
+
raise ValueError("Cannot classify region with 'text' model: No text content found.")
|
1789
|
+
return text_content
|
1790
|
+
elif model_type == 'vision':
|
1791
|
+
# Get resolution from manager/kwargs if possible, else default
|
1792
|
+
# We access manager via the method to ensure it's available
|
1793
|
+
manager = self._get_classification_manager()
|
1794
|
+
default_resolution = 150 # Manager doesn't store default res, set here
|
1795
|
+
# Note: classify() passes resolution via **kwargs if user specifies
|
1796
|
+
resolution = kwargs.get('resolution', default_resolution) if 'kwargs' in locals() else default_resolution
|
1797
|
+
|
1798
|
+
img = self.to_image(
|
1799
|
+
resolution=resolution,
|
1800
|
+
include_highlights=False, # No highlights for classification input
|
1801
|
+
crop_only=True # Just the region content
|
1802
|
+
)
|
1803
|
+
if img is None:
|
1804
|
+
raise ValueError("Cannot classify region with 'vision' model: Failed to render image.")
|
1805
|
+
return img
|
1806
|
+
else:
|
1807
|
+
raise ValueError(f"Unsupported model_type for classification: {model_type}")
|
1808
|
+
|
1809
|
+
def _get_metadata_storage(self) -> Dict[str, Any]:
|
1810
|
+
# Ensure metadata exists
|
1811
|
+
if not hasattr(self, 'metadata') or self.metadata is None:
|
1812
|
+
self.metadata = {}
|
1813
|
+
return self.metadata
|
1814
|
+
|
1815
|
+
# --- End Classification Mixin Implementation --- #
|
natural_pdf/elements/text.py
CHANGED
@@ -274,9 +274,12 @@ class TextElement(Element):
|
|
274
274
|
|
275
275
|
return False
|
276
276
|
|
277
|
-
def __repr__(self) -> str:
|
277
|
+
def __repr__(self) -> str:
|
278
278
|
"""String representation of the text element."""
|
279
|
-
|
279
|
+
if self.text:
|
280
|
+
preview = self.text[:10] + "..." if len(self.text) > 10 else self.text
|
281
|
+
else:
|
282
|
+
preview = "..."
|
280
283
|
font_style = []
|
281
284
|
if self.bold:
|
282
285
|
font_style.append("bold")
|
@@ -0,0 +1,61 @@
|
|
1
|
+
import abc
|
2
|
+
import logging
|
3
|
+
from typing import Union, List, TYPE_CHECKING
|
4
|
+
|
5
|
+
if TYPE_CHECKING:
|
6
|
+
from natural_pdf.core.pdf import PDF
|
7
|
+
from natural_pdf.collections.pdf_collection import PDFCollection
|
8
|
+
|
9
|
+
logger = logging.getLogger(__name__)
|
10
|
+
|
11
|
+
|
12
|
+
class FinetuneExporter(abc.ABC):
    """
    Common interface for exporters that write model fine-tuning datasets.

    Concrete exporters must implement both ``__init__`` and ``export``.
    """

    @abc.abstractmethod
    def __init__(self, **kwargs):
        """
        Set up the exporter; subclasses define their own format-specific options.
        """

    @abc.abstractmethod
    def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
        """
        Write the data from ``source`` into ``output_dir`` in the layout
        expected for fine-tuning a particular model type.

        Args:
            source: A PDF object, a PDFCollection, or a list of PDF objects.
            output_dir: Directory that receives the exported files.
            **kwargs: Extra export-time arguments.
        """

    def _resolve_source_pdfs(
        self, source: Union["PDF", "PDFCollection", List["PDF"]]
    ) -> List["PDF"]:
        """
        Normalize any accepted ``source`` value into a list of PDF objects.

        A collection's ``pdfs`` attribute and an input list are returned as-is
        (not copied). An empty result is allowed but logged as a warning.

        Raises:
            TypeError: If ``source`` is not a PDF, a PDFCollection, or a list
                made up only of PDF objects.
        """
        # Imported here rather than at module level to avoid a circular import.
        from natural_pdf.collections.pdf_collection import PDFCollection
        from natural_pdf.core.pdf import PDF

        if isinstance(source, PDF):
            resolved: List["PDF"] = [source]
        elif isinstance(source, PDFCollection):
            resolved = source.pdfs
        elif isinstance(source, list) and all(isinstance(item, PDF) for item in source):
            resolved = source
        else:
            raise TypeError(
                f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
            )

        if not resolved:
            logger.warning("No PDF documents provided in the source.")

        return resolved
|
@@ -0,0 +1,345 @@
|
|
1
|
+
import os
|
2
|
+
import logging
|
3
|
+
import random
|
4
|
+
import shutil
|
5
|
+
from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
|
6
|
+
from tqdm import tqdm
|
7
|
+
|
8
|
+
from natural_pdf.exporters.base import FinetuneExporter
|
9
|
+
|
10
|
+
# Need to import this utility
|
11
|
+
from natural_pdf.utils.identifiers import generate_short_path_hash
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from natural_pdf.core.pdf import PDF
|
15
|
+
from natural_pdf.collections.pdf_collection import PDFCollection
|
16
|
+
from natural_pdf.elements.text import TextElement
|
17
|
+
|
18
|
+
logger = logging.getLogger(__name__)
|
19
|
+
|
20
|
+
DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]" # Match manifest-import etc.
|
21
|
+
|
22
|
+
|
23
|
+
class PaddleOCRRecognitionExporter(FinetuneExporter):
    """
    Exports data for fine-tuning a PaddleOCR text recognition model.

    Creates a directory structure with cropped text images and label files
    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
    """

    def __init__(
        self,
        resolution: int = 150,
        padding: int = 2,
        selector: Optional[str] = None,
        corrected_only: bool = False,
        split_ratio: Optional[float] = 0.9,
        include_guide: bool = True,
        random_seed: Optional[int] = 42,
    ):
        """
        Initialize the PaddleOCR Recognition Exporter.

        Args:
            resolution: DPI resolution for rendering text region images (default: 150).
            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
            selector: CSS-like selector to filter which TextElements to export.
                      If None and corrected_only is False, all 'text' elements are considered.
            corrected_only: If True, overrides selector and exports only elements likely
                            originating from a correction manifest, i.e. whose source
                            starts with "manifest" (selector="text[source^=manifest]").
                            (default: False).
            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
                         If None (or not strictly between 0 and 1), a single `label.txt`
                         file is created instead (default: 0.9).
            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
                           in the output directory (default: True).
            random_seed: Seed used for the train/val split shuffle, ensuring reproducibility
                         (default: 42). If None, the shuffle uses the global RNG state.
        """
        if corrected_only:
            if selector:
                logger.warning(
                    f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
                    f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
                )
            self.selector = DEFAULT_SELECTOR_CORRECTED
        else:
            # Default to all text elements if nothing else specified
            self.selector = selector or "text"

        self.resolution = resolution
        self.padding = padding
        self.split_ratio = split_ratio
        self.include_guide = include_guide
        self.random_seed = random_seed

        logger.info(
            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
        )

    @staticmethod
    def _text_skip_reason(element_text) -> Optional[str]:
        """Return why `element_text` cannot be used as a label, or None if it is usable."""
        if not element_text:
            return "empty text"
        if not isinstance(element_text, str):
            return "invalid text type"
        if "\n" in element_text:
            return "contains newline"
        # Label files are one tab-separated record per line, so an embedded tab
        # or carriage return would corrupt the record just like a newline.
        if "\t" in element_text or "\r" in element_text:
            return "contains tab or carriage return"
        return None

    @staticmethod
    def _write_label_file(path: str, entries: List[Tuple[str, str]]) -> None:
        """Write one tab-separated `image_path<TAB>text` line per entry to `path` (UTF-8)."""
        with open(path, "w", encoding="utf-8") as f_out:
            f_out.writelines(f"{img_path}\t{text}\n" for img_path, text in entries)

    def export(
        self,
        source: Union["PDF", "PDFCollection", List["PDF"]],
        output_dir: str,
        **kwargs,  # Allow for potential future args
    ):
        """
        Exports text elements from the source PDF(s) to the specified output directory
        in PaddleOCR text recognition format.

        Writes `images/*.png`, `dict.txt`, and either `train.txt` + `val.txt` or a
        single `label.txt`. Returns None; when no element could be exported the
        method logs an error and returns before writing any label/dictionary file.

        Args:
            source: The PDF object, PDFCollection, or list of PDF objects to process.
            output_dir: The path to the directory where the exported files will be saved.
                        The directory will be created if it doesn't exist.
            **kwargs: Optional keyword arguments (currently unused).

        Raises:
            OSError: If the output directories cannot be created.
            Exception: Re-raised if writing the dictionary or label files fails.
        """
        # --- 1. Setup and Validation ---
        pdfs_to_process = self._resolve_source_pdfs(source)
        if not pdfs_to_process:
            logger.error("No valid PDF sources found. Aborting export.")
            return

        try:
            os.makedirs(output_dir, exist_ok=True)
            os.makedirs(os.path.join(output_dir, "images"), exist_ok=True)
        except OSError as e:
            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
            raise

        # --- 2. Collect Elements and Render Images ---
        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
        char_set: Set[str] = set()
        elements_processed = 0
        elements_skipped = 0

        logger.info(
            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
        )

        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
            # The path is needed both for the filename hash and for log messages.
            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
                continue
            pdf_hash = generate_short_path_hash(pdf.path)
            try:
                if not hasattr(pdf, "find_all"):
                    logger.warning(
                        f"PDF object {pdf.path} does not have find_all method. Skipping."
                    )
                    continue

                # Usually want all text, even if excluded
                elements = pdf.find_all(self.selector, apply_exclusions=False)
                if not elements:
                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
                    continue

                progress = tqdm(
                    elements,
                    desc=f"Exporting '{os.path.basename(pdf.path)}'",
                    leave=False,
                    position=1,
                )
                for i, element in enumerate(progress):
                    # Ensure the element offers what is used below; to_image is
                    # not checked because it is called on the expanded region.
                    if not (
                        hasattr(element, "page")
                        and hasattr(element, "text")
                        and hasattr(element, "expand")
                    ):
                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
                        elements_skipped += 1
                        continue

                    element_text = element.text
                    reason = self._text_skip_reason(element_text)
                    if reason is not None:
                        logger.debug(
                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
                        )
                        elements_skipped += 1
                        continue

                    # Fall back to index 0 when the page does not expose one.
                    page_index = getattr(element.page, "index", 0)
                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
                    relative_image_path = os.path.join("images", image_filename)
                    absolute_image_path = os.path.join(output_dir, relative_image_path)

                    try:
                        # Expand region, render, and save image
                        region = element.expand(self.padding)
                        img = region.to_image(
                            resolution=self.resolution, crop_only=True, include_highlights=False
                        )
                        if img is None:
                            # to_image returns None for degenerate regions; fail
                            # with a clear cause instead of an AttributeError.
                            raise ValueError("region could not be rendered to an image")
                        img.save(absolute_image_path, "PNG")

                        # Use forward slashes for labels regardless of platform
                        labels.append(
                            (relative_image_path.replace(os.path.sep, "/"), element_text)
                        )
                        char_set.update(element_text)
                        elements_processed += 1

                    except Exception as e:
                        page_num_str = getattr(element.page, "number", "N/A")
                        logger.error(
                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
                            exc_info=False,  # Keep log cleaner
                        )
                        elements_skipped += 1

            except Exception as e:
                # Best effort: log and continue with the remaining PDFs.
                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)

        if elements_processed == 0:
            logger.error(
                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
            )
            # Directories created above are intentionally left in place.
            return

        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")

        # --- 3. Generate Dictionary File (`dict.txt`) ---
        dict_path = os.path.join(output_dir, "dict.txt")
        try:
            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
            # No special tokens (e.g. <UNK>, <BLK>) are added; whether a base
            # model needs them is model-specific. Only characters found are written.
            # char_set cannot contain line breaks: such texts were skipped above.
            sorted_chars = sorted(char_set, reverse=True)
            with open(dict_path, "w", encoding="utf-8") as f_dict:
                f_dict.writelines(f"{char}\n" for char in sorted_chars)
            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
        except Exception as e:
            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
            raise  # Re-raise as this is critical

        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
        if self.split_ratio is not None and 0 < self.split_ratio < 1:
            if self.random_seed is not None:
                # A private generator yields the same order as seeding the
                # module RNG, without reseeding global state for the caller.
                random.Random(self.random_seed).shuffle(labels)
            else:
                random.shuffle(labels)
            split_index = int(len(labels) * self.split_ratio)
            train_labels = labels[:split_index]
            val_labels = labels[split_index:]

            try:
                train_path = os.path.join(output_dir, "train.txt")
                self._write_label_file(train_path, train_labels)
                logger.info(
                    f"Created training label file with {len(train_labels)} entries: {train_path}"
                )

                val_path = os.path.join(output_dir, "val.txt")
                self._write_label_file(val_path, val_labels)
                logger.info(
                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
                )
            except Exception as e:
                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
                raise
        else:
            # Create a single label file
            label_path = os.path.join(output_dir, "label.txt")
            try:
                self._write_label_file(label_path, labels)
                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
            except Exception as e:
                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
                raise

        # --- 5. Include Guide Notebook ---
        if self.include_guide:
            self._copy_guide_notebook(output_dir)

        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")

    def _copy_guide_notebook(self, output_dir: str):
        """
        Locates, converts (md->ipynb), and copies the guide notebook.

        Best effort: missing optional dependencies, a missing template, or a
        conversion failure are logged and the method returns without raising.
        """
        try:
            # Optional dependencies, imported lazily
            import jupytext
            from nbformat import write as write_notebook
        except ImportError:
            logger.warning(
                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
            )
            return

        try:
            # Locate the template .md file relative to this module
            exporter_dir = os.path.dirname(os.path.abspath(__file__))
            # Go up one level (exporters -> natural_pdf) then down to templates/finetune
            template_dir = os.path.abspath(
                os.path.join(exporter_dir, "..", "templates", "finetune")
            )
            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")

            if not os.path.exists(template_md_path):
                logger.error(
                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
                )
                # Fallback: path relative to the current working directory,
                # which works when running from the project root.
                alt_template_path = os.path.abspath(
                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
                )
                if os.path.exists(alt_template_path):
                    template_md_path = alt_template_path
                    logger.info(f"Found guide template at alternate path: {template_md_path}")
                else:
                    logger.error(
                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
                    )
                    return

            # Convert Markdown to a notebook object using jupytext
            logger.debug(f"Reading guide template from: {template_md_path}")
            notebook = jupytext.read(template_md_path)

            # Write the notebook object to the output .ipynb file
            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
                write_notebook(notebook, f_nb)

            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")

        except Exception as e:
            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)
|