natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/elements/region.py
CHANGED
@@ -11,7 +11,7 @@ from natural_pdf.elements.base import DirectionalMixin
 # Import new utils
 from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout

-from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements
+from natural_pdf.ocr.utils import _apply_ocr_correction_to_elements  # Import utility

 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
@@ -1108,7 +1108,7 @@ class Region(DirectionalMixin):

         # Determine rendering resolution from parameters
         final_resolution = ocr_params.get("resolution")
-        if final_resolution is None and hasattr(self.page,
+        if final_resolution is None and hasattr(self.page, "_parent") and self.page._parent:
             final_resolution = getattr(self.page._parent, "_config", {}).get("resolution", 150)
         elif final_resolution is None:
             final_resolution = 150
@@ -1191,6 +1191,7 @@ class Region(DirectionalMixin):
             ocr_char_dict.setdefault("adv", ocr_char_dict.get("width", 0))
             element_data["_char_dicts"] = [ocr_char_dict]
             from natural_pdf.elements.text import TextElement
+
             elem = TextElement(element_data, self.page)
             created_elements.append(elem)
             self.page._element_mgr.add_element(elem, element_type="words")
@@ -1692,7 +1693,7 @@ class Region(DirectionalMixin):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
-    ) -> "Region":
+    ) -> "Region":  # Return self for chaining
         """
         Applies corrections to OCR-generated text elements within this region
         using a user-provided callback function.
@@ -1722,9 +1723,9 @@ class Region(DirectionalMixin):

         # Delegate to the utility function
         _apply_ocr_correction_to_elements(
-            elements=target_elements,
+            elements=target_elements,  # Pass the ElementCollection directly
             correction_callback=correction_callback,
-            caller_info=f"Region({self.bbox})",
+            caller_info=f"Region({self.bbox})",  # Pass caller info
         )

-        return self
+        return self  # Return self for chaining
natural_pdf/exporters/base.py
ADDED
@@ -0,0 +1,61 @@
+import abc
+import logging
+from typing import Union, List, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+
+logger = logging.getLogger(__name__)
+
+
+class FinetuneExporter(abc.ABC):
+    """
+    Abstract base class for exporting data suitable for fine-tuning models.
+    """
+
+    @abc.abstractmethod
+    def __init__(self, **kwargs):
+        """
+        Initialize the exporter with format-specific options.
+        """
+        pass
+
+    @abc.abstractmethod
+    def export(self, source: Union["PDF", "PDFCollection", List["PDF"]], output_dir: str, **kwargs):
+        """
+        Exports the data from the source PDF(s) to the specified output directory
+        in a format suitable for fine-tuning a specific model type.
+
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+            **kwargs: Additional export-time arguments.
+        """
+        pass
+
+    def _resolve_source_pdfs(
+        self, source: Union["PDF", "PDFCollection", List["PDF"]]
+    ) -> List["PDF"]:
+        """
+        Helper to consistently resolve the input source to a list of PDF objects.
+        """
+        from natural_pdf.core.pdf import PDF  # Avoid circular import at module level
+        from natural_pdf.collections.pdf_collection import PDFCollection  # Avoid circular import
+
+        pdfs_to_process: List["PDF"] = []
+        if isinstance(source, PDF):
+            pdfs_to_process = [source]
+        elif isinstance(source, PDFCollection):
+            pdfs_to_process = source.pdfs
+        elif isinstance(source, list) and all(isinstance(p, PDF) for p in source):
+            pdfs_to_process = source
+        else:
+            raise TypeError(
+                f"Unsupported source type: {type(source)}. Must be PDF, PDFCollection, or List[PDF]."
+            )
+
+        if not pdfs_to_process:
+            logger.warning("No PDF documents provided in the source.")
+
+        return pdfs_to_process
natural_pdf/exporters/paddleocr.py
ADDED
@@ -0,0 +1,345 @@
+import os
+import logging
+import random
+import shutil
+from typing import Union, List, Optional, TYPE_CHECKING, Set, Tuple
+from tqdm import tqdm
+
+from natural_pdf.exporters.base import FinetuneExporter
+
+# Need to import this utility
+from natural_pdf.utils.identifiers import generate_short_path_hash
+
+if TYPE_CHECKING:
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.collections.pdf_collection import PDFCollection
+    from natural_pdf.elements.text import TextElement
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_SELECTOR_CORRECTED = "text[source^=manifest]"  # Match manifest-import etc.
+
+
+class PaddleOCRRecognitionExporter(FinetuneExporter):
+    """
+    Exports data for fine-tuning a PaddleOCR text recognition model.
+
+    Creates a directory structure with cropped text images and label files
+    (`train.txt`, `val.txt`, or `label.txt`) suitable for PaddleOCR training.
+    Optionally includes a Jupyter Notebook guide for fine-tuning on Colab.
+    """
+
+    def __init__(
+        self,
+        resolution: int = 150,
+        padding: int = 2,
+        selector: Optional[str] = None,
+        corrected_only: bool = False,
+        split_ratio: Optional[float] = 0.9,
+        include_guide: bool = True,
+        random_seed: Optional[int] = 42,
+    ):
+        """
+        Initialize the PaddleOCR Recognition Exporter.
+
+        Args:
+            resolution: DPI resolution for rendering text region images (default: 150).
+            padding: Padding (in points) to add around text element bbox before cropping (default: 2).
+            selector: CSS-like selector to filter which TextElements to export.
+                If None and corrected_only is False, all 'text' elements are considered.
+            corrected_only: If True, overrides selector and exports only elements likely
+                originating from a correction manifest (selector="text[source^=manifest]").
+                (default: False).
+            split_ratio: Ratio for splitting data into training/validation sets (e.g., 0.9 for 90% train).
+                If None, creates a single `label.txt` file (default: 0.9).
+            include_guide: If True, includes a template Jupyter Notebook guide for fine-tuning
+                in the output directory (default: True).
+            random_seed: Seed for the random number generator used for train/val split shuffling,
+                ensuring reproducibility (default: 42).
+        """
+        if corrected_only and selector:
+            logger.warning(
+                f"Both 'corrected_only=True' and 'selector=\"{selector}\"' were provided. "
+                f"Using corrected_only=True (selector='{DEFAULT_SELECTOR_CORRECTED}')."
+            )
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif corrected_only:
+            self.selector = DEFAULT_SELECTOR_CORRECTED
+        elif selector:
+            self.selector = selector
+        else:
+            self.selector = "text"  # Default to all text elements if nothing else specified
+
+        self.resolution = resolution
+        self.padding = padding
+        self.split_ratio = split_ratio
+        self.include_guide = include_guide
+        self.random_seed = random_seed
+
+        logger.info(
+            f"Initialized PaddleOCRRecognitionExporter: selector='{self.selector}', resolution={resolution}, "
+            f"padding={padding}, split_ratio={split_ratio}, include_guide={include_guide}"
+        )
+
+    def export(
+        self,
+        source: Union["PDF", "PDFCollection", List["PDF"]],
+        output_dir: str,
+        **kwargs,  # Allow for potential future args
+    ):
+        """
+        Exports text elements from the source PDF(s) to the specified output directory
+        in PaddleOCR text recognition format.
+
+        Args:
+            source: The PDF object, PDFCollection, or list of PDF objects to process.
+            output_dir: The path to the directory where the exported files will be saved.
+                The directory will be created if it doesn't exist.
+            **kwargs: Optional keyword arguments (currently unused).
+        """
+        # --- 1. Setup and Validation ---
+        pdfs_to_process = self._resolve_source_pdfs(source)
+        if not pdfs_to_process:
+            logger.error("No valid PDF sources found. Aborting export.")
+            return
+
+        try:
+            os.makedirs(output_dir, exist_ok=True)
+            images_dir = os.path.join(output_dir, "images")
+            os.makedirs(images_dir, exist_ok=True)
+        except OSError as e:
+            logger.error(f"Failed to create output directory '{output_dir}': {e}", exc_info=True)
+            raise
+
+        # --- 2. Collect Elements and Render Images ---
+        labels: List[Tuple[str, str]] = []  # List of (relative_image_path, text_label)
+        char_set: Set[str] = set()
+        elements_processed = 0
+        elements_skipped = 0
+
+        logger.info(
+            f"Processing {len(pdfs_to_process)} PDF(s) to find elements matching selector: '{self.selector}'"
+        )
+
+        for pdf in tqdm(pdfs_to_process, desc="Processing PDFs"):
+            # Need to ensure pdf.path exists and is string
+            if not hasattr(pdf, "path") or not isinstance(pdf.path, str):
+                logger.warning(f"Skipping PDF object without a valid path attribute: {pdf}")
+                continue
+            pdf_hash = generate_short_path_hash(pdf.path)
+            try:
+                # Find elements using the specified selector
+                # Need to check if pdf has find_all method
+                if not hasattr(pdf, "find_all"):
+                    logger.warning(
+                        f"PDF object {pdf.path} does not have find_all method. Skipping."
+                    )
+                    continue
+
+                elements = pdf.find_all(
+                    self.selector, apply_exclusions=False
+                )  # Usually want all text, even if excluded
+                if not elements:
+                    logger.debug(f"No elements matching '{self.selector}' found in {pdf.path}")
+                    continue
+
+                for i, element in enumerate(
+                    tqdm(
+                        elements,
+                        desc=f"Exporting '{os.path.basename(pdf.path)}'",
+                        leave=False,
+                        position=1,
+                    )
+                ):
+                    # Ensure it's a TextElement with necessary methods/attributes
+                    # Removed check for to_image as it's called after expand()
+                    if not (
+                        hasattr(element, "page")
+                        and hasattr(element, "text")
+                        and hasattr(element, "expand")
+                    ):
+                        logger.warning(f"Skipping invalid/non-text element {i} in {pdf.path}")
+                        elements_skipped += 1
+                        continue
+
+                    element_text = element.text
+                    # Skip elements with no text, non-string text, or newlines
+                    if (
+                        not element_text
+                        or not isinstance(element_text, str)
+                        or "\n" in element_text
+                    ):
+                        if "\n" in str(element_text):
+                            reason = "contains newline"
+                        elif not element_text:
+                            reason = "empty text"
+                        else:
+                            reason = "invalid text type"
+                        logger.debug(
+                            f"Skipping element {i} in {pdf.path} page {getattr(element.page, 'number', 'N/A')} because {reason}."
+                        )
+                        elements_skipped += 1
+                        continue
+
+                    # Use page index if available, otherwise fallback or skip? Fallback to 0 for now.
+                    page_index = getattr(element.page, "index", 0)
+                    image_filename = f"{pdf_hash}_p{page_index}_e{i}.png"
+                    relative_image_path = os.path.join("images", image_filename)
+                    absolute_image_path = os.path.join(output_dir, relative_image_path)
+
+                    try:
+                        # Expand region, render, and save image
+                        region = element.expand(self.padding)
+                        img = region.to_image(
+                            resolution=self.resolution, crop_only=True, include_highlights=False
+                        )
+                        img.save(absolute_image_path, "PNG")
+
+                        # Add to labels and character set
+                        labels.append(
+                            (relative_image_path.replace(os.path.sep, "/"), element_text)
+                        )  # Use forward slashes for labels
+                        char_set.update(element_text)
+                        elements_processed += 1
+
+                    except Exception as e:
+                        page_num_str = getattr(
+                            element.page, "number", "N/A"
+                        )  # Get page number safely
+                        logger.error(
+                            f"Failed to process/save image for element {i} in {pdf.path} page {page_num_str}: {e}",
+                            exc_info=False,  # Keep log cleaner
+                        )
+                        elements_skipped += 1
+
+            except Exception as e:
+                logger.error(f"Failed to process PDF {pdf.path}: {e}", exc_info=True)
+                # Continue with other PDFs if possible
+
+        if elements_processed == 0:
+            logger.error(
+                f"No text elements were successfully processed and exported matching '{self.selector}'. Aborting."
+            )
+            # Clean up potentially created directories? Or leave them empty? Let's leave them.
+            return
+
+        logger.info(f"Processed {elements_processed} text elements, skipped {elements_skipped}.")
+
+        # --- 3. Generate Dictionary File (`dict.txt`) ---
+        dict_path = os.path.join(output_dir, "dict.txt")
+        try:
+            # Log the character set before sorting/writing
+            logger.debug(f"Exporter final char_set before sorting: {repr(char_set)}")
+            # PaddleOCR typically doesn't require special tokens like <UNK> or <BLK> in the dict
+            # for recognition models, but this might depend on the specific base model.
+            # Start with just the characters found.
+            sorted_chars = sorted(list(char_set), reverse=True)
+            with open(dict_path, "w", encoding="utf-8") as f_dict:
+                for char in sorted_chars:
+                    # Ensure we don't write empty strings or just newlines as dictionary entries
+                    if char and char != "\n":
+                        f_dict.write(char + "\n")
+            logger.info(f"Created dictionary file with {len(sorted_chars)} characters: {dict_path}")
+        except Exception as e:
+            logger.error(f"Failed to write dictionary file '{dict_path}': {e}", exc_info=True)
+            raise  # Re-raise as this is critical
+
+        # --- 4. Generate Label Files (`train.txt`, `val.txt` or `label.txt`) ---
+        if self.split_ratio is not None and 0 < self.split_ratio < 1:
+            if self.random_seed is not None:
+                random.seed(self.random_seed)
+            random.shuffle(labels)
+            split_index = int(len(labels) * self.split_ratio)
+            train_labels = labels[:split_index]
+            val_labels = labels[split_index:]
+
+            try:
+                train_path = os.path.join(output_dir, "train.txt")
+                with open(train_path, "w", encoding="utf-8") as f_train:
+                    for img_path, text in train_labels:
+                        f_train.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created training label file with {len(train_labels)} entries: {train_path}"
+                )
+
+                val_path = os.path.join(output_dir, "val.txt")
+                with open(val_path, "w", encoding="utf-8") as f_val:
+                    for img_path, text in val_labels:
+                        f_val.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(
+                    f"Created validation label file with {len(val_labels)} entries: {val_path}"
+                )
+            except Exception as e:
+                logger.error(f"Failed to write train/validation label files: {e}", exc_info=True)
+                raise
+        else:
+            # Create a single label file
+            label_path = os.path.join(output_dir, "label.txt")
+            try:
+                with open(label_path, "w", encoding="utf-8") as f_label:
+                    for img_path, text in labels:
+                        f_label.write(f"{img_path}\t{text}\n")  # Use literal tabs and newlines
+                logger.info(f"Created single label file with {len(labels)} entries: {label_path}")
+            except Exception as e:
+                logger.error(f"Failed to write label file '{label_path}': {e}", exc_info=True)
+                raise
+
+        # --- 5. Include Guide Notebook ---
+        if self.include_guide:
+            self._copy_guide_notebook(output_dir)
+
+        logger.info(f"PaddleOCR recognition data export completed successfully to '{output_dir}'.")
+
+    def _copy_guide_notebook(self, output_dir: str):
+        """Locates, converts (md->ipynb), and copies the guide notebook."""
+        try:
+            # Try importing conversion library
+            import jupytext
+            from nbformat import write as write_notebook
+        except ImportError:
+            logger.warning(
+                "Could not import 'jupytext' or 'nbformat'. Skipping guide notebook generation. "
+                "Install with 'pip install natural-pdf[dev]' or 'pip install jupytext nbformat'."
+            )
+            return
+
+        try:
+            # Locate the template .md file relative to this script
+            exporter_dir = os.path.dirname(os.path.abspath(__file__))
+            # Go up two levels (exporters -> natural_pdf) then down to templates/finetune
+            template_dir = os.path.abspath(
+                os.path.join(exporter_dir, "..", "templates", "finetune")
+            )
+            template_md_path = os.path.join(template_dir, "fine_tune_paddleocr.md")
+            output_ipynb_path = os.path.join(output_dir, "fine_tune_paddleocr.ipynb")
+
+            if not os.path.exists(template_md_path):
+                logger.error(
+                    f"Guide template not found at expected location: {template_md_path}. Trying alternate path."
+                )
+                # Try path relative to workspace root as fallback if run from project root
+                alt_template_path = os.path.abspath(
+                    os.path.join("natural_pdf", "templates", "finetune", "fine_tune_paddleocr.md")
+                )
+                if os.path.exists(alt_template_path):
+                    template_md_path = alt_template_path
+                    logger.info(f"Found guide template at alternate path: {template_md_path}")
+                else:
+                    logger.error(
+                        f"Guide template also not found at: {alt_template_path}. Cannot copy guide."
+                    )
+                    return
+
+            # Convert Markdown to Notebook object using jupytext
+            logger.debug(f"Reading guide template from: {template_md_path}")
+            notebook = jupytext.read(template_md_path)  # Reads md and returns NotebookNode
+
+            # Write the Notebook object to the output .ipynb file
+            logger.debug(f"Writing guide notebook to: {output_ipynb_path}")
+            with open(output_ipynb_path, "w", encoding="utf-8") as f_nb:
+                write_notebook(notebook, f_nb)
+
+            logger.info(f"Copied and converted fine-tuning guide notebook to: {output_ipynb_path}")
+
+        except Exception as e:
+            logger.error(f"Failed to copy/convert guide notebook: {e}", exc_info=True)
natural_pdf/ocr/__init__.py
CHANGED
@@ -11,7 +11,13 @@ logger = logging.getLogger("natural_pdf.ocr")

 # Import the base classes that are always available
 from .engine import OCREngine
-from .ocr_options import
+from .ocr_options import (
+    OCROptions,
+    BaseOCROptions,
+    EasyOCROptions,
+    PaddleOCROptions,
+    SuryaOCROptions,
+)
 from .ocr_manager import OCRManager
 from .ocr_factory import OCRFactory

@@ -22,13 +28,14 @@ __all__ = [
     "OCROptions",
     "BaseOCROptions",
     "EasyOCROptions",
-    "PaddleOCROptions",
+    "PaddleOCROptions",
     "SuryaOCROptions",
     "OCRFactory",
     "get_engine",
-    "list_available_engines"
+    "list_available_engines",
 ]

+
 def get_engine(engine_name=None, **kwargs):
     """
     Get OCR engine by name with graceful handling of missing dependencies.
@@ -40,27 +47,27 @@ def get_engine(engine_name=None, **kwargs):

     Returns:
         OCREngine instance
-
+
     Raises:
         ImportError: If the requested engine's dependencies aren't installed
         ValueError: If the engine_name is unknown
     """
     logger.debug(f"Initializing OCR engine: {engine_name or 'best available'}")
-
+
     try:
         if engine_name is None or engine_name == "default":
             # Use the factory to get the best available engine
             engine = OCRFactory.get_recommended_engine(**kwargs)
             logger.info(f"Using recommended OCR engine: {engine.__class__.__name__}")
             return engine
-
+
         # Use the factory to create a specific engine
         normalized_name = engine_name.lower()
         if normalized_name in ["easyocr", "paddle", "surya"]:
             return OCRFactory.create_engine(normalized_name, **kwargs)
         else:
             raise ValueError(f"Unknown OCR engine: {engine_name}")
-
+
     except ImportError as e:
         logger.error(f"OCR engine dependency error: {e}")
         raise
@@ -68,10 +75,11 @@ def get_engine(engine_name=None, **kwargs):
         logger.error(f"Error initializing OCR engine: {e}")
         raise

+
 def list_available_engines():
     """
     List all available OCR engines.
-
+
     Returns:
         Dict[str, bool]: Dictionary mapping engine names to availability status
     """