natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_factory.py
CHANGED
@@ -13,14 +13,14 @@ class OCRFactory:
|
|
13
13
|
@staticmethod
|
14
14
|
def create_engine(engine_type: str, **kwargs) -> OCREngine:
|
15
15
|
"""Create and return an OCR engine instance.
|
16
|
-
|
16
|
+
|
17
17
|
Args:
|
18
18
|
engine_type: One of 'surya', 'easyocr', 'paddle'
|
19
19
|
**kwargs: Arguments to pass to the engine constructor
|
20
|
-
|
20
|
+
|
21
21
|
Returns:
|
22
22
|
An initialized OCR engine
|
23
|
-
|
23
|
+
|
24
24
|
Raises:
|
25
25
|
ImportError: If the required dependencies aren't installed
|
26
26
|
ValueError: If the engine_type is unknown
|
@@ -28,72 +28,83 @@ class OCRFactory:
|
|
28
28
|
if engine_type == "surya":
|
29
29
|
try:
|
30
30
|
from .engine_surya import SuryaOCREngine
|
31
|
+
|
31
32
|
return SuryaOCREngine(**kwargs)
|
32
33
|
except ImportError:
|
33
|
-
raise ImportError(
|
34
|
-
|
34
|
+
raise ImportError(
|
35
|
+
"Surya engine requires the 'surya' package. " "Install with: pip install surya"
|
36
|
+
)
|
35
37
|
elif engine_type == "easyocr":
|
36
38
|
try:
|
37
39
|
from .engine_easyocr import EasyOCREngine
|
40
|
+
|
38
41
|
return EasyOCREngine(**kwargs)
|
39
42
|
except ImportError:
|
40
|
-
raise ImportError(
|
41
|
-
|
43
|
+
raise ImportError(
|
44
|
+
"EasyOCR engine requires the 'easyocr' package. "
|
45
|
+
"Install with: pip install easyocr"
|
46
|
+
)
|
42
47
|
elif engine_type == "paddle":
|
43
48
|
try:
|
44
49
|
from .engine_paddle import PaddleOCREngine
|
50
|
+
|
45
51
|
return PaddleOCREngine(**kwargs)
|
46
52
|
except ImportError:
|
47
|
-
raise ImportError(
|
48
|
-
|
53
|
+
raise ImportError(
|
54
|
+
"PaddleOCR engine requires 'paddleocr' and 'paddlepaddle'. "
|
55
|
+
"Install with: pip install paddleocr paddlepaddle"
|
56
|
+
)
|
49
57
|
else:
|
50
58
|
raise ValueError(f"Unknown engine type: {engine_type}")
|
51
|
-
|
59
|
+
|
52
60
|
@staticmethod
|
53
61
|
def list_available_engines() -> Dict[str, bool]:
|
54
62
|
"""Returns a dictionary of engine names and their availability status."""
|
55
63
|
engines = {}
|
56
|
-
|
64
|
+
|
57
65
|
# Check Surya
|
58
66
|
try:
|
59
67
|
engines["surya"] = importlib.util.find_spec("surya") is not None
|
60
68
|
except ImportError:
|
61
69
|
engines["surya"] = False
|
62
|
-
|
70
|
+
|
63
71
|
# Check EasyOCR
|
64
72
|
try:
|
65
73
|
engines["easyocr"] = importlib.util.find_spec("easyocr") is not None
|
66
74
|
except ImportError:
|
67
75
|
engines["easyocr"] = False
|
68
|
-
|
76
|
+
|
69
77
|
# Check PaddleOCR
|
70
78
|
try:
|
71
|
-
paddle =
|
79
|
+
paddle = (
|
80
|
+
importlib.util.find_spec("paddle") is not None
|
81
|
+
or importlib.util.find_spec("paddlepaddle") is not None
|
82
|
+
)
|
72
83
|
paddleocr = importlib.util.find_spec("paddleocr") is not None
|
73
84
|
engines["paddle"] = paddle and paddleocr
|
74
85
|
except ImportError:
|
75
86
|
engines["paddle"] = False
|
76
|
-
|
87
|
+
|
77
88
|
return engines
|
78
|
-
|
89
|
+
|
79
90
|
@staticmethod
|
80
91
|
def get_recommended_engine(**kwargs) -> OCREngine:
|
81
92
|
"""Returns the best available OCR engine based on what's installed.
|
82
|
-
|
93
|
+
|
83
94
|
First tries engines in order of preference: EasyOCR, Paddle, Surya.
|
84
95
|
If none are available, raises ImportError with installation instructions.
|
85
|
-
|
96
|
+
|
86
97
|
Args:
|
87
98
|
**kwargs: Arguments to pass to the engine constructor
|
88
|
-
|
99
|
+
|
89
100
|
Returns:
|
90
101
|
The best available OCR engine instance
|
91
|
-
|
102
|
+
|
92
103
|
Raises:
|
93
104
|
ImportError: If no engines are available
|
94
105
|
"""
|
95
106
|
available = OCRFactory.list_available_engines()
|
96
|
-
|
107
|
+
|
97
108
|
# Try engines in order of recommendation
|
98
109
|
if available.get("easyocr", False):
|
99
110
|
logger.info("Using EasyOCR engine (recommended)")
|
@@ -104,11 +115,11 @@ class OCRFactory:
|
|
104
115
|
elif available.get("surya", False):
|
105
116
|
logger.info("Using Surya OCR engine")
|
106
117
|
return OCRFactory.create_engine("surya", **kwargs)
|
107
|
-
|
118
|
+
|
108
119
|
# If we get here, no engines are available
|
109
120
|
raise ImportError(
|
110
121
|
"No OCR engines available. Please install at least one of: \n"
|
111
122
|
"- EasyOCR (recommended): pip install easyocr\n"
|
112
123
|
"- PaddleOCR: pip install paddleocr paddlepaddle\n"
|
113
124
|
"- Surya OCR: pip install surya"
|
114
|
-
)
|
125
|
+
)
|
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -65,7 +65,7 @@ class OCRManager:
|
|
65
65
|
device: Optional[str] = None,
|
66
66
|
detect_only: bool = False,
|
67
67
|
# --- Engine-Specific Options ---
|
68
|
-
options: Optional[Any] = None,
|
68
|
+
options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
|
69
69
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
70
70
|
"""
|
71
71
|
Applies OCR to a single image or a batch of images.
|
@@ -100,7 +100,7 @@ class OCRManager:
|
|
100
100
|
if not is_batch and not isinstance(images, Image.Image):
|
101
101
|
raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
102
102
|
|
103
|
-
# --- Determine Engine ---
|
103
|
+
# --- Determine Engine ---
|
104
104
|
selected_engine_name = (engine or "easyocr").lower()
|
105
105
|
if selected_engine_name not in self.ENGINE_REGISTRY:
|
106
106
|
raise ValueError(
|
@@ -108,35 +108,39 @@ class OCRManager:
|
|
108
108
|
)
|
109
109
|
logger.debug(f"Selected engine: '{selected_engine_name}'")
|
110
110
|
|
111
|
-
# --- Prepare Options ---
|
111
|
+
# --- Prepare Options ---
|
112
112
|
final_options = copy.deepcopy(options) if options is not None else None
|
113
|
-
|
113
|
+
|
114
114
|
# Type check options object if provided
|
115
115
|
if final_options is not None:
|
116
|
-
options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
|
116
|
+
options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
|
117
|
+
"options_class", BaseOCROptions
|
118
|
+
)
|
117
119
|
if not isinstance(final_options, options_class):
|
118
|
-
|
120
|
+
# Allow dicts to be passed directly too, assuming engine handles them
|
119
121
|
if not isinstance(final_options, dict):
|
120
|
-
|
122
|
+
raise TypeError(
|
121
123
|
f"Provided options type '{type(final_options).__name__}' is not compatible with engine '{selected_engine_name}'. Expected '{options_class.__name__}' or dict."
|
122
124
|
)
|
123
125
|
|
124
|
-
# --- Get Engine Instance and Process ---
|
126
|
+
# --- Get Engine Instance and Process ---
|
125
127
|
try:
|
126
128
|
engine_instance = self._get_engine_instance(selected_engine_name)
|
127
129
|
processing_mode = "batch" if is_batch else "single image"
|
128
130
|
logger.info(f"Processing {processing_mode} with engine '{selected_engine_name}'...")
|
129
|
-
logger.debug(
|
131
|
+
logger.debug(
|
132
|
+
f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
|
133
|
+
)
|
130
134
|
|
131
135
|
# Call the engine's process_image, passing common args and options object
|
132
136
|
# **ASSUMPTION**: Engine process_image signatures are updated to accept these common args.
|
133
137
|
results = engine_instance.process_image(
|
134
|
-
images=images,
|
138
|
+
images=images,
|
135
139
|
languages=languages,
|
136
140
|
min_confidence=min_confidence,
|
137
141
|
device=device,
|
138
142
|
detect_only=detect_only,
|
139
|
-
options=final_options
|
143
|
+
options=final_options,
|
140
144
|
)
|
141
145
|
|
142
146
|
# Log result summary based on mode
|
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
13
13
|
@dataclass
|
14
14
|
class BaseOCROptions:
|
15
15
|
"""Base class for OCR engine options."""
|
16
|
+
|
16
17
|
extra_args: Dict[str, Any] = field(default_factory=dict)
|
17
18
|
|
18
19
|
|
@@ -20,6 +21,7 @@ class BaseOCROptions:
|
|
20
21
|
@dataclass
|
21
22
|
class EasyOCROptions(BaseOCROptions):
|
22
23
|
"""Specific options for the EasyOCR engine."""
|
24
|
+
|
23
25
|
model_storage_directory: Optional[str] = None
|
24
26
|
user_network_directory: Optional[str] = None
|
25
27
|
recog_network: str = "english_g2"
|
@@ -64,6 +66,7 @@ class EasyOCROptions(BaseOCROptions):
|
|
64
66
|
@dataclass
|
65
67
|
class PaddleOCROptions(BaseOCROptions):
|
66
68
|
"""Specific options for the PaddleOCR engine."""
|
69
|
+
|
67
70
|
use_angle_cls: bool = True
|
68
71
|
use_gpu: Optional[bool] = None
|
69
72
|
gpu_mem: int = 500
|
@@ -90,6 +93,7 @@ class PaddleOCROptions(BaseOCROptions):
|
|
90
93
|
|
91
94
|
def __post_init__(self):
|
92
95
|
pass
|
96
|
+
|
93
97
|
# if self.use_gpu is None:
|
94
98
|
# if self.device and "cuda" in self.device.lower():
|
95
99
|
# self.use_gpu = True
|
@@ -102,6 +106,7 @@ class PaddleOCROptions(BaseOCROptions):
|
|
102
106
|
@dataclass
|
103
107
|
class SuryaOCROptions(BaseOCROptions):
|
104
108
|
"""Specific options for the Surya OCR engine."""
|
109
|
+
|
105
110
|
# Currently, Surya example shows languages passed at prediction time.
|
106
111
|
pass
|
107
112
|
|
natural_pdf/ocr/utils.py
CHANGED
@@ -10,51 +10,71 @@ if TYPE_CHECKING:
|
|
10
10
|
|
11
11
|
logger = logging.getLogger(__name__)
|
12
12
|
|
13
|
+
|
13
14
|
def _apply_ocr_correction_to_elements(
|
14
15
|
elements: Iterable["Element"],
|
15
16
|
correction_callback: Callable[[Any], Optional[str]],
|
17
|
+
caller_info: str = "Utility",
|
16
18
|
) -> None:
|
17
19
|
"""
|
18
|
-
Applies correction callback to a list of elements in place,
|
20
|
+
Applies OCR correction callback to a list of elements in place,
|
19
21
|
showing a progress bar.
|
20
22
|
|
21
|
-
Iterates through elements,
|
22
|
-
element.text if a new string is returned.
|
23
|
-
|
23
|
+
Iterates through elements, checks if source starts with 'ocr', calls
|
24
|
+
the callback, and updates element.text if a new string is returned.
|
25
|
+
|
24
26
|
Args:
|
25
27
|
elements: An iterable of Element objects.
|
26
28
|
correction_callback: A function accepting an element and returning
|
27
29
|
Optional[str] (new text or None).
|
30
|
+
caller_info: String identifying the calling context for logs.
|
28
31
|
"""
|
32
|
+
if not callable(correction_callback):
|
33
|
+
# Raise error here so individual methods don't need to repeat the check
|
34
|
+
raise TypeError("`correction_callback` must be a callable function.")
|
35
|
+
|
36
|
+
if not elements:
|
37
|
+
logger.warning(f"{caller_info}: No elements provided for correction.")
|
38
|
+
return
|
39
|
+
|
29
40
|
corrections_applied = 0
|
30
41
|
elements_checked = 0
|
31
42
|
|
32
43
|
# Prepare the iterable with tqdm
|
33
|
-
element_iterable = tqdm(elements, desc=f"Correcting OCR", unit="element")
|
44
|
+
element_iterable = tqdm(elements, desc=f"Correcting OCR ({caller_info})", unit="element")
|
34
45
|
|
35
46
|
for element in element_iterable:
|
36
47
|
# Check if the element is likely from OCR and has text attribute
|
37
|
-
element_source = getattr(element,
|
38
|
-
if
|
48
|
+
element_source = getattr(element, "source", None)
|
49
|
+
if (
|
50
|
+
isinstance(element_source, str)
|
51
|
+
and element_source.startswith("ocr")
|
52
|
+
and hasattr(element, "text")
|
53
|
+
):
|
39
54
|
elements_checked += 1
|
40
|
-
current_text = getattr(element,
|
55
|
+
current_text = getattr(element, "text") # Already checked hasattr
|
41
56
|
|
42
57
|
new_text = correction_callback(element)
|
43
58
|
|
44
59
|
if new_text is not None:
|
45
60
|
if new_text != current_text:
|
46
|
-
element.text = new_text
|
61
|
+
element.text = new_text # Update in place
|
47
62
|
corrections_applied += 1
|
48
63
|
|
49
|
-
logger.info(
|
64
|
+
logger.info(
|
65
|
+
f"{caller_info}: OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}"
|
66
|
+
)
|
67
|
+
# No return value needed, modifies elements in place
|
50
68
|
|
51
69
|
|
52
|
-
def direct_ocr_llm(
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
70
|
+
def direct_ocr_llm(
|
71
|
+
element,
|
72
|
+
client,
|
73
|
+
model="",
|
74
|
+
resolution=150,
|
75
|
+
prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc.",
|
76
|
+
padding=2,
|
77
|
+
) -> str:
|
58
78
|
"""Convenience method to directly OCR a region of the page."""
|
59
79
|
|
60
80
|
if isinstance(element, TextElement):
|
@@ -65,34 +85,29 @@ def direct_ocr_llm(element,
|
|
65
85
|
buffered = io.BytesIO()
|
66
86
|
region_img = region.to_image(resolution=resolution, include_highlights=False)
|
67
87
|
region_img.save(buffered, format="PNG")
|
68
|
-
base64_image = base64.b64encode(buffered.getvalue()).decode(
|
88
|
+
base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
69
89
|
|
70
90
|
response = client.chat.completions.create(
|
71
91
|
model=model,
|
72
92
|
messages=[
|
73
93
|
{
|
74
94
|
"role": "system",
|
75
|
-
"content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image."
|
95
|
+
"content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image.",
|
76
96
|
},
|
77
97
|
{
|
78
98
|
"role": "user",
|
79
99
|
"content": [
|
80
|
-
{
|
81
|
-
"type": "text",
|
82
|
-
"text": prompt
|
83
|
-
},
|
100
|
+
{"type": "text", "text": prompt},
|
84
101
|
{
|
85
102
|
"type": "image_url",
|
86
|
-
"image_url": {
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
}
|
92
|
-
]
|
103
|
+
"image_url": {"url": f"data:image/png;base64,{base64_image}"},
|
104
|
+
},
|
105
|
+
],
|
106
|
+
},
|
107
|
+
],
|
93
108
|
)
|
94
|
-
|
109
|
+
|
95
110
|
corrected = response.choices[0].message.content
|
96
111
|
logger.debug(f"Corrected {region.extract_text()} to {corrected}")
|
97
112
|
|
98
|
-
return corrected
|
113
|
+
return corrected
|