natural-pdf 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/categorizing-documents/index.md +168 -0
- docs/data-extraction/index.md +87 -0
- docs/element-selection/index.ipynb +218 -164
- docs/element-selection/index.md +20 -0
- docs/finetuning/index.md +176 -0
- docs/index.md +19 -0
- docs/ocr/index.md +63 -16
- docs/tutorials/01-loading-and-extraction.ipynb +411 -248
- docs/tutorials/02-finding-elements.ipynb +123 -46
- docs/tutorials/03-extracting-blocks.ipynb +24 -19
- docs/tutorials/04-table-extraction.ipynb +17 -12
- docs/tutorials/05-excluding-content.ipynb +37 -32
- docs/tutorials/06-document-qa.ipynb +36 -31
- docs/tutorials/07-layout-analysis.ipynb +45 -40
- docs/tutorials/07-working-with-regions.ipynb +61 -60
- docs/tutorials/08-spatial-navigation.ipynb +76 -71
- docs/tutorials/09-section-extraction.ipynb +160 -155
- docs/tutorials/10-form-field-extraction.ipynb +71 -66
- docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
- docs/tutorials/12-ocr-integration.ipynb +3420 -312
- docs/tutorials/12-ocr-integration.md +68 -106
- docs/tutorials/13-semantic-search.ipynb +641 -251
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/classification/manager.py +343 -0
- natural_pdf/classification/mixin.py +149 -0
- natural_pdf/classification/results.py +62 -0
- natural_pdf/collections/mixins.py +63 -0
- natural_pdf/collections/pdf_collection.py +326 -17
- natural_pdf/core/element_manager.py +73 -4
- natural_pdf/core/page.py +255 -83
- natural_pdf/core/pdf.py +385 -367
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +279 -49
- natural_pdf/elements/region.py +106 -21
- natural_pdf/elements/text.py +5 -2
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/extraction/manager.py +134 -0
- natural_pdf/extraction/mixin.py +246 -0
- natural_pdf/extraction/result.py +37 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +86 -42
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +98 -34
- natural_pdf/ocr/ocr_options.py +38 -10
- natural_pdf/ocr/utils.py +59 -33
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/selectors/parser.py +363 -238
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +420 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +96 -65
- natural_pdf/utils/tqdm_utils.py +43 -0
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +10 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +66 -51
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_manager.py
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
import copy # For deep copying options
|
3
3
|
import logging
|
4
4
|
from typing import Any, Dict, List, Optional, Type, Union
|
5
|
+
import threading # Import threading for lock
|
6
|
+
import time # Import time for timing
|
5
7
|
|
6
8
|
from PIL import Image
|
7
9
|
|
@@ -30,30 +32,68 @@ class OCRManager:
|
|
30
32
|
def __init__(self):
|
31
33
|
"""Initializes the OCR Manager."""
|
32
34
|
self._engine_instances: Dict[str, OCREngine] = {} # Cache for engine instances
|
35
|
+
self._engine_locks: Dict[str, threading.Lock] = {} # Lock per engine type for initialization
|
36
|
+
self._engine_inference_locks: Dict[str, threading.Lock] = {} # Lock per engine type for inference
|
33
37
|
logger.info("OCRManager initialized.")
|
34
38
|
|
35
39
|
def _get_engine_instance(self, engine_name: str) -> OCREngine:
|
36
|
-
"""Retrieves or creates an instance of the specified OCR engine."""
|
40
|
+
"""Retrieves or creates an instance of the specified OCR engine, ensuring thread-safe initialization."""
|
37
41
|
engine_name = engine_name.lower()
|
38
42
|
if engine_name not in self.ENGINE_REGISTRY:
|
39
43
|
raise ValueError(
|
40
44
|
f"Unknown OCR engine: '{engine_name}'. Available: {list(self.ENGINE_REGISTRY.keys())}"
|
41
45
|
)
|
42
46
|
|
43
|
-
if
|
44
|
-
|
45
|
-
|
46
|
-
engine_instance = engine_class() # Instantiate first
|
47
|
-
if not engine_instance.is_available():
|
48
|
-
# Check availability before storing
|
49
|
-
# Construct helpful error message with install hint
|
50
|
-
install_hint = f"pip install 'natural-pdf[{engine_name}]'"
|
51
|
-
raise RuntimeError(
|
52
|
-
f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
|
53
|
-
)
|
54
|
-
self._engine_instances[engine_name] = engine_instance # Store if available
|
47
|
+
# Quick check if instance already exists (avoid lock contention)
|
48
|
+
if engine_name in self._engine_instances:
|
49
|
+
return self._engine_instances[engine_name]
|
55
50
|
|
56
|
-
|
51
|
+
# Get or create the lock for this engine type
|
52
|
+
if engine_name not in self._engine_locks:
|
53
|
+
self._engine_locks[engine_name] = threading.Lock()
|
54
|
+
|
55
|
+
engine_init_lock = self._engine_locks[engine_name]
|
56
|
+
|
57
|
+
# Acquire lock to safely check and potentially initialize the engine
|
58
|
+
with engine_init_lock:
|
59
|
+
# Double-check if another thread initialized it while we waited for the lock
|
60
|
+
if engine_name in self._engine_instances:
|
61
|
+
return self._engine_instances[engine_name]
|
62
|
+
|
63
|
+
# If still not initialized, create it now under the lock
|
64
|
+
logger.info(f"[{threading.current_thread().name}] Creating shared instance of engine: {engine_name}")
|
65
|
+
engine_class = self.ENGINE_REGISTRY[engine_name]["class"]
|
66
|
+
start_time = time.monotonic() # Optional: time initialization
|
67
|
+
try:
|
68
|
+
engine_instance = engine_class() # Instantiate first
|
69
|
+
if not engine_instance.is_available():
|
70
|
+
# Check availability before storing
|
71
|
+
install_hint = f"pip install 'natural-pdf[{engine_name}]'"
|
72
|
+
raise RuntimeError(
|
73
|
+
f"Engine '{engine_name}' is not available. Please install the required dependencies: {install_hint}"
|
74
|
+
)
|
75
|
+
# Store the shared instance
|
76
|
+
self._engine_instances[engine_name] = engine_instance
|
77
|
+
end_time = time.monotonic()
|
78
|
+
logger.info(f"[{threading.current_thread().name}] Shared instance of {engine_name} created successfully (Duration: {end_time - start_time:.2f}s).")
|
79
|
+
return engine_instance
|
80
|
+
except Exception as e:
|
81
|
+
# Ensure we don't leave a partial state if init fails
|
82
|
+
logger.error(f"[{threading.current_thread().name}] Failed to create shared instance of {engine_name}: {e}", exc_info=True)
|
83
|
+
# Remove potentially partial entry if exists
|
84
|
+
if engine_name in self._engine_instances: del self._engine_instances[engine_name]
|
85
|
+
raise # Re-raise the exception after logging
|
86
|
+
|
87
|
+
def _get_engine_inference_lock(self, engine_name: str) -> threading.Lock:
|
88
|
+
"""Gets or creates the inference lock for a given engine type."""
|
89
|
+
engine_name = engine_name.lower()
|
90
|
+
# Assume engine_name is valid as it's checked before this would be called
|
91
|
+
if engine_name not in self._engine_inference_locks:
|
92
|
+
# Create lock if it doesn't exist (basic thread safety for dict access)
|
93
|
+
# A more robust approach might lock around this check/creation too,
|
94
|
+
# but contention here is less critical than for engine init or inference itself.
|
95
|
+
self._engine_inference_locks[engine_name] = threading.Lock()
|
96
|
+
return self._engine_inference_locks[engine_name]
|
57
97
|
|
58
98
|
def apply_ocr(
|
59
99
|
self,
|
@@ -65,7 +105,7 @@ class OCRManager:
|
|
65
105
|
device: Optional[str] = None,
|
66
106
|
detect_only: bool = False,
|
67
107
|
# --- Engine-Specific Options ---
|
68
|
-
options: Optional[Any] = None,
|
108
|
+
options: Optional[Any] = None, # e.g. EasyOCROptions(), PaddleOCROptions()
|
69
109
|
) -> Union[List[Dict[str, Any]], List[List[Dict[str, Any]]]]:
|
70
110
|
"""
|
71
111
|
Applies OCR to a single image or a batch of images.
|
@@ -100,7 +140,7 @@ class OCRManager:
|
|
100
140
|
if not is_batch and not isinstance(images, Image.Image):
|
101
141
|
raise TypeError("Input 'images' must be a PIL Image or a list of PIL Images.")
|
102
142
|
|
103
|
-
# --- Determine Engine ---
|
143
|
+
# --- Determine Engine ---
|
104
144
|
selected_engine_name = (engine or "easyocr").lower()
|
105
145
|
if selected_engine_name not in self.ENGINE_REGISTRY:
|
106
146
|
raise ValueError(
|
@@ -108,37 +148,61 @@ class OCRManager:
|
|
108
148
|
)
|
109
149
|
logger.debug(f"Selected engine: '{selected_engine_name}'")
|
110
150
|
|
111
|
-
# --- Prepare Options ---
|
151
|
+
# --- Prepare Options ---
|
112
152
|
final_options = copy.deepcopy(options) if options is not None else None
|
113
|
-
|
153
|
+
|
114
154
|
# Type check options object if provided
|
115
155
|
if final_options is not None:
|
116
|
-
options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
|
156
|
+
options_class = self.ENGINE_REGISTRY[selected_engine_name].get(
|
157
|
+
"options_class", BaseOCROptions
|
158
|
+
)
|
117
159
|
if not isinstance(final_options, options_class):
|
118
|
-
|
160
|
+
# Allow dicts to be passed directly too, assuming engine handles them
|
119
161
|
if not isinstance(final_options, dict):
|
120
|
-
|
162
|
+
raise TypeError(
|
121
163
|
f"Provided options type '{type(final_options).__name__}' is not compatible with engine '{selected_engine_name}'. Expected '{options_class.__name__}' or dict."
|
122
164
|
)
|
123
165
|
|
124
|
-
# --- Get Engine Instance and Process ---
|
166
|
+
# --- Get Engine Instance and Process ---
|
125
167
|
try:
|
126
168
|
engine_instance = self._get_engine_instance(selected_engine_name)
|
127
169
|
processing_mode = "batch" if is_batch else "single image"
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
results = engine_instance.process_image(
|
134
|
-
images=images,
|
135
|
-
languages=languages,
|
136
|
-
min_confidence=min_confidence,
|
137
|
-
device=device,
|
138
|
-
detect_only=detect_only,
|
139
|
-
options=final_options
|
170
|
+
# Log thread name for clarity during parallel calls
|
171
|
+
thread_id = threading.current_thread().name
|
172
|
+
logger.info(f"[{thread_id}] Processing {processing_mode} using shared engine instance '{selected_engine_name}'...")
|
173
|
+
logger.debug(
|
174
|
+
f" Engine Args: languages={languages}, min_confidence={min_confidence}, device={device}, options={final_options}"
|
140
175
|
)
|
141
176
|
|
177
|
+
# Log image dimensions before processing
|
178
|
+
if is_batch:
|
179
|
+
image_dims = [f"{img.width}x{img.height}" for img in images if hasattr(img, 'width') and hasattr(img, 'height')]
|
180
|
+
logger.debug(f"[{thread_id}] Processing batch of {len(images)} images with dimensions: {image_dims}")
|
181
|
+
elif hasattr(images, 'width') and hasattr(images, 'height'):
|
182
|
+
logger.debug(f"[{thread_id}] Processing single image with dimensions: {images.width}x{images.height}")
|
183
|
+
else:
|
184
|
+
logger.warning(f"[{thread_id}] Could not determine dimensions of input image(s).")
|
185
|
+
|
186
|
+
# Acquire lock specifically for the inference call
|
187
|
+
inference_lock = self._get_engine_inference_lock(selected_engine_name)
|
188
|
+
logger.debug(f"[{thread_id}] Attempting to acquire inference lock for {selected_engine_name}...")
|
189
|
+
inference_wait_start = time.monotonic()
|
190
|
+
with inference_lock:
|
191
|
+
inference_acquired_time = time.monotonic()
|
192
|
+
logger.debug(f"[{thread_id}] Acquired inference lock for {selected_engine_name} (waited {inference_acquired_time - inference_wait_start:.2f}s). Calling process_image...")
|
193
|
+
inference_start_time = time.monotonic()
|
194
|
+
|
195
|
+
results = engine_instance.process_image(
|
196
|
+
images=images,
|
197
|
+
languages=languages,
|
198
|
+
min_confidence=min_confidence,
|
199
|
+
device=device,
|
200
|
+
detect_only=detect_only,
|
201
|
+
options=final_options,
|
202
|
+
)
|
203
|
+
inference_end_time = time.monotonic()
|
204
|
+
logger.debug(f"[{thread_id}] process_image call finished for {selected_engine_name} (Duration: {inference_end_time - inference_start_time:.2f}s). Releasing lock.")
|
205
|
+
|
142
206
|
# Log result summary based on mode
|
143
207
|
if is_batch:
|
144
208
|
# Ensure results is a list before trying to get lengths
|
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -13,6 +13,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
13
13
|
@dataclass
|
14
14
|
class BaseOCROptions:
|
15
15
|
"""Base class for OCR engine options."""
|
16
|
+
|
16
17
|
extra_args: Dict[str, Any] = field(default_factory=dict)
|
17
18
|
|
18
19
|
|
@@ -20,6 +21,7 @@ class BaseOCROptions:
|
|
20
21
|
@dataclass
|
21
22
|
class EasyOCROptions(BaseOCROptions):
|
22
23
|
"""Specific options for the EasyOCR engine."""
|
24
|
+
|
23
25
|
model_storage_directory: Optional[str] = None
|
24
26
|
user_network_directory: Optional[str] = None
|
25
27
|
recog_network: str = "english_g2"
|
@@ -64,9 +66,10 @@ class EasyOCROptions(BaseOCROptions):
|
|
64
66
|
@dataclass
|
65
67
|
class PaddleOCROptions(BaseOCROptions):
|
66
68
|
"""Specific options for the PaddleOCR engine."""
|
67
|
-
|
69
|
+
|
70
|
+
# General
|
68
71
|
use_gpu: Optional[bool] = None
|
69
|
-
gpu_mem: int =
|
72
|
+
gpu_mem: int = 8000 # Default from Paddle documentation
|
70
73
|
ir_optim: bool = True
|
71
74
|
use_tensorrt: bool = False
|
72
75
|
min_subgraph_size: int = 15
|
@@ -74,22 +77,46 @@ class PaddleOCROptions(BaseOCROptions):
|
|
74
77
|
enable_mkldnn: bool = False
|
75
78
|
cpu_threads: int = 10
|
76
79
|
use_fp16: bool = False
|
80
|
+
show_log: bool = False
|
81
|
+
use_onnx: bool = False
|
82
|
+
use_zero_copy_run: bool = False
|
83
|
+
|
84
|
+
# Detection
|
85
|
+
det: bool = True
|
86
|
+
det_algorithm: str = "DB"
|
77
87
|
det_model_dir: Optional[str] = None
|
88
|
+
det_limit_side_len: int = 960 # Corresponds to det_max_side_len
|
89
|
+
# DB specific
|
90
|
+
det_db_thresh: float = 0.3
|
91
|
+
det_db_box_thresh: float = 0.5
|
92
|
+
det_db_unclip_ratio: float = 2.0
|
93
|
+
# EAST specific
|
94
|
+
det_east_score_thresh: float = 0.8
|
95
|
+
det_east_cover_thresh: float = 0.1
|
96
|
+
det_east_nms_thresh: float = 0.2
|
97
|
+
|
98
|
+
# Recognition
|
99
|
+
rec: bool = True
|
100
|
+
rec_algorithm: str = "CRNN"
|
78
101
|
rec_model_dir: Optional[str] = None
|
79
|
-
|
80
|
-
|
81
|
-
rec_batch_num: int = 6
|
102
|
+
rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
|
103
|
+
rec_batch_num: int = 30 # Default from Paddle documentation
|
82
104
|
max_text_length: int = 25
|
105
|
+
rec_char_dict_path: Optional[str] = None # Path to char dictionary file
|
83
106
|
use_space_char: bool = True
|
84
107
|
drop_score: float = 0.5
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
108
|
+
|
109
|
+
# Classification
|
110
|
+
cls: Optional[bool] = None # Often inferred from use_angle_cls
|
111
|
+
use_angle_cls: bool = False # Default from Paddle documentation
|
112
|
+
cls_model_dir: Optional[str] = None
|
113
|
+
cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
|
114
|
+
label_list: List[str] = field(default_factory=lambda: ['0', '180']) # Default from Paddle doc
|
115
|
+
cls_batch_num: int = 30
|
90
116
|
|
91
117
|
def __post_init__(self):
|
92
118
|
pass
|
119
|
+
|
93
120
|
# if self.use_gpu is None:
|
94
121
|
# if self.device and "cuda" in self.device.lower():
|
95
122
|
# self.use_gpu = True
|
@@ -102,6 +129,7 @@ class PaddleOCROptions(BaseOCROptions):
|
|
102
129
|
@dataclass
|
103
130
|
class SuryaOCROptions(BaseOCROptions):
|
104
131
|
"""Specific options for the Surya OCR engine."""
|
132
|
+
|
105
133
|
# Currently, Surya example shows languages passed at prediction time.
|
106
134
|
pass
|
107
135
|
|
natural_pdf/ocr/utils.py
CHANGED
@@ -8,53 +8,76 @@ from tqdm.auto import tqdm
|
|
8
8
|
if TYPE_CHECKING:
|
9
9
|
from natural_pdf.elements.base import Element
|
10
10
|
|
11
|
+
# Import the global PDF render lock from dedicated locks module
|
12
|
+
from natural_pdf.utils.locks import pdf_render_lock
|
13
|
+
|
11
14
|
logger = logging.getLogger(__name__)
|
12
15
|
|
16
|
+
|
13
17
|
def _apply_ocr_correction_to_elements(
|
14
18
|
elements: Iterable["Element"],
|
15
19
|
correction_callback: Callable[[Any], Optional[str]],
|
20
|
+
caller_info: str = "Utility",
|
16
21
|
) -> None:
|
17
22
|
"""
|
18
|
-
Applies correction callback to a list of elements in place,
|
23
|
+
Applies OCR correction callback to a list of elements in place,
|
19
24
|
showing a progress bar.
|
20
25
|
|
21
|
-
Iterates through elements,
|
22
|
-
element.text if a new string is returned.
|
23
|
-
|
26
|
+
Iterates through elements, checks if source starts with 'ocr', calls
|
27
|
+
the callback, and updates element.text if a new string is returned.
|
28
|
+
|
24
29
|
Args:
|
25
30
|
elements: An iterable of Element objects.
|
26
31
|
correction_callback: A function accepting an element and returning
|
27
32
|
Optional[str] (new text or None).
|
33
|
+
caller_info: String identifying the calling context for logs.
|
28
34
|
"""
|
35
|
+
if not callable(correction_callback):
|
36
|
+
# Raise error here so individual methods don't need to repeat the check
|
37
|
+
raise TypeError("`correction_callback` must be a callable function.")
|
38
|
+
|
39
|
+
if not elements:
|
40
|
+
logger.warning(f"{caller_info}: No elements provided for correction.")
|
41
|
+
return
|
42
|
+
|
29
43
|
corrections_applied = 0
|
30
44
|
elements_checked = 0
|
31
45
|
|
32
46
|
# Prepare the iterable with tqdm
|
33
|
-
element_iterable = tqdm(elements, desc=f"Correcting OCR", unit="element")
|
47
|
+
element_iterable = tqdm(elements, desc=f"Correcting OCR ({caller_info})", unit="element")
|
34
48
|
|
35
49
|
for element in element_iterable:
|
36
50
|
# Check if the element is likely from OCR and has text attribute
|
37
|
-
element_source = getattr(element,
|
38
|
-
if
|
51
|
+
element_source = getattr(element, "source", None)
|
52
|
+
if (
|
53
|
+
isinstance(element_source, str)
|
54
|
+
and element_source.startswith("ocr")
|
55
|
+
and hasattr(element, "text")
|
56
|
+
):
|
39
57
|
elements_checked += 1
|
40
|
-
current_text = getattr(element,
|
58
|
+
current_text = getattr(element, "text") # Already checked hasattr
|
41
59
|
|
42
60
|
new_text = correction_callback(element)
|
43
61
|
|
44
62
|
if new_text is not None:
|
45
63
|
if new_text != current_text:
|
46
|
-
element.text = new_text
|
64
|
+
element.text = new_text # Update in place
|
47
65
|
corrections_applied += 1
|
48
66
|
|
49
|
-
logger.info(
|
67
|
+
logger.info(
|
68
|
+
f"{caller_info}: OCR correction finished. Checked: {elements_checked}, Applied: {corrections_applied}"
|
69
|
+
)
|
70
|
+
# No return value needed, modifies elements in place
|
50
71
|
|
51
72
|
|
52
|
-
def direct_ocr_llm(
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
73
|
+
def direct_ocr_llm(
|
74
|
+
element,
|
75
|
+
client,
|
76
|
+
model="",
|
77
|
+
resolution=150,
|
78
|
+
prompt="OCR this image. Return only the exact text from the image. Include misspellings, punctuation, etc. If you cannot see any text, return an empty string.",
|
79
|
+
padding=2,
|
80
|
+
) -> str:
|
58
81
|
"""Convenience method to directly OCR a region of the page."""
|
59
82
|
|
60
83
|
if isinstance(element, TextElement):
|
@@ -63,36 +86,39 @@ def direct_ocr_llm(element,
|
|
63
86
|
region = element
|
64
87
|
|
65
88
|
buffered = io.BytesIO()
|
66
|
-
|
89
|
+
# Use the global PDF render lock when rendering images
|
90
|
+
with pdf_render_lock:
|
91
|
+
region_img = region.to_image(resolution=resolution, include_highlights=False)
|
92
|
+
|
93
|
+
# Handle cases where image creation might fail (e.g., zero-dim region)
|
94
|
+
if region_img is None:
|
95
|
+
logger.warning(f"Could not generate image for region {region.bbox}, skipping OCR.")
|
96
|
+
return "" # Return empty string if image creation failed
|
97
|
+
|
67
98
|
region_img.save(buffered, format="PNG")
|
68
|
-
base64_image = base64.b64encode(buffered.getvalue()).decode(
|
99
|
+
base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
|
69
100
|
|
70
101
|
response = client.chat.completions.create(
|
71
102
|
model=model,
|
72
103
|
messages=[
|
73
104
|
{
|
74
105
|
"role": "system",
|
75
|
-
"content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image."
|
106
|
+
"content": "You are an expert OCR engineer. You will be given an image of a region of a page. You will return the exact text from the image.",
|
76
107
|
},
|
77
108
|
{
|
78
109
|
"role": "user",
|
79
110
|
"content": [
|
80
|
-
{
|
81
|
-
"type": "text",
|
82
|
-
"text": prompt
|
83
|
-
},
|
111
|
+
{"type": "text", "text": prompt},
|
84
112
|
{
|
85
113
|
"type": "image_url",
|
86
|
-
"image_url": {
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
}
|
92
|
-
]
|
114
|
+
"image_url": {"url": f"data:image/png;base64,{base64_image}"},
|
115
|
+
},
|
116
|
+
],
|
117
|
+
},
|
118
|
+
],
|
93
119
|
)
|
94
|
-
|
95
|
-
corrected = response.choices[0].message.content
|
120
|
+
|
121
|
+
corrected = response.choices[0].message.content.strip()
|
96
122
|
logger.debug(f"Corrected {region.extract_text()} to {corrected}")
|
97
123
|
|
98
|
-
return corrected
|
124
|
+
return corrected
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -58,10 +58,6 @@ class DocumentQA:
|
|
58
58
|
import torch
|
59
59
|
from transformers import pipeline
|
60
60
|
|
61
|
-
# Determine device
|
62
|
-
if device is None:
|
63
|
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
64
|
-
|
65
61
|
logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
|
66
62
|
|
67
63
|
# Initialize the pipeline
|