natural-pdf 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/classification/manager.py +2 -3
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/highlighting_service.py +29 -38
- natural_pdf/core/page.py +283 -186
- natural_pdf/core/pdf.py +4 -4
- natural_pdf/elements/base.py +34 -0
- natural_pdf/elements/collections.py +160 -9
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +353 -12
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/selectors/parser.py +163 -8
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +22 -17
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.12.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/core/page.py
CHANGED
@@ -74,6 +74,11 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
 # --- End Classification Imports --- #


+# --- Shape Detection Mixin --- #
+from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
+# --- End Shape Detection Mixin --- #
+
+
 try:
     from deskew import determine_skew

@@ -86,7 +91,7 @@ except ImportError:
 logger = logging.getLogger(__name__)


-class Page(ClassificationMixin, ExtractionMixin):
+class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
     """
     Enhanced Page wrapper built on top of pdfplumber.Page.

@@ -161,6 +166,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         self._layout_analyzer = None

         self._load_elements()
+        self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}

     @property
     def pdf(self) -> "PDF":
@@ -649,7 +655,7 @@ class Page(ClassificationMixin, ExtractionMixin):
         Exclusions are now handled by the calling methods (find, find_all) if requested.

         Args:
-            selector_obj: Parsed selector dictionary
+            selector_obj: Parsed selector dictionary (single or compound OR selector)
             **kwargs: Additional filter parameters including 'regex' and 'case'

         Returns:
@@ -657,6 +663,30 @@
        """
        from natural_pdf.selectors.parser import selector_to_filter_func

+        # Handle compound OR selectors
+        if selector_obj.get("type") == "or":
+            # For OR selectors, search all elements and let the filter function decide
+            elements_to_search = self._element_mgr.get_all_elements()
+
+            # Create filter function from compound selector
+            filter_func = selector_to_filter_func(selector_obj, **kwargs)
+
+            # Apply the filter to all elements
+            matching_elements = [element for element in elements_to_search if filter_func(element)]
+
+            # Sort elements in reading order if requested
+            if kwargs.get("reading_order", True):
+                if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
+                    matching_elements.sort(key=lambda el: (el.top, el.x0))
+                else:
+                    logger.warning(
+                        "Cannot sort elements in reading order: Missing required attributes (top, x0)."
+                    )
+
+            # Return result collection
+            return ElementCollection(matching_elements)
+
+        # Handle single selectors (existing logic)
        # Get element type to filter
        element_type = selector_obj.get("type", "any").lower()

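The new branch above fires when the parsed selector dictionary has type "or": it builds one filter function for the whole compound selector, applies it to every element on the page, and sorts matches into reading order by (top, x0). The sketch below is not part of the package; the element class and the "selectors" key are illustrative assumptions standing in for the real parser output (natural_pdf/selectors/parser.py, also updated in this release).

from dataclasses import dataclass
from typing import Callable, Dict

@dataclass
class FakeElement:  # stand-in for a natural-pdf element
    type: str
    top: float
    x0: float

def make_or_filter(selector_obj: Dict) -> Callable[[FakeElement], bool]:
    # An element matches if it satisfies any sub-selector (here: a bare type check).
    wanted = {sub["type"] for sub in selector_obj["selectors"]}
    return lambda el: el.type in wanted

selector_obj = {"type": "or", "selectors": [{"type": "text"}, {"type": "line"}]}
elements = [FakeElement("rect", 10, 5), FakeElement("text", 10, 0), FakeElement("line", 2, 0)]

filter_func = make_or_filter(selector_obj)
matching = [el for el in elements if filter_func(el)]
matching.sort(key=lambda el: (el.top, el.x0))  # reading order, as in the new code
print([el.type for el in matching])            # ['line', 'text']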
@@ -1411,114 +1441,171 @@
         Returns:
             PIL Image of the page, or None if rendering fails.
         """
-        [old lines 1414-1420 not shown in the diff view]
+        # 1. Create cache key (excluding path)
+        cache_key_parts = [
+            scale,
+            width,
+            labels,
+            legend_position,
+            render_ocr,
+            resolution,
+            include_highlights,
+            exclusions,
+        ]
+        # Convert kwargs to a stable, hashable representation
+        sorted_kwargs_list = []
+        for k, v in sorted(kwargs.items()):
+            if isinstance(v, list):
+                try:
+                    v = tuple(v)  # Convert lists to tuples
+                except TypeError:  # pragma: no cover
+                    # If list contains unhashable items, fall back to repr or skip
+                    # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
+                    logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
+            sorted_kwargs_list.append((k, v))
+
+        cache_key_parts.append(tuple(sorted_kwargs_list))
+
         try:
-            [old lines 1422-1425 not shown in the diff view]
-                f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
-            )
-            if include_highlights:
-                # Delegate rendering to the central service
-                image = self._highlighter.render_page(
-                    page_index=self.index,
-                    scale=scale,
-                    labels=labels,
-                    legend_position=legend_position,
-                    render_ocr=render_ocr,
-                    resolution=render_resolution,  # Pass the calculated resolution
-                    **kwargs,
-                )
-            else:
-                image = render_plain_page(self, render_resolution)
-        except Exception as e:
-            logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
-            return None  # Return None on error
-        finally:
-            render_end_time = time.monotonic()
-            logger.debug(
-                f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
-            )
+            cache_key = tuple(cache_key_parts)
+        except TypeError as e:  # pragma: no cover
+            logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
+            cache_key = None  # Fallback to not using cache for this call

-        [old line 1450 not shown in the diff view]
-        return None
+        image_to_return: Optional[Image.Image] = None

-        #
-        if
+        # 2. Check cache
+        if cache_key is not None and cache_key in self._to_image_cache:
+            image_to_return = self._to_image_cache[cache_key]
+            logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
+        else:
+            # --- This is the original logic to generate the image ---
+            rendered_image_component: Optional[Image.Image] = None  # Renamed from 'image' in original
+            render_resolution = resolution if resolution is not None else scale * 72
+            thread_id = threading.current_thread().name
+            logger.debug(
+                f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
+            )
+            lock_wait_start = time.monotonic()
             try:
-                #
-                [old lines 1457-1471 not shown in the diff view]
-                img_top = region.top * img_scale
-                img_x1 = region.x1 * img_scale
-                img_bottom = region.bottom * img_scale
-
-                # Draw a white rectangle over the excluded area
-                # Ensure coordinates are within image bounds (though region should be)
-                img_coords = (
-                    max(0, img_x0),
-                    max(0, img_top),
-                    min(image.width, img_x1),
-                    min(image.height, img_bottom),
+                # Acquire the global PDF rendering lock
+                with pdf_render_lock:
+                    lock_acquired_time = time.monotonic()
+                    logger.debug(
+                        f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
+                    )
+                    if include_highlights:
+                        # Delegate rendering to the central service
+                        rendered_image_component = self._highlighter.render_page(
+                            page_index=self.index,
+                            scale=scale,
+                            labels=labels,
+                            legend_position=legend_position,
+                            render_ocr=render_ocr,
+                            resolution=render_resolution,  # Pass the calculated resolution
+                            **kwargs,
                         )
-                [old lines 1484-1492 not shown in the diff view]
-                logger.error(
-                    f"Error applying exclusion mask to page {self.index}: {mask_error}",
-                    exc_info=True,
+                    else:
+                        rendered_image_component = render_plain_page(self, render_resolution)
+            except Exception as e:
+                logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
+                # rendered_image_component remains None
+            finally:
+                render_end_time = time.monotonic()
+                logger.debug(
+                    f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
                 )
-            # Decide if you want to return None or continue without mask
-            # For now, continue without mask

-            [old lines 1500-1512 not shown in the diff view]
+            if rendered_image_component is None:
+                if cache_key is not None:
+                    self._to_image_cache[cache_key] = None  # Cache the failure
+                # Save the image if path is provided (will try to save None, handled by PIL/OS)
+                if path:
+                    try:
+                        if os.path.dirname(path):
+                            os.makedirs(os.path.dirname(path), exist_ok=True)
+                        if rendered_image_component is not None:  # Should be None here
+                            rendered_image_component.save(path)  # This line won't be hit if None
+                        # else: logger.debug("Not saving None image") # Not strictly needed
+                    except Exception as save_error:  # pragma: no cover
+                        logger.error(f"Failed to save image to {path}: {save_error}")
+                return None
+
+            # --- Apply exclusion masking if requested ---
+            # This modifies 'rendered_image_component'
+            image_after_masking = rendered_image_component  # Start with the rendered image
+            if exclusions == "mask" and self._exclusions:
+                try:
+                    # Ensure image is mutable (RGB or RGBA)
+                    if image_after_masking.mode not in ("RGB", "RGBA"):
+                        image_after_masking = image_after_masking.convert("RGB")
+
+                    exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
+                    if exclusion_regions:
+                        draw = ImageDraw.Draw(image_after_masking)
+                        # Calculate the scaling factor used for the image
+                        img_scale = render_resolution / 72.0
+
+                        for region in exclusion_regions:
+                            # Convert PDF points (x0, top, x1, bottom) to image pixels
+                            img_x0 = region.x0 * img_scale
+                            img_top = region.top * img_scale
+                            img_x1 = region.x1 * img_scale
+                            img_bottom = region.bottom * img_scale
+
+                            # Draw a white rectangle over the excluded area
+                            img_coords = (
+                                max(0, img_x0),
+                                max(0, img_top),
+                                min(image_after_masking.width, img_x1),
+                                min(image_after_masking.height, img_bottom),
+                            )
+                            if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
+                                draw.rectangle(img_coords, fill="white")
+                            else:  # pragma: no cover
+                                logger.warning(
+                                    f"Skipping invalid exclusion rect for masking: {img_coords}"
+                                )
+                        del draw  # Release drawing context
+                except Exception as mask_error:  # pragma: no cover
+                    logger.error(
+                        f"Error applying exclusion mask to page {self.index}: {mask_error}",
+                        exc_info=True,
+                    )
+                    # Continue with potentially unmasked or partially masked image
+
+            # --- Resize the final image if width is provided ---
+            image_final_content = image_after_masking  # Start with image after masking
+            if width is not None and width > 0 and image_final_content.width > 0:
+                aspect_ratio = image_final_content.height / image_final_content.width
+                height = int(width * aspect_ratio)
+                try:
+                    image_final_content = image_final_content.resize(
+                        (width, height), Image.Resampling.LANCZOS
+                    )
+                except Exception as resize_error:  # pragma: no cover
+                    logger.warning(f"Could not resize image: {resize_error}")
+                    # image_final_content remains the un-resized version if resize fails
+
+            # Store in cache
+            if cache_key is not None:
+                self._to_image_cache[cache_key] = image_final_content
+                logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
+            image_to_return = image_final_content
+            # --- End of cache miss block ---
+
+        # Save the image (either from cache or newly generated) if path is provided
+        if path and image_to_return:
             try:
                 # Ensure directory exists
-                os.
-                [old line 1516 not shown in the diff view]
+                if os.path.dirname(path):  # Only call makedirs if there's a directory part
+                    os.makedirs(os.path.dirname(path), exist_ok=True)
+                image_to_return.save(path)
                 logger.debug(f"Saved page image to: {path}")
-            except Exception as save_error:
+            except Exception as save_error:  # pragma: no cover
                 logger.error(f"Failed to save image to {path}: {save_error}")

-        return
+        return image_to_return

     def _create_text_elements_from_ocr(
         self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
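The net effect of the rewrite above: the render arguments (minus path) become a hashable cache key, and a repeat call with the same arguments returns the cached PIL image instead of re-rendering under the lock. A hedged usage sketch, not part of the package diff; "sample.pdf" and "page0.png" are placeholder paths.

from natural_pdf import PDF

pdf = PDF("sample.pdf")
page = pdf.pages[0]

img_a = page.to_image(resolution=144, include_highlights=False)
img_b = page.to_image(resolution=144, include_highlights=False)
assert img_a is img_b  # second call should be served from page._to_image_cache

# "path" is excluded from the cache key, so saving reuses the cached render
img_c = page.to_image(resolution=144, include_highlights=False, path="page0.png")
assert img_c is img_a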
@@ -1986,7 +2073,6 @@ class Page(ClassificationMixin, ExtractionMixin):
             region.is_end_next_start = False
             regions.append(region)

-        # Return the list wrapped in an ElementCollection
         return ElementCollection(regions)

     def __repr__(self) -> str:
@@ -2213,6 +2299,7 @@ class Page(ClassificationMixin, ExtractionMixin):
     def correct_ocr(
         self,
         correction_callback: Callable[[Any], Optional[str]],
+        selector: Optional[str] = "text[source=ocr]",
         max_workers: Optional[int] = None,
         progress_callback: Optional[Callable[[], None]] = None,  # Added progress callback
     ) -> "Page":  # Return self for chaining
@@ -2240,7 +2327,7 @@
        )

        target_elements_collection = self.find_all(
-            selector=
+            selector=selector, apply_exclusions=False
        )
        target_elements = target_elements_collection.elements  # Get the list

@@ -2248,102 +2335,112 @@
            logger.info(f"Page {self.number}: No OCR elements found to correct.")
            return self

-        [old lines 2251-2253 not shown in the diff view]
+        element_pbar = None
+        try:
+            element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)

-        [old lines 2255-2257 not shown in the diff view]
-            current_text = getattr(element, "text", None)
-            # Call the user-provided callback
-            corrected_text = correction_callback(element)
+            processed_count = 0
+            updated_count = 0
+            error_count = 0

-        [old lines 2262-2267 not shown in the diff view]
+            # Define the task to be run by the worker thread or sequentially
+            def _process_element_task(element):
+                try:
+                    current_text = getattr(element, "text", None)
+                    # Call the user-provided callback
+                    corrected_text = correction_callback(element)

-        [old lines 2269-2272 not shown in the diff view]
-                exc_info=False,  # Keep log concise
-            )
-            return element, None, e  # Return element, no result, error
-        finally:
-            # --- Call progress callback here --- #
-            if progress_callback:
-                try:
-                    progress_callback()
-                except Exception as cb_e:
-                    # Log error in callback itself, but don't stop processing
-                    logger.error(
-                        f"Page {self.number}: Error executing progress_callback: {cb_e}",
-                        exc_info=False,
+                    # Validate result type
+                    if corrected_text is not None and not isinstance(corrected_text, str):
+                        logger.warning(
+                            f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
                        )
+                        return element, None, None  # Treat as no correction

-        [old lines 2288-2300 not shown in the diff view]
+                    return element, corrected_text, None  # Return element, result, no error
+                except Exception as e:
+                    logger.error(
+                        f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
+                        exc_info=False,  # Keep log concise
+                    )
+                    return element, None, e  # Return element, no result, error
+                finally:
+                    # --- Update internal tqdm progress bar ---
+                    if element_pbar:
+                        element_pbar.update(1)
+                    # --- Call user's progress callback --- #
+                    if progress_callback:
+                        try:
+                            progress_callback()
+                        except Exception as cb_e:
+                            # Log error in callback itself, but don't stop processing
+                            logger.error(
+                                f"Page {self.number}: Error executing progress_callback: {cb_e}",
+                                exc_info=False,
+                            )

-        [old lines 2302-2307 not shown in the diff view]
+            # Choose execution strategy based on max_workers
+            if max_workers is not None and max_workers > 1:
+                # --- Parallel execution --- #
+                logger.info(
+                    f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
+                )
+                futures = []
+                with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                    # Submit all tasks
+                    future_to_element = {
+                        executor.submit(_process_element_task, element): element
+                        for element in target_elements
+                    }
+
+                    # Process results as they complete (progress_callback called by worker)
+                    for future in concurrent.futures.as_completed(future_to_element):
+                        processed_count += 1
+                        try:
+                            element, corrected_text, error = future.result()
+                            if error:
+                                error_count += 1
+                                # Error already logged in worker
+                            elif corrected_text is not None:
+                                # Apply correction if text changed
+                                current_text = getattr(element, "text", None)
+                                if corrected_text != current_text:
+                                    element.text = corrected_text
+                                    updated_count += 1
+                        except Exception as exc:
+                            # Catch errors from future.result() itself
+                            element = future_to_element[future]  # Find original element
+                            logger.error(
+                                f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
+                                exc_info=True,
+                            )
                            error_count += 1
-            #
-            elif corrected_text is not None:
-                # Apply correction if text changed
-                current_text = getattr(element, "text", None)
-                if corrected_text != current_text:
-                    element.text = corrected_text
-                    updated_count += 1
-        except Exception as exc:
-            # Catch errors from future.result() itself
-            element = future_to_element[future]  # Find original element
-            logger.error(
-                f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
-                exc_info=True,
-            )
-            error_count += 1
-            # Note: progress_callback was already called in the worker's finally block
+                            # Note: progress_callback was already called in the worker's finally block

-        [old lines 2326-2340 not shown in the diff view]
+            else:
+                # --- Sequential execution --- #
+                logger.info(f"Page {self.number}: Running OCR correction sequentially.")
+                for element in target_elements:
+                    # Call the task function directly (it handles progress_callback)
+                    processed_count += 1
+                    _element, corrected_text, error = _process_element_task(element)
+                    if error:
+                        error_count += 1
+                    elif corrected_text is not None:
+                        # Apply correction if text changed
+                        current_text = getattr(_element, "text", None)
+                        if corrected_text != current_text:
+                            _element.text = corrected_text
+                            updated_count += 1

-        [old lines 2342-2344 not shown in the diff view]
+            logger.info(
+                f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
+            )

-        [old line 2346 not shown in the diff view]
+            return self  # Return self for chaining
+        finally:
+            if element_pbar:
+                element_pbar.close()

     # --- Classification Mixin Implementation --- #
     def _get_classification_manager(self) -> "ClassificationManager":
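Taken together, correct_ocr() now accepts a selector to scope which elements are corrected (defaulting to "text[source=ocr]") and runs the correction callback in a thread pool when max_workers is greater than 1. A hedged usage sketch, not part of the package diff; the callback is purely illustrative, "scanned.pdf" is a placeholder, and the page is assumed to already hold OCR-derived text elements.

from natural_pdf import PDF

page = PDF("scanned.pdf").pages[0]

def fix_pipes(element):
    # Return the corrected string, or None to leave the element unchanged.
    text = element.text or ""
    fixed = text.replace("|", "I")
    return fixed if fixed != text else None

page.correct_ocr(
    fix_pipes,
    selector="text[source=ocr]",  # the new default; any selector string works here
    max_workers=4,                # >1 switches to the ThreadPoolExecutor branch
)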
natural_pdf/core/pdf.py
CHANGED
@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
 from natural_pdf.ocr import OCRManager, OCROptions
 from natural_pdf.selectors.parser import parse_selector
 from natural_pdf.utils.locks import pdf_render_lock
-from
+from tqdm.auto import tqdm

 try:
     from typing import Any as TypingAny
@@ -71,7 +71,6 @@ except ImportError:
     create_original_pdf = None

 logger = logging.getLogger("natural_pdf.core.pdf")
-tqdm = get_tqdm()

 DEFAULT_MANAGERS = {
     "classification": ClassificationManager,
@@ -1253,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         self,
         pages: Optional[Union[Iterable[int], range, slice]] = None,
         resolution: int = 300,
+        angle: Optional[float] = None,
         detection_resolution: int = 72,
         force_overwrite: bool = False,
         **deskew_kwargs,
@@ -1271,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         Args:
             pages: Page indices/slice to include (0-based). If None, processes all pages.
             resolution: DPI resolution for rendering the output deskewed pages.
+            angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
             detection_resolution: DPI resolution used for skew detection if angles are not
                 already cached on the page objects.
             force_overwrite: If False (default), raises a ValueError if any target page
@@ -1315,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
         deskewed_images_bytes = []
         logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")

-        # Use tqdm via get_tqdm
         for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
             try:
                 # Use page.deskew to get the corrected PIL image
                 # Pass down resolutions and kwargs
                 deskewed_img = page.deskew(
                     resolution=resolution,
-                    angle=
+                    angle=angle,  # Let page.deskew handle detection/caching
                     detection_resolution=detection_resolution,
                     **deskew_kwargs,
                 )
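The new angle argument is passed straight through to page.deskew, so a caller can force a known rotation instead of re-running detection (automatic detection still relies on the optional deskew dependency). A hedged sketch, not part of the package diff; "scanned.pdf" and the -1.5 degree value are placeholders.

from natural_pdf import PDF

page = PDF("scanned.pdf").pages[0]
img_auto = page.deskew(resolution=300)               # detect the skew angle automatically
img_fixed = page.deskew(resolution=300, angle=-1.5)  # apply a known -1.5 degree correction
img_fixed.save("page0_deskewed.png")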
natural_pdf/elements/base.py
CHANGED
@@ -15,6 +15,40 @@ if TYPE_CHECKING:
     from natural_pdf.elements.region import Region


+def extract_bbox(obj: Any) -> Optional[Tuple[float, float, float, float]]:
+    """
+    Extract bounding box coordinates from any object that has bbox properties.
+
+    Args:
+        obj: Object that might have bbox coordinates (Element, Region, etc.)
+
+    Returns:
+        Tuple of (x0, top, x1, bottom) or None if object doesn't have bbox properties
+    """
+    # Try bbox property first (most common)
+    if hasattr(obj, 'bbox') and obj.bbox is not None:
+        bbox = obj.bbox
+        if isinstance(bbox, (tuple, list)) and len(bbox) == 4:
+            return tuple(float(coord) for coord in bbox)
+
+    # Try individual coordinate properties
+    if all(hasattr(obj, attr) for attr in ['x0', 'top', 'x1', 'bottom']):
+        try:
+            return (float(obj.x0), float(obj.top), float(obj.x1), float(obj.bottom))
+        except (ValueError, TypeError):
+            pass
+
+    # If object is a dict with bbox keys
+    if isinstance(obj, dict):
+        if all(key in obj for key in ['x0', 'top', 'x1', 'bottom']):
+            try:
+                return (float(obj['x0']), float(obj['top']), float(obj['x1']), float(obj['bottom']))
+            except (ValueError, TypeError):
+                pass
+
+    return None
+
+
 class DirectionalMixin:
     """
     Mixin class providing directional methods for both Element and Region classes.
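Because the new helper accepts an object exposing a bbox tuple, an object exposing x0/top/x1/bottom attributes, or a plain dict with those keys, it normalizes mixed inputs to one float tuple. A short sketch (not part of the package diff):

from types import SimpleNamespace
from natural_pdf.elements.base import extract_bbox

with_bbox = SimpleNamespace(bbox=(10, 20, 110, 40))
with_attrs = SimpleNamespace(x0=10, top=20, x1=110, bottom=40)
as_dict = {"x0": 10, "top": 20, "x1": 110, "bottom": 40}

print(extract_bbox(with_bbox))   # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox(with_attrs))  # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox(as_dict))     # (10.0, 20.0, 110.0, 40.0)
print(extract_bbox("no bbox"))   # None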
|