natural-pdf 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
natural_pdf/core/page.py CHANGED
@@ -74,6 +74,11 @@ from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, SimpleInteractiveV
74
74
  # --- End Classification Imports --- #
75
75
 
76
76
 
77
+ # --- Shape Detection Mixin --- #
78
+ from natural_pdf.analyzers.shape_detection_mixin import ShapeDetectionMixin
79
+ # --- End Shape Detection Mixin --- #
80
+
81
+
77
82
  try:
78
83
  from deskew import determine_skew
79
84
 
@@ -86,7 +91,7 @@ except ImportError:
86
91
  logger = logging.getLogger(__name__)
87
92
 
88
93
 
89
- class Page(ClassificationMixin, ExtractionMixin):
94
+ class Page(ClassificationMixin, ExtractionMixin, ShapeDetectionMixin):
90
95
  """
91
96
  Enhanced Page wrapper built on top of pdfplumber.Page.
92
97
 
@@ -161,6 +166,7 @@ class Page(ClassificationMixin, ExtractionMixin):
161
166
  self._layout_analyzer = None
162
167
 
163
168
  self._load_elements()
169
+ self._to_image_cache: Dict[tuple, Optional["Image.Image"]] = {}
164
170
 
165
171
  @property
166
172
  def pdf(self) -> "PDF":
@@ -649,7 +655,7 @@ class Page(ClassificationMixin, ExtractionMixin):
649
655
  Exclusions are now handled by the calling methods (find, find_all) if requested.
650
656
 
651
657
  Args:
652
- selector_obj: Parsed selector dictionary
658
+ selector_obj: Parsed selector dictionary (single or compound OR selector)
653
659
  **kwargs: Additional filter parameters including 'regex' and 'case'
654
660
 
655
661
  Returns:
@@ -657,6 +663,30 @@ class Page(ClassificationMixin, ExtractionMixin):
657
663
  """
658
664
  from natural_pdf.selectors.parser import selector_to_filter_func
659
665
 
666
+ # Handle compound OR selectors
667
+ if selector_obj.get("type") == "or":
668
+ # For OR selectors, search all elements and let the filter function decide
669
+ elements_to_search = self._element_mgr.get_all_elements()
670
+
671
+ # Create filter function from compound selector
672
+ filter_func = selector_to_filter_func(selector_obj, **kwargs)
673
+
674
+ # Apply the filter to all elements
675
+ matching_elements = [element for element in elements_to_search if filter_func(element)]
676
+
677
+ # Sort elements in reading order if requested
678
+ if kwargs.get("reading_order", True):
679
+ if all(hasattr(el, "top") and hasattr(el, "x0") for el in matching_elements):
680
+ matching_elements.sort(key=lambda el: (el.top, el.x0))
681
+ else:
682
+ logger.warning(
683
+ "Cannot sort elements in reading order: Missing required attributes (top, x0)."
684
+ )
685
+
686
+ # Return result collection
687
+ return ElementCollection(matching_elements)
688
+
689
+ # Handle single selectors (existing logic)
660
690
  # Get element type to filter
661
691
  element_type = selector_obj.get("type", "any").lower()
662
692
 
@@ -1411,114 +1441,171 @@ class Page(ClassificationMixin, ExtractionMixin):
1411
1441
  Returns:
1412
1442
  PIL Image of the page, or None if rendering fails.
1413
1443
  """
1414
- image = None
1415
- render_resolution = resolution if resolution is not None else scale * 72
1416
- thread_id = threading.current_thread().name
1417
- logger.debug(
1418
- f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1419
- )
1420
- lock_wait_start = time.monotonic()
1444
+ # 1. Create cache key (excluding path)
1445
+ cache_key_parts = [
1446
+ scale,
1447
+ width,
1448
+ labels,
1449
+ legend_position,
1450
+ render_ocr,
1451
+ resolution,
1452
+ include_highlights,
1453
+ exclusions,
1454
+ ]
1455
+ # Convert kwargs to a stable, hashable representation
1456
+ sorted_kwargs_list = []
1457
+ for k, v in sorted(kwargs.items()):
1458
+ if isinstance(v, list):
1459
+ try:
1460
+ v = tuple(v) # Convert lists to tuples
1461
+ except TypeError: # pragma: no cover
1462
+ # If list contains unhashable items, fall back to repr or skip
1463
+ # For simplicity, we'll try to proceed; hashing will fail if v remains unhashable
1464
+ logger.warning(f"Cache key generation: List item in kwargs['{k}'] could not be converted to tuple due to unhashable elements.")
1465
+ sorted_kwargs_list.append((k, v))
1466
+
1467
+ cache_key_parts.append(tuple(sorted_kwargs_list))
1468
+
1421
1469
  try:
1422
- # Acquire the global PDF rendering lock
1423
- with pdf_render_lock:
1424
- lock_acquired_time = time.monotonic()
1425
- logger.debug(
1426
- f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1427
- )
1428
- if include_highlights:
1429
- # Delegate rendering to the central service
1430
- image = self._highlighter.render_page(
1431
- page_index=self.index,
1432
- scale=scale,
1433
- labels=labels,
1434
- legend_position=legend_position,
1435
- render_ocr=render_ocr,
1436
- resolution=render_resolution, # Pass the calculated resolution
1437
- **kwargs,
1438
- )
1439
- else:
1440
- image = render_plain_page(self, render_resolution)
1441
- except Exception as e:
1442
- logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1443
- return None # Return None on error
1444
- finally:
1445
- render_end_time = time.monotonic()
1446
- logger.debug(
1447
- f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1448
- )
1470
+ cache_key = tuple(cache_key_parts)
1471
+ except TypeError as e: # pragma: no cover
1472
+ logger.warning(f"Page {self.index}: Could not create cache key for to_image due to unhashable item: {e}. Proceeding without cache for this call.")
1473
+ cache_key = None # Fallback to not using cache for this call
1449
1474
 
1450
- if image is None:
1451
- return None
1475
+ image_to_return: Optional[Image.Image] = None
1452
1476
 
1453
- # --- Apply exclusion masking if requested ---
1454
- if exclusions == "mask" and self._exclusions:
1477
+ # 2. Check cache
1478
+ if cache_key is not None and cache_key in self._to_image_cache:
1479
+ image_to_return = self._to_image_cache[cache_key]
1480
+ logger.debug(f"Page {self.index}: Returning cached image for key: {cache_key}")
1481
+ else:
1482
+ # --- This is the original logic to generate the image ---
1483
+ rendered_image_component: Optional[Image.Image] = None # Renamed from 'image' in original
1484
+ render_resolution = resolution if resolution is not None else scale * 72
1485
+ thread_id = threading.current_thread().name
1486
+ logger.debug(
1487
+ f"[{thread_id}] Page {self.index}: Attempting to acquire pdf_render_lock for to_image..."
1488
+ )
1489
+ lock_wait_start = time.monotonic()
1455
1490
  try:
1456
- # Ensure image is mutable (RGB or RGBA)
1457
- if image.mode not in ("RGB", "RGBA"):
1458
- image = image.convert("RGB")
1459
-
1460
- exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1461
- if exclusion_regions:
1462
- draw = ImageDraw.Draw(image)
1463
- # Calculate the scaling factor used for the image
1464
- # Base image was rendered at render_resolution (DPI)
1465
- # pdfplumber default is 72 DPI
1466
- # Scale factor = (pixels / inch) / (points / inch) = DPI / 72
1467
- img_scale = render_resolution / 72.0
1468
-
1469
- for region in exclusion_regions:
1470
- # Convert PDF points (x0, top, x1, bottom) to image pixels
1471
- img_x0 = region.x0 * img_scale
1472
- img_top = region.top * img_scale
1473
- img_x1 = region.x1 * img_scale
1474
- img_bottom = region.bottom * img_scale
1475
-
1476
- # Draw a white rectangle over the excluded area
1477
- # Ensure coordinates are within image bounds (though region should be)
1478
- img_coords = (
1479
- max(0, img_x0),
1480
- max(0, img_top),
1481
- min(image.width, img_x1),
1482
- min(image.height, img_bottom),
1491
+ # Acquire the global PDF rendering lock
1492
+ with pdf_render_lock:
1493
+ lock_acquired_time = time.monotonic()
1494
+ logger.debug(
1495
+ f"[{thread_id}] Page {self.index}: Acquired pdf_render_lock (waited {lock_acquired_time - lock_wait_start:.2f}s). Starting render..."
1496
+ )
1497
+ if include_highlights:
1498
+ # Delegate rendering to the central service
1499
+ rendered_image_component = self._highlighter.render_page(
1500
+ page_index=self.index,
1501
+ scale=scale,
1502
+ labels=labels,
1503
+ legend_position=legend_position,
1504
+ render_ocr=render_ocr,
1505
+ resolution=render_resolution, # Pass the calculated resolution
1506
+ **kwargs,
1483
1507
  )
1484
- if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1485
- draw.rectangle(img_coords, fill="white")
1486
- else:
1487
- logger.warning(
1488
- f"Skipping invalid exclusion rect for masking: {img_coords}"
1489
- )
1490
-
1491
- del draw # Release drawing context
1492
- except Exception as mask_error:
1493
- logger.error(
1494
- f"Error applying exclusion mask to page {self.index}: {mask_error}",
1495
- exc_info=True,
1508
+ else:
1509
+ rendered_image_component = render_plain_page(self, render_resolution)
1510
+ except Exception as e:
1511
+ logger.error(f"Error rendering page {self.index}: {e}", exc_info=True)
1512
+ # rendered_image_component remains None
1513
+ finally:
1514
+ render_end_time = time.monotonic()
1515
+ logger.debug(
1516
+ f"[{thread_id}] Page {self.index}: Released pdf_render_lock. Total render time (incl. lock wait): {render_end_time - lock_wait_start:.2f}s"
1496
1517
  )
1497
- # Decide if you want to return None or continue without mask
1498
- # For now, continue without mask
1499
1518
 
1500
- # Resize the final image if width is provided
1501
- if width is not None and width > 0 and image.width > 0:
1502
- aspect_ratio = image.height / image.width
1503
- height = int(width * aspect_ratio)
1504
- try:
1505
- image = image.resize(
1506
- (width, height), Image.Resampling.LANCZOS
1507
- ) # Use modern resampling
1508
- except Exception as resize_error:
1509
- logger.warning(f"Could not resize image: {resize_error}")
1510
-
1511
- # Save the image if path is provided
1512
- if path:
1519
+ if rendered_image_component is None:
1520
+ if cache_key is not None:
1521
+ self._to_image_cache[cache_key] = None # Cache the failure
1522
+ # Save the image if path is provided (will try to save None, handled by PIL/OS)
1523
+ if path:
1524
+ try:
1525
+ if os.path.dirname(path):
1526
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1527
+ if rendered_image_component is not None: # Should be None here
1528
+ rendered_image_component.save(path) # This line won't be hit if None
1529
+ # else: logger.debug("Not saving None image") # Not strictly needed
1530
+ except Exception as save_error: # pragma: no cover
1531
+ logger.error(f"Failed to save image to {path}: {save_error}")
1532
+ return None
1533
+
1534
+ # --- Apply exclusion masking if requested ---
1535
+ # This modifies 'rendered_image_component'
1536
+ image_after_masking = rendered_image_component # Start with the rendered image
1537
+ if exclusions == "mask" and self._exclusions:
1538
+ try:
1539
+ # Ensure image is mutable (RGB or RGBA)
1540
+ if image_after_masking.mode not in ("RGB", "RGBA"):
1541
+ image_after_masking = image_after_masking.convert("RGB")
1542
+
1543
+ exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=False)
1544
+ if exclusion_regions:
1545
+ draw = ImageDraw.Draw(image_after_masking)
1546
+ # Calculate the scaling factor used for the image
1547
+ img_scale = render_resolution / 72.0
1548
+
1549
+ for region in exclusion_regions:
1550
+ # Convert PDF points (x0, top, x1, bottom) to image pixels
1551
+ img_x0 = region.x0 * img_scale
1552
+ img_top = region.top * img_scale
1553
+ img_x1 = region.x1 * img_scale
1554
+ img_bottom = region.bottom * img_scale
1555
+
1556
+ # Draw a white rectangle over the excluded area
1557
+ img_coords = (
1558
+ max(0, img_x0),
1559
+ max(0, img_top),
1560
+ min(image_after_masking.width, img_x1),
1561
+ min(image_after_masking.height, img_bottom),
1562
+ )
1563
+ if img_coords[0] < img_coords[2] and img_coords[1] < img_coords[3]:
1564
+ draw.rectangle(img_coords, fill="white")
1565
+ else: # pragma: no cover
1566
+ logger.warning(
1567
+ f"Skipping invalid exclusion rect for masking: {img_coords}"
1568
+ )
1569
+ del draw # Release drawing context
1570
+ except Exception as mask_error: # pragma: no cover
1571
+ logger.error(
1572
+ f"Error applying exclusion mask to page {self.index}: {mask_error}",
1573
+ exc_info=True,
1574
+ )
1575
+ # Continue with potentially unmasked or partially masked image
1576
+
1577
+ # --- Resize the final image if width is provided ---
1578
+ image_final_content = image_after_masking # Start with image after masking
1579
+ if width is not None and width > 0 and image_final_content.width > 0:
1580
+ aspect_ratio = image_final_content.height / image_final_content.width
1581
+ height = int(width * aspect_ratio)
1582
+ try:
1583
+ image_final_content = image_final_content.resize(
1584
+ (width, height), Image.Resampling.LANCZOS
1585
+ )
1586
+ except Exception as resize_error: # pragma: no cover
1587
+ logger.warning(f"Could not resize image: {resize_error}")
1588
+ # image_final_content remains the un-resized version if resize fails
1589
+
1590
+ # Store in cache
1591
+ if cache_key is not None:
1592
+ self._to_image_cache[cache_key] = image_final_content
1593
+ logger.debug(f"Page {self.index}: Cached image for key: {cache_key}")
1594
+ image_to_return = image_final_content
1595
+ # --- End of cache miss block ---
1596
+
1597
+ # Save the image (either from cache or newly generated) if path is provided
1598
+ if path and image_to_return:
1513
1599
  try:
1514
1600
  # Ensure directory exists
1515
- os.makedirs(os.path.dirname(path), exist_ok=True)
1516
- image.save(path)
1601
+ if os.path.dirname(path): # Only call makedirs if there's a directory part
1602
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1603
+ image_to_return.save(path)
1517
1604
  logger.debug(f"Saved page image to: {path}")
1518
- except Exception as save_error:
1605
+ except Exception as save_error: # pragma: no cover
1519
1606
  logger.error(f"Failed to save image to {path}: {save_error}")
1520
1607
 
1521
- return image
1608
+ return image_to_return
1522
1609
 
1523
1610
  def _create_text_elements_from_ocr(
1524
1611
  self, ocr_results: List[Dict[str, Any]], image_width=None, image_height=None
@@ -1986,7 +2073,6 @@ class Page(ClassificationMixin, ExtractionMixin):
1986
2073
  region.is_end_next_start = False
1987
2074
  regions.append(region)
1988
2075
 
1989
- # Return the list wrapped in an ElementCollection
1990
2076
  return ElementCollection(regions)
1991
2077
 
1992
2078
  def __repr__(self) -> str:
@@ -2112,7 +2198,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2112
2198
  def viewer(
2113
2199
  self,
2114
2200
  # elements_to_render: Optional[List['Element']] = None, # No longer needed, from_page handles it
2115
- # include_element_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2201
+ # include_source_types: List[str] = ['word', 'line', 'rect', 'region'] # No longer needed
2116
2202
  ) -> Optional["SimpleInteractiveViewerWidget"]: # Return type hint updated
2117
2203
  """
2118
2204
  Creates and returns an interactive ipywidget for exploring elements on this page.
@@ -2213,6 +2299,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2213
2299
  def correct_ocr(
2214
2300
  self,
2215
2301
  correction_callback: Callable[[Any], Optional[str]],
2302
+ selector: Optional[str] = "text[source=ocr]",
2216
2303
  max_workers: Optional[int] = None,
2217
2304
  progress_callback: Optional[Callable[[], None]] = None, # Added progress callback
2218
2305
  ) -> "Page": # Return self for chaining
@@ -2240,7 +2327,7 @@ class Page(ClassificationMixin, ExtractionMixin):
2240
2327
  )
2241
2328
 
2242
2329
  target_elements_collection = self.find_all(
2243
- selector="text[source=ocr]", apply_exclusions=False
2330
+ selector=selector, apply_exclusions=False
2244
2331
  )
2245
2332
  target_elements = target_elements_collection.elements # Get the list
2246
2333
 
@@ -2248,102 +2335,112 @@ class Page(ClassificationMixin, ExtractionMixin):
2248
2335
  logger.info(f"Page {self.number}: No OCR elements found to correct.")
2249
2336
  return self
2250
2337
 
2251
- processed_count = 0
2252
- updated_count = 0
2253
- error_count = 0
2338
+ element_pbar = None
2339
+ try:
2340
+ element_pbar = tqdm(total=len(target_elements), desc=f"Correcting OCR Page {self.number}", unit="element", leave=False)
2254
2341
 
2255
- # Define the task to be run by the worker thread or sequentially
2256
- def _process_element_task(element):
2257
- try:
2258
- current_text = getattr(element, "text", None)
2259
- # Call the user-provided callback
2260
- corrected_text = correction_callback(element)
2342
+ processed_count = 0
2343
+ updated_count = 0
2344
+ error_count = 0
2261
2345
 
2262
- # Validate result type
2263
- if corrected_text is not None and not isinstance(corrected_text, str):
2264
- logger.warning(
2265
- f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
2266
- )
2267
- return element, None, None # Treat as no correction
2346
+ # Define the task to be run by the worker thread or sequentially
2347
+ def _process_element_task(element):
2348
+ try:
2349
+ current_text = getattr(element, "text", None)
2350
+ # Call the user-provided callback
2351
+ corrected_text = correction_callback(element)
2268
2352
 
2269
- return element, corrected_text, None # Return element, result, no error
2270
- except Exception as e:
2271
- logger.error(
2272
- f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2273
- exc_info=False, # Keep log concise
2274
- )
2275
- return element, None, e # Return element, no result, error
2276
- finally:
2277
- # --- Call progress callback here --- #
2278
- if progress_callback:
2279
- try:
2280
- progress_callback()
2281
- except Exception as cb_e:
2282
- # Log error in callback itself, but don't stop processing
2283
- logger.error(
2284
- f"Page {self.number}: Error executing progress_callback: {cb_e}",
2285
- exc_info=False,
2353
+ # Validate result type
2354
+ if corrected_text is not None and not isinstance(corrected_text, str):
2355
+ logger.warning(
2356
+ f"Page {self.number}: Correction callback for element '{getattr(element, 'text', '')[:20]}...' returned non-string, non-None type: {type(corrected_text)}. Skipping update."
2286
2357
  )
2358
+ return element, None, None # Treat as no correction
2287
2359
 
2288
- # Choose execution strategy based on max_workers
2289
- if max_workers is not None and max_workers > 1:
2290
- # --- Parallel execution --- #
2291
- logger.info(
2292
- f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2293
- )
2294
- futures = []
2295
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2296
- # Submit all tasks
2297
- future_to_element = {
2298
- executor.submit(_process_element_task, element): element
2299
- for element in target_elements
2300
- }
2360
+ return element, corrected_text, None # Return element, result, no error
2361
+ except Exception as e:
2362
+ logger.error(
2363
+ f"Page {self.number}: Error applying correction callback to element '{getattr(element, 'text', '')[:30]}...' ({element.bbox}): {e}",
2364
+ exc_info=False, # Keep log concise
2365
+ )
2366
+ return element, None, e # Return element, no result, error
2367
+ finally:
2368
+ # --- Update internal tqdm progress bar ---
2369
+ if element_pbar:
2370
+ element_pbar.update(1)
2371
+ # --- Call user's progress callback --- #
2372
+ if progress_callback:
2373
+ try:
2374
+ progress_callback()
2375
+ except Exception as cb_e:
2376
+ # Log error in callback itself, but don't stop processing
2377
+ logger.error(
2378
+ f"Page {self.number}: Error executing progress_callback: {cb_e}",
2379
+ exc_info=False,
2380
+ )
2301
2381
 
2302
- # Process results as they complete (progress_callback called by worker)
2303
- for future in concurrent.futures.as_completed(future_to_element):
2304
- processed_count += 1
2305
- try:
2306
- element, corrected_text, error = future.result()
2307
- if error:
2382
+ # Choose execution strategy based on max_workers
2383
+ if max_workers is not None and max_workers > 1:
2384
+ # --- Parallel execution --- #
2385
+ logger.info(
2386
+ f"Page {self.number}: Running OCR correction in parallel with {max_workers} workers."
2387
+ )
2388
+ futures = []
2389
+ with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
2390
+ # Submit all tasks
2391
+ future_to_element = {
2392
+ executor.submit(_process_element_task, element): element
2393
+ for element in target_elements
2394
+ }
2395
+
2396
+ # Process results as they complete (progress_callback called by worker)
2397
+ for future in concurrent.futures.as_completed(future_to_element):
2398
+ processed_count += 1
2399
+ try:
2400
+ element, corrected_text, error = future.result()
2401
+ if error:
2402
+ error_count += 1
2403
+ # Error already logged in worker
2404
+ elif corrected_text is not None:
2405
+ # Apply correction if text changed
2406
+ current_text = getattr(element, "text", None)
2407
+ if corrected_text != current_text:
2408
+ element.text = corrected_text
2409
+ updated_count += 1
2410
+ except Exception as exc:
2411
+ # Catch errors from future.result() itself
2412
+ element = future_to_element[future] # Find original element
2413
+ logger.error(
2414
+ f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
2415
+ exc_info=True,
2416
+ )
2308
2417
  error_count += 1
2309
- # Error already logged in worker
2310
- elif corrected_text is not None:
2311
- # Apply correction if text changed
2312
- current_text = getattr(element, "text", None)
2313
- if corrected_text != current_text:
2314
- element.text = corrected_text
2315
- updated_count += 1
2316
- except Exception as exc:
2317
- # Catch errors from future.result() itself
2318
- element = future_to_element[future] # Find original element
2319
- logger.error(
2320
- f"Page {self.number}: Internal error retrieving correction result for element {element.bbox}: {exc}",
2321
- exc_info=True,
2322
- )
2323
- error_count += 1
2324
- # Note: progress_callback was already called in the worker's finally block
2418
+ # Note: progress_callback was already called in the worker's finally block
2325
2419
 
2326
- else:
2327
- # --- Sequential execution --- #
2328
- logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2329
- for element in target_elements:
2330
- # Call the task function directly (it handles progress_callback)
2331
- processed_count += 1
2332
- _element, corrected_text, error = _process_element_task(element)
2333
- if error:
2334
- error_count += 1
2335
- elif corrected_text is not None:
2336
- # Apply correction if text changed
2337
- current_text = getattr(_element, "text", None)
2338
- if corrected_text != current_text:
2339
- _element.text = corrected_text
2340
- updated_count += 1
2420
+ else:
2421
+ # --- Sequential execution --- #
2422
+ logger.info(f"Page {self.number}: Running OCR correction sequentially.")
2423
+ for element in target_elements:
2424
+ # Call the task function directly (it handles progress_callback)
2425
+ processed_count += 1
2426
+ _element, corrected_text, error = _process_element_task(element)
2427
+ if error:
2428
+ error_count += 1
2429
+ elif corrected_text is not None:
2430
+ # Apply correction if text changed
2431
+ current_text = getattr(_element, "text", None)
2432
+ if corrected_text != current_text:
2433
+ _element.text = corrected_text
2434
+ updated_count += 1
2341
2435
 
2342
- logger.info(
2343
- f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2344
- )
2436
+ logger.info(
2437
+ f"Page {self.number}: OCR correction finished. Processed: {processed_count}/{len(target_elements)}, Updated: {updated_count}, Errors: {error_count}."
2438
+ )
2345
2439
 
2346
- return self # Return self for chaining
2440
+ return self # Return self for chaining
2441
+ finally:
2442
+ if element_pbar:
2443
+ element_pbar.close()
2347
2444
 
2348
2445
  # --- Classification Mixin Implementation --- #
2349
2446
  def _get_classification_manager(self) -> "ClassificationManager":
natural_pdf/core/pdf.py CHANGED
@@ -38,7 +38,7 @@ from natural_pdf.extraction.mixin import ExtractionMixin
38
38
  from natural_pdf.ocr import OCRManager, OCROptions
39
39
  from natural_pdf.selectors.parser import parse_selector
40
40
  from natural_pdf.utils.locks import pdf_render_lock
41
- from natural_pdf.utils.tqdm_utils import get_tqdm
41
+ from tqdm.auto import tqdm
42
42
 
43
43
  try:
44
44
  from typing import Any as TypingAny
@@ -71,7 +71,6 @@ except ImportError:
71
71
  create_original_pdf = None
72
72
 
73
73
  logger = logging.getLogger("natural_pdf.core.pdf")
74
- tqdm = get_tqdm()
75
74
 
76
75
  DEFAULT_MANAGERS = {
77
76
  "classification": ClassificationManager,
@@ -1253,6 +1252,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1253
1252
  self,
1254
1253
  pages: Optional[Union[Iterable[int], range, slice]] = None,
1255
1254
  resolution: int = 300,
1255
+ angle: Optional[float] = None,
1256
1256
  detection_resolution: int = 72,
1257
1257
  force_overwrite: bool = False,
1258
1258
  **deskew_kwargs,
@@ -1271,6 +1271,7 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1271
1271
  Args:
1272
1272
  pages: Page indices/slice to include (0-based). If None, processes all pages.
1273
1273
  resolution: DPI resolution for rendering the output deskewed pages.
1274
+ angle: The specific angle (in degrees) to rotate by. If None, detects automatically.
1274
1275
  detection_resolution: DPI resolution used for skew detection if angles are not
1275
1276
  already cached on the page objects.
1276
1277
  force_overwrite: If False (default), raises a ValueError if any target page
@@ -1315,14 +1316,13 @@ class PDF(ExtractionMixin, ExportMixin, ClassificationMixin):
1315
1316
  deskewed_images_bytes = []
1316
1317
  logger.info(f"Deskewing {len(target_pages)} pages (output resolution={resolution} DPI)...")
1317
1318
 
1318
- # Use tqdm via get_tqdm
1319
1319
  for page in tqdm(target_pages, desc="Deskewing Pages", leave=False):
1320
1320
  try:
1321
1321
  # Use page.deskew to get the corrected PIL image
1322
1322
  # Pass down resolutions and kwargs
1323
1323
  deskewed_img = page.deskew(
1324
1324
  resolution=resolution,
1325
- angle=None, # Let page.deskew handle detection/caching
1325
+ angle=angle, # Let page.deskew handle detection/caching
1326
1326
  detection_resolution=detection_resolution,
1327
1327
  **deskew_kwargs,
1328
1328
  )