natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49):
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
@@ -5,6 +5,8 @@ import numpy as np
5
5
  from PIL import Image, ImageDraw
6
6
  from scipy.ndimage import binary_closing, binary_opening, gaussian_filter1d
7
7
  from scipy.signal import find_peaks
8
+ from sklearn.cluster import MiniBatchKMeans
9
+ from scipy.ndimage import label as nd_label, find_objects
8
10
 
9
11
  if TYPE_CHECKING:
10
12
  from natural_pdf.core.page import Page
@@ -1766,6 +1768,232 @@ class ShapeDetectionMixin:
1766
1768
 
1767
1769
  return self
1768
1770
 
1771
def detect_blobs(
    self,
    k: Optional[int] = None,
    tolerance: float = 40.0,
    min_area_pts: float = 400.0,
    resolution: int = 150,
    replace: bool = True,
    source_label: str = "detected",
    overlap_threshold: float = 0.5,
) -> "ShapeDetectionMixin":
    """Detect colour blobs on a page/region and convert them to Region objects.

    The page/region is rasterised, its pixels are clustered by colour with
    ``MiniBatchKMeans``, similar clusters are optionally merged, and each
    connected component of each cluster that survives the filters below is
    registered on the page as a ``Region`` with ``region_type == "blob"``.

    Args:
        k: Desired number of colour clusters. ``None`` → choose k automatically
            from the range 2‒15 based on the drop in k-means inertia.
        tolerance: Delta-E CIE2000 distance below which two colour clusters
            are merged. Higher values merge more shades; 0 disables merging.
            The most populous cluster (assumed to be the background) is never
            merged with anything. Merging is skipped entirely when the
            optional ``colormath2`` package is not installed.
        min_area_pts: Ignore components whose bounding-box area in PDF points²
            is smaller than this value.
        resolution: DPI used for rasterising the page/region before detection.
        replace: If *True* purge existing ``region[type=blob]`` that share the
            same ``source_label`` before adding new ones.
        source_label: Stored in ``region.source`` so callers can distinguish
            between different detection passes.
        overlap_threshold: Discard a blob when the summed area of the page's
            characters intersecting its bounding box, divided by the bounding
            box area, is greater than or equal to this fraction (0‒1). Only
            characters are checked, not other vector elements.

    Returns:
        ``self``, so calls can be chained. Also returned unchanged when no
        raster image or page object could be obtained.
    """
    # Acquire raster image & scale info
    cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(
        resolution
    )
    if cv_image is None or page_obj is None:
        return self  # nothing to do

    h, w, _ = cv_image.shape
    # Flatten to one row per pixel, channels normalised to 0-1.
    pixels = cv_image.reshape(-1, 3).astype(np.float32) / 255.0

    # ── choose k ────────────────────────────────────────────────────────────
    if k is None:
        # Fit the candidate models on a subsample (~50k pixels) to keep this cheap.
        sample = pixels[:: max(1, pixels.shape[0] // 50000)]
        ks = list(range(2, 16))  # 2 … 15
        inertias = []
        for candidate_k in ks:
            km = MiniBatchKMeans(n_clusters=candidate_k, random_state=0, batch_size=1024)
            km.fit(sample)
            inertias.append(km.inertia_)
        # NOTE(review): argmin of the first difference locates the largest
        # inertia drop between ks[i] and ks[i + 1], and ks[i] (the k *before*
        # the drop) is selected. ks[i + 1] may have been intended; confirm
        # before changing, since it alters detection results.
        k = ks[int(np.argmin(np.diff(inertias)))]

    # fit final model on every pixel
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=1024)
    full_labels = kmeans.fit_predict(pixels)
    centroids = kmeans.cluster_centers_  # in 0-1 RGB
    merged_labels = full_labels

    # ------------------------------------------------------------------
    # Merge clusters whose centroid colours are perceptually close
    # (Delta-E CIE2000 < tolerance). The most frequent cluster (likely
    # background) is excluded from merging so that real colourful blobs
    # don't disappear into it when tolerance is large.
    # ------------------------------------------------------------------
    try:
        from colormath2.color_conversions import convert_color
        from colormath2.color_diff import delta_e_cie2000
        from colormath2.color_objects import LabColor, sRGBColor
    except ImportError:
        # colormath2 not available – skip merging
        pass
    else:
        counts = np.bincount(full_labels, minlength=k)
        bg_cluster = int(np.argmax(counts))  # largest cluster by pixel count

        lab_centroids = [convert_color(sRGBColor(*rgb), LabColor) for rgb in centroids]

        # Union-find over cluster ids (with path halving in find()).
        parent = list(range(k))

        def find(x):
            while parent[x] != x:
                parent[x] = parent[parent[x]]
                x = parent[x]
            return x

        for i in range(k):
            for j in range(i + 1, k):
                if bg_cluster in (i, j):
                    continue  # background never takes part in a merge
                if delta_e_cie2000(lab_centroids[i], lab_centroids[j]) < tolerance:
                    root_i, root_j = find(i), find(j)
                    if root_i != root_j:
                        parent[root_j] = root_i

        # Remap every cluster id to its root in a single vectorised lookup.
        root_lookup = np.asarray([find(idx) for idx in range(k)])
        merged_labels = root_lookup[full_labels]

    labels_img = merged_labels.reshape(h, w)

    # ── optional purge ──
    if replace and hasattr(page_obj, "_element_mgr"):
        old_blobs = [
            r
            for r in page_obj._element_mgr.regions
            if getattr(r, "region_type", None) == "blob"
            and getattr(r, "source", None) == source_label
        ]
        for r in old_blobs:
            try:
                page_obj._element_mgr.regions.remove(r)
            except ValueError:
                pass  # already gone

    # Loop-invariant values, computed once rather than per component.
    # Region is imported at function scope, as the original did.
    from natural_pdf.elements.region import Region

    page_area_pts = page_obj.width * page_obj.height
    chars = getattr(page_obj, "chars", []) or []
    opening_structure = np.ones((3, 3))

    # ── iterate clusters ───────────────────────────────────────────────────
    for cluster_id in np.unique(labels_img):
        # Morphological opening removes tiny specks to avoid too many components.
        mask = binary_opening(labels_img == cluster_id, structure=opening_structure)
        if not mask.any():
            continue
        comp_labels, _n_comps = nd_label(mask)

        # find_objects() returns one slice pair per label, for labels 1..n.
        for comp_id, sl in enumerate(find_objects(comp_labels), start=1):
            if sl is None:
                continue
            y0, y1 = sl[0].start, sl[0].stop
            x0, x1 = sl[1].start, sl[1].stop

            # bbox area in pixels → in pts²
            area_pts = (y1 - y0) * (x1 - x0) * (scale_factor ** 2)

            # Skip tiny regions
            if area_pts < min_area_pts:
                continue

            # Skip page-background blocks (> 80 % of the page area)
            if area_pts / page_area_pts > 0.8:
                continue

            # Mean colour over the component's own pixels only; averaging the
            # whole bounding box would blend in whatever else lies inside it.
            comp_mask = comp_labels[sl] == comp_id
            avg_rgb = cv_image[sl][comp_mask].mean(axis=0) / 255.0  # 0-1 range

            # Skip almost-white / almost-black near-grayscale areas
            # (likely background or text)
            brightness = float(np.mean(avg_rgb))
            color_std = float(np.std(avg_rgb))
            if color_std < 0.05 and (brightness < 0.2 or brightness > 0.95):
                continue

            # Bounding box mapped from pixel to PDF coordinates.
            rx0 = origin_offset_pdf[0] + x0 * scale_factor
            rtop = origin_offset_pdf[1] + y0 * scale_factor
            rx1 = origin_offset_pdf[0] + x1 * scale_factor
            rbot = origin_offset_pdf[1] + y1 * scale_factor
            region_area_pts = (rx1 - rx0) * (rbot - rtop)
            if region_area_pts == 0:
                continue

            # ----------------------------------------------------------------
            # Check overlap with characters BEFORE creating the Region. If the
            # covered fraction reaches overlap_threshold the blob is discarded
            # as likely text fill.
            # ----------------------------------------------------------------
            overlap_area = 0.0
            for ch in chars:
                ix0 = max(rx0, ch.x0)
                iy0 = max(rtop, ch.top)
                ix1 = min(rx1, ch.x1)
                iy1 = min(rbot, ch.bottom)
                if ix1 > ix0 and iy1 > iy0:
                    overlap_area += (ix1 - ix0) * (iy1 - iy0)
                    if overlap_area / region_area_pts >= overlap_threshold:
                        break  # threshold reached; no need to scan further

            if overlap_area / region_area_pts >= overlap_threshold:
                continue  # skip, mostly text

            region = Region(page_obj, (rx0, rtop, rx1, rbot))
            region.region_type = "blob"
            region.normalized_type = "blob"
            region.source = source_label

            # Produce compact web colour using the 'colour' library if available
            try:
                from colour import Color  # type: ignore

                hex_str = str(Color(rgb=tuple(avg_rgb)))  # gives named/shortest rep
            except Exception:
                # Best-effort: library missing or it rejected the value.
                # Round (not truncate) so e.g. 0.999 maps to ff rather than fe.
                hex_str = "#{:02x}{:02x}{:02x}".format(
                    *(int(round(float(channel) * 255)) for channel in avg_rgb)
                )
            region.rgb = tuple(map(float, avg_rgb))  # numeric backup
            region.color = hex_str
            region.fill = hex_str

            # Store readable colour for inspection tables
            region.metadata["color_hex"] = hex_str

            page_obj._element_mgr.add_region(region)

    return self
1996
+
1769
1997
 
1770
1998
  # Example usage would be:
1771
1999
  # page.detect_lines(source_label="my_table_lines")
@@ -448,3 +448,70 @@ class ClassificationManager:
448
448
  raise ClassificationError(
449
449
  f"Batch classification failed using model '{model_id}'. Error: {e}"
450
450
  ) from e
451
+
452
def cleanup_models(self, model_id: Optional[str] = None) -> int:
    """
    Cleanup classification models to free memory.

    Evicts entries from the module-level pipeline, tokenizer and model
    caches. Each evicted pipeline's model is moved to the CPU when it
    exposes ``.to``; afterwards the CUDA allocator cache is emptied once if
    CUDA is available. GPU handling is best-effort: failures are logged at
    debug level and never raised.

    Args:
        model_id: Specific model to cleanup, or None to cleanup all models.
            A cache entry matches when ``model_id in key`` is true.

    Returns:
        Number of pipelines cleaned up (tokenizer/model cache entries are
        evicted too but are not counted).
    """

    def _offload_to_cpu(pipeline, description):
        # Move the wrapped model off the GPU; never let a failure propagate.
        if pipeline is None or not hasattr(pipeline, "model"):
            return
        try:
            if hasattr(pipeline.model, "to"):
                pipeline.model.to("cpu")  # Move to CPU
        except Exception as e:
            logger.debug(f"GPU cleanup failed for {description}: {e}")

    cleaned_count = 0
    evicted_any = False

    if model_id:
        # Cleanup specific model - search cache keys that contain the model_id
        for key in [key for key in _PIPELINE_CACHE if model_id in key]:
            pipeline = _PIPELINE_CACHE.pop(key, None)
            _offload_to_cpu(pipeline, f"model {model_id}")
            cleaned_count += 1
            evicted_any = True
            logger.info(f"Cleaned up classification pipeline: {key}")

        # Also cleanup tokenizer and model caches for this model
        for cache in (_TOKENIZER_CACHE, _MODEL_CACHE):
            for key in [key for key in cache if model_id in key]:
                cache.pop(key, None)
                evicted_any = True
    else:
        # Cleanup all models
        for key, pipeline in list(_PIPELINE_CACHE.items()):
            _offload_to_cpu(pipeline, f"pipeline {key}")

        pipeline_count = len(_PIPELINE_CACHE)
        evicted_any = bool(_PIPELINE_CACHE or _TOKENIZER_CACHE or _MODEL_CACHE)

        # Clear all caches
        _PIPELINE_CACHE.clear()
        _TOKENIZER_CACHE.clear()
        _MODEL_CACHE.clear()

        if pipeline_count > 0:
            logger.info(f"Cleaned up {pipeline_count} classification models")
            cleaned_count = pipeline_count

    if evicted_any:
        # Empty the CUDA cache once, after the cache references are dropped,
        # rather than once per pipeline while entries are still held.
        try:
            torch = _get_torch()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()  # Clear GPU cache
        except Exception as e:
            logger.debug(f"GPU cache clear failed: {e}")

    return cleaned_count