natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +44 -0
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/ocr_manager.py +50 -0
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
natural_pdf/analyzers/shape_detection_mixin.py

@@ -5,6 +5,8 @@ import numpy as np
 from PIL import Image, ImageDraw
 from scipy.ndimage import binary_closing, binary_opening, gaussian_filter1d
 from scipy.signal import find_peaks
+from sklearn.cluster import MiniBatchKMeans
+from scipy.ndimage import label as nd_label, find_objects
 
 if TYPE_CHECKING:
     from natural_pdf.core.page import Page
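The two new imports are standard SciPy/scikit-learn primitives. As orientation, here is a minimal self-contained sketch (not from the package) of what each one does: `nd_label`/`find_objects` turn a boolean mask into per-component bounding-box slices, and `MiniBatchKMeans` clusters flattened RGB pixels, which is exactly the call pattern `detect_blobs` uses in the next hunk.

```python
# Illustrative sketch (not package code): the two new imports in isolation.
import numpy as np
from scipy.ndimage import label as nd_label, find_objects
from sklearn.cluster import MiniBatchKMeans

# nd_label/find_objects: boolean mask -> connected components -> bbox slices.
mask = np.zeros((6, 6), dtype=bool)
mask[0:2, 0:2] = True   # "blob" 1, top-left
mask[4:6, 3:6] = True   # "blob" 2, bottom-right
comp_labels, n_comps = nd_label(mask)   # labels components 1..n_comps
for sl in find_objects(comp_labels):    # one (row-slice, col-slice) per component
    ys, xs = sl
    print((xs.start, ys.start, xs.stop, ys.stop))  # (0, 0, 2, 2) then (3, 4, 6, 6)

# MiniBatchKMeans: cluster flattened RGB pixels into k colour groups.
pixels = np.random.rand(10_000, 3).astype(np.float32)  # stand-in for image pixels
km = MiniBatchKMeans(n_clusters=3, random_state=0, batch_size=1024)
labels = km.fit_predict(pixels)          # one cluster id per pixel
print(labels.shape, km.cluster_centers_.shape)  # (10000,) (3, 3)
```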
@@ -1766,6 +1768,232 @@ class ShapeDetectionMixin:
 
         return self
 
+    def detect_blobs(
+        self,
+        k: Optional[int] = None,
+        tolerance: float = 40.0,
+        min_area_pts: float = 400.0,
+        resolution: int = 150,
+        replace: bool = True,
+        source_label: str = "detected",
+        overlap_threshold: float = 0.5,
+    ) -> "ShapeDetectionMixin":
+        """Detect colour blobs on a page/region and convert them to Region objects.
+
+        Args:
+            k: Desired number of colour clusters. ``None`` → automatically choose k
+                (2‒15) using the elbow/knee method on inertia.
+            tolerance: Maximum Delta-E CIE2000 distance at which two colour
+                clusters are merged (40 ≈ perceptually "very similar"). Higher
+                values merge more shades; set 0 to disable.
+            min_area_pts: Ignore components whose bounding-box area in PDF points² is
+                smaller than this value.
+            resolution: DPI used for rasterising the page/region before detection.
+            replace: If *True*, purge existing ``region[type=blob]`` that share the
+                same ``source_label`` before adding new ones.
+            source_label: Stored in ``region.source`` so callers can distinguish
+                between different detection passes.
+            overlap_threshold: After blobs are built, discard any blob whose
+                area overlaps vector elements (rects/words/lines/curves) by
+                more than this fraction (0‒1). Use this instead of pixel
+                masking so large painted areas are not cut by text boxes.
+        """
+        import numpy as np
+        from scipy.ndimage import label as nd_label, find_objects
+
+        # Acquire raster image & scale info
+        cv_image, scale_factor, origin_offset_pdf, page_obj = self._get_image_for_detection(resolution)
+        if cv_image is None or page_obj is None:
+            return self  # nothing to do
+        img_arr = cv_image.reshape(-1, 3).astype(np.float32) / 255.0  # normalised
+
+        # No pre-masking of vector boxes; cluster entire image.
+        h, w, _ = cv_image.shape
+        unmasked_pixels = np.full(img_arr.shape[0], True, dtype=bool)
+        img_arr_unmasked = img_arr  # cluster all pixels
+
+        # ── choose k ──────────────────────────────────────────────────────
+        if k is None:
+            inertias = []
+            ks = list(range(2, 16))  # 2 … 15
+            for _k in ks:
+                km = MiniBatchKMeans(n_clusters=_k, random_state=0, batch_size=1024)
+                km.fit(img_arr_unmasked[:: max(1, img_arr_unmasked.shape[0] // 50000)])  # subsample
+                inertias.append(km.inertia_)
+            # knee: biggest drop in inertia
+            diffs = np.diff(inertias)
+            knee_idx = int(np.argmin(diffs))  # most negative diff
+            k = ks[knee_idx]
+        # fit final model
+        kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=1024)
+        full_labels = kmeans.fit_predict(img_arr_unmasked)
+        centroids = kmeans.cluster_centers_  # in 0-1 RGB
+        h, w, _ = cv_image.shape
+        full_label_flat = np.full(img_arr.shape[0], -1, dtype=int)
+        full_label_flat[unmasked_pixels] = full_labels
+        labels_img = full_label_flat.reshape(h, w)
+
+        # ------------------------------------------------------------------
+        # Merge clusters whose centroid colours are perceptually close
+        # (Delta-E CIE2000 < tolerance). We first identify the most frequent
+        # cluster (likely background) and do NOT merge *into* it so that real
+        # colourful blobs don't disappear when tolerance is large.
+        # ------------------------------------------------------------------
+        try:
+            from colormath2.color_conversions import convert_color
+            from colormath2.color_diff import delta_e_cie2000
+            from colormath2.color_objects import LabColor, sRGBColor
+
+            # Compute pixel counts per cluster to locate background
+            counts = np.bincount(full_labels, minlength=k)
+            bg_cluster = int(np.argmax(counts))  # largest cluster by pixel count
+
+            lab_centroids = [convert_color(sRGBColor(*rgb), LabColor) for rgb in centroids]
+
+            # Union-find parent array
+            parent = list(range(k))
+
+            def find(x):
+                while parent[x] != x:
+                    parent[x] = parent[parent[x]]
+                    x = parent[x]
+                return x
+
+            def union(a, b):
+                ra, rb = find(a), find(b)
+                if ra != rb:
+                    parent[rb] = ra
+
+            for i in range(k):
+                for j in range(i + 1, k):
+                    if bg_cluster in (i, j):
+                        continue  # never merge INTO background
+                    if delta_e_cie2000(lab_centroids[i], lab_centroids[j]) < tolerance:
+                        union(i, j)
+
+            # Remap every cluster id to its root parent
+            root_map = [find(idx) for idx in range(k)]
+            for old_id, new_id in enumerate(root_map):
+                if old_id != new_id:
+                    full_label_flat[full_label_flat == old_id] = new_id
+
+        except ImportError:
+            # colormath2 not available – skip merging
+            pass
+
+        labels_img = full_label_flat.reshape(h, w)
+
+        # ── optional purge ──
+        if replace and hasattr(page_obj, "_element_mgr"):
+            old_blobs = [
+                r
+                for r in page_obj._element_mgr.regions
+                if getattr(r, "region_type", None) == "blob"
+                and getattr(r, "source", None) == source_label
+            ]
+            for r in old_blobs:
+                try:
+                    page_obj._element_mgr.regions.remove(r)
+                except ValueError:
+                    pass
+
+        # ── iterate clusters ──────────────────────────────────────────────
+        unique_clusters = [cid for cid in np.unique(labels_img) if cid >= 0]
+        for c_idx in unique_clusters:
+            mask = labels_img == c_idx
+            # clean tiny specks to avoid too many components
+            mask_small = binary_opening(mask, structure=np.ones((3, 3)))
+            # Bridge small gaps so contiguous paint isn't split by tiny holes
+            if not mask_small.any():
+                continue
+            comp_labels, n_comps = nd_label(mask_small)
+            if n_comps == 0:
+                continue
+            slices = find_objects(comp_labels)
+            for comp_idx, sl in enumerate(slices):
+                if sl is None:
+                    continue
+                y0, y1 = sl[0].start, sl[0].stop
+                x0, x1 = sl[1].start, sl[1].stop
+                # bbox area in pixels → in pts²
+                area_pixels = (y1 - y0) * (x1 - x0)
+                area_pts = area_pixels * (scale_factor ** 2)
+
+                # Skip tiny regions
+                if area_pts < min_area_pts:
+                    continue
+
+                # Skip page-background blocks (≥80 % page area)
+                page_area_pts = page_obj.width * page_obj.height
+                if area_pts / page_area_pts > 0.8:
+                    continue
+
+                # Compute mean colour of the component
+                comp_pixels = cv_image[y0:y1, x0:x1]
+                avg_rgb = comp_pixels.mean(axis=(0, 1)) / 255.0  # 0-1 range
+                # Skip almost-white / almost-black near-grayscale areas (likely background or text)
+                brightness = float(np.mean(avg_rgb))
+                color_std = float(np.std(avg_rgb))
+                if color_std < 0.05 and (brightness < 0.2 or brightness > 0.95):
+                    continue
+
+                # ----------------------------------------------------------------
+                # Check overlap with characters BEFORE creating the Region.
+                # If more than overlap_threshold of the blob area is covered by
+                # any characters we discard it as likely text fill.
+                # ----------------------------------------------------------------
+
+                region_bbox_pdf = (
+                    origin_offset_pdf[0] + x0 * scale_factor,
+                    origin_offset_pdf[1] + y0 * scale_factor,
+                    origin_offset_pdf[0] + x1 * scale_factor,
+                    origin_offset_pdf[1] + y1 * scale_factor,
+                )
+
+                rx0, rtop, rx1, rbot = region_bbox_pdf
+                region_area_pts = (rx1 - rx0) * (rbot - rtop)
+                if region_area_pts == 0:
+                    continue
+
+                chars = getattr(page_obj, "chars", []) or []
+                overlap_area = 0.0
+                for ch in chars:
+                    vx0, vtop, vx1, vbot = ch.x0, ch.top, ch.x1, ch.bottom
+                    ix0 = max(rx0, vx0)
+                    iy0 = max(rtop, vtop)
+                    ix1 = min(rx1, vx1)
+                    iy1 = min(rbot, vbot)
+                    if ix1 > ix0 and iy1 > iy0:
+                        overlap_area += (ix1 - ix0) * (iy1 - iy0)
+                        if overlap_area / region_area_pts >= overlap_threshold:
+                            break
+
+                if overlap_area / region_area_pts >= overlap_threshold:
+                    continue  # skip, mostly text
+
+                # Map to PDF coords and create region after passing overlap test
+                pdf_x0, pdf_top, pdf_x1, pdf_bottom = region_bbox_pdf
+
+                from natural_pdf.elements.region import Region
+                region = Region(page_obj, (pdf_x0, pdf_top, pdf_x1, pdf_bottom))
+                region.region_type = "blob"
+                region.normalized_type = "blob"
+                region.source = source_label
+                # Produce compact web colour using the 'colour' library if available
+                try:
+                    from colour import Color  # type: ignore
+
+                    hex_str = str(Color(rgb=tuple(avg_rgb)))  # gives named/shortest rep
+                except Exception:
+                    hex_str = "#{:02x}{:02x}{:02x}".format(
+                        int(avg_rgb[0] * 255), int(avg_rgb[1] * 255), int(avg_rgb[2] * 255)
+                    )
+                region.rgb = tuple(map(float, avg_rgb))  # numeric backup
+                region.color = hex_str
+                region.fill = hex_str
+
+                # Store readable colour for inspection tables
+                region.metadata["color_hex"] = hex_str
+
+                page_obj._element_mgr.add_region(region)
+
+        return self
+
 
 # Example usage would be:
 # page.detect_lines(source_label="my_table_lines")
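For orientation, a minimal usage sketch of the new method. It is hypothetical: the file name and parameter values are placeholders, and the `PDF`/`pages`/`find_all` entry points are assumed from the library's documented surface; only `detect_blobs`, its parameters, and the `region[type=blob]` selector come from the diff above.

```python
# Hypothetical usage sketch for detect_blobs; path and values are placeholders.
from natural_pdf import PDF

pdf = PDF("document.pdf")   # placeholder path
page = pdf.pages[0]

# k=None -> elbow-method auto-selection of cluster count; a Delta-E tolerance
# of 40 merges perceptually similar colour clusters before blobs are built.
page.detect_blobs(tolerance=40.0, min_area_pts=400.0, source_label="blobs-v1")

# Each surviving blob is registered as a Region with region_type == "blob".
for blob in page.find_all("region[type=blob]"):
    print(blob.bbox, blob.color)   # PDF-space bbox and mean colour as hex
```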
natural_pdf/classification/manager.py

@@ -448,3 +448,70 @@ class ClassificationManager:
             raise ClassificationError(
                 f"Batch classification failed using model '{model_id}'. Error: {e}"
             ) from e
+
+    def cleanup_models(self, model_id: Optional[str] = None) -> int:
+        """
+        Cleanup classification models to free memory.
+
+        Args:
+            model_id: Specific model to cleanup, or None to cleanup all models
+
+        Returns:
+            Number of models cleaned up
+        """
+        global _PIPELINE_CACHE, _TOKENIZER_CACHE, _MODEL_CACHE
+
+        cleaned_count = 0
+
+        if model_id:
+            # Cleanup specific model - search cache keys that contain the model_id
+            keys_to_remove = [key for key in _PIPELINE_CACHE.keys() if model_id in key]
+            for key in keys_to_remove:
+                pipeline = _PIPELINE_CACHE.pop(key, None)
+                if pipeline and hasattr(pipeline, 'model'):
+                    # Try to cleanup GPU memory if using torch
+                    try:
+                        torch = _get_torch()
+                        if hasattr(pipeline.model, 'to'):
+                            pipeline.model.to('cpu')  # Move to CPU
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()  # Clear GPU cache
+                    except Exception as e:
+                        logger.debug(f"GPU cleanup failed for model {model_id}: {e}")
+
+                cleaned_count += 1
+                logger.info(f"Cleaned up classification pipeline: {key}")
+
+            # Also cleanup tokenizer and model caches for this model
+            tokenizer_keys = [key for key in _TOKENIZER_CACHE.keys() if model_id in key]
+            for key in tokenizer_keys:
+                _TOKENIZER_CACHE.pop(key, None)
+
+            model_keys = [key for key in _MODEL_CACHE.keys() if model_id in key]
+            for key in model_keys:
+                _MODEL_CACHE.pop(key, None)
+
+        else:
+            # Cleanup all models
+            for key, pipeline in list(_PIPELINE_CACHE.items()):
+                if hasattr(pipeline, 'model'):
+                    try:
+                        torch = _get_torch()
+                        if hasattr(pipeline.model, 'to'):
+                            pipeline.model.to('cpu')  # Move to CPU
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()  # Clear GPU cache
+                    except Exception as e:
+                        logger.debug(f"GPU cleanup failed for pipeline {key}: {e}")
+
+            # Clear all caches
+            pipeline_count = len(_PIPELINE_CACHE)
+            _PIPELINE_CACHE.clear()
+            _TOKENIZER_CACHE.clear()
+            _MODEL_CACHE.clear()
+
+            if pipeline_count > 0:
+                logger.info(f"Cleaned up {pipeline_count} classification models")
+                cleaned_count = pipeline_count
+
+        return cleaned_count