natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +103 -9
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- natural_pdf/utils/spatial.py +42 -39
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +42 -18
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -7,11 +7,12 @@ selectors with extensions for PDF-specific attributes and spatial relationships.
|
|
7
7
|
The parser handles:
|
8
8
|
- Basic element selectors (text, rect, line, image)
|
9
9
|
- Attribute selectors with comparisons ([size>12], [color="red"])
|
10
|
-
- Pseudo-selectors for text content (:contains(), :regex())
|
10
|
+
- Pseudo-selectors for text content (:contains(), :regex(), :closest())
|
11
11
|
- Spatial relationship selectors (:above(), :below(), :near())
|
12
12
|
- Color matching with Delta E distance calculations
|
13
13
|
- Logical operators (AND, OR) and grouping
|
14
14
|
- Complex nested expressions with proper precedence
|
15
|
+
- Fuzzy text matching for OCR errors (:closest())
|
15
16
|
|
16
17
|
Key features:
|
17
18
|
- Safe value parsing without eval() for security
|
@@ -25,9 +26,12 @@ This enables powerful document navigation like:
|
|
25
26
|
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
27
|
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
27
28
|
- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
|
29
|
+
- page.find('text:closest("Date(s) of Review")') # Fuzzy match for OCR errors
|
30
|
+
- page.find('text:closest("Invoice Date@0.9")') # 90% similarity threshold
|
28
31
|
"""
|
29
32
|
|
30
33
|
import ast
|
34
|
+
import difflib
|
31
35
|
import logging
|
32
36
|
import re
|
33
37
|
from collections import Counter
|
@@ -691,6 +695,9 @@ def _build_filter_list(
|
|
691
695
|
return getattr(element, "region_type", "").lower().replace(" ", "_")
|
692
696
|
elif name == "model":
|
693
697
|
return getattr(element, "model", None)
|
698
|
+
elif name == "checked":
|
699
|
+
# Map 'checked' attribute to is_checked for checkboxes
|
700
|
+
return getattr(element, "is_checked", None)
|
694
701
|
else:
|
695
702
|
return getattr(element, python_name, None)
|
696
703
|
else:
|
@@ -724,6 +731,29 @@ def _build_filter_list(
|
|
724
731
|
]:
|
725
732
|
op_desc = f"= {value!r} (exact color)"
|
726
733
|
compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
|
734
|
+
# For boolean attributes, handle string/bool comparison
|
735
|
+
elif name in ["checked", "is_checked", "bold", "italic"]:
|
736
|
+
|
737
|
+
def bool_compare(el_val, sel_val):
|
738
|
+
# Convert both to boolean for comparison
|
739
|
+
if isinstance(el_val, bool):
|
740
|
+
el_bool = el_val
|
741
|
+
else:
|
742
|
+
el_bool = str(el_val).lower() in ("true", "1", "yes")
|
743
|
+
|
744
|
+
if isinstance(sel_val, bool):
|
745
|
+
sel_bool = sel_val
|
746
|
+
else:
|
747
|
+
sel_bool = str(sel_val).lower() in ("true", "1", "yes")
|
748
|
+
|
749
|
+
# Debug logging
|
750
|
+
logger.debug(
|
751
|
+
f"Boolean comparison: el_val={el_val} ({type(el_val)}) -> {el_bool}, sel_val={sel_val} ({type(sel_val)}) -> {sel_bool}"
|
752
|
+
)
|
753
|
+
|
754
|
+
return el_bool == sel_bool
|
755
|
+
|
756
|
+
compare_func = bool_compare
|
727
757
|
else:
|
728
758
|
compare_func = lambda el_val, sel_val: el_val == sel_val
|
729
759
|
elif op == "!=":
|
@@ -894,6 +924,13 @@ def _build_filter_list(
|
|
894
924
|
|
895
925
|
filter_lambda = regex_check
|
896
926
|
|
927
|
+
# --- Handle :closest pseudo-class for fuzzy text matching --- #
|
928
|
+
elif name == "closest" and args is not None:
|
929
|
+
# Note: :closest is handled specially in the page._apply_selector method
|
930
|
+
# It doesn't filter elements here, but marks them for special processing
|
931
|
+
# This allows us to first check :contains matches, then sort by similarity
|
932
|
+
filter_lambda = lambda el: True # Accept all elements for now
|
933
|
+
|
897
934
|
# --- Handle :startswith and :starts-with (alias) --- #
|
898
935
|
elif name in ("starts-with", "startswith") and args is not None:
|
899
936
|
filter_name = f"pseudo-class :{name}({args!r})"
|
@@ -936,6 +973,10 @@ def _build_filter_list(
|
|
936
973
|
filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
|
937
974
|
elif name == "vertical":
|
938
975
|
filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
|
976
|
+
elif name == "checked":
|
977
|
+
filter_lambda = lambda el: hasattr(el, "is_checked") and el.is_checked
|
978
|
+
elif name == "unchecked":
|
979
|
+
filter_lambda = lambda el: hasattr(el, "is_checked") and not el.is_checked
|
939
980
|
|
940
981
|
# --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
|
941
982
|
elif name in ("strike", "strikethrough", "strikeout"):
|
natural_pdf/utils/spatial.py
CHANGED
@@ -10,11 +10,11 @@ with include_boundaries='none'.
|
|
10
10
|
|
11
11
|
Example:
|
12
12
|
from natural_pdf.utils.spatial import is_element_in_region
|
13
|
-
|
13
|
+
|
14
14
|
# Check if element is in region using center-based logic (default)
|
15
15
|
if is_element_in_region(element, region):
|
16
16
|
print("Element is in region")
|
17
|
-
|
17
|
+
|
18
18
|
# Use different strategies
|
19
19
|
if is_element_in_region(element, region, strategy="intersects"):
|
20
20
|
print("Element overlaps with region")
|
@@ -35,16 +35,16 @@ InclusionStrategy = Literal["center", "intersects", "contains"]
|
|
35
35
|
|
36
36
|
def is_element_in_region(
|
37
37
|
element: "Element",
|
38
|
-
region: "Region",
|
38
|
+
region: "Region",
|
39
39
|
strategy: InclusionStrategy = "center",
|
40
|
-
check_page: bool = True
|
40
|
+
check_page: bool = True,
|
41
41
|
) -> bool:
|
42
42
|
"""
|
43
43
|
Unified function to check if an element is inside a region.
|
44
|
-
|
44
|
+
|
45
45
|
This centralizes the logic used across Region, Page, and Flow to ensure
|
46
46
|
consistent behavior throughout the library.
|
47
|
-
|
47
|
+
|
48
48
|
Args:
|
49
49
|
element: The element to check
|
50
50
|
region: The region to check against
|
@@ -53,7 +53,7 @@ def is_element_in_region(
|
|
53
53
|
- "intersects": Element belongs if any part overlaps
|
54
54
|
- "contains": Element belongs only if fully contained
|
55
55
|
check_page: Whether to verify element and region are on the same page
|
56
|
-
|
56
|
+
|
57
57
|
Returns:
|
58
58
|
bool: True if element is in region according to the strategy
|
59
59
|
"""
|
@@ -61,18 +61,18 @@ def is_element_in_region(
|
|
61
61
|
if not hasattr(element, "bbox") or not element.bbox:
|
62
62
|
logger.debug(f"Element lacks bbox attributes: {element}")
|
63
63
|
return False
|
64
|
-
|
64
|
+
|
65
65
|
if not hasattr(region, "bbox") or not region.bbox:
|
66
66
|
logger.debug(f"Region lacks bbox attributes: {region}")
|
67
67
|
return False
|
68
|
-
|
68
|
+
|
69
69
|
# Check page membership if requested
|
70
70
|
if check_page:
|
71
71
|
if not hasattr(element, "page") or not hasattr(region, "page"):
|
72
72
|
return False
|
73
73
|
if element.page != region.page:
|
74
74
|
return False
|
75
|
-
|
75
|
+
|
76
76
|
# Apply the appropriate strategy
|
77
77
|
if strategy == "center":
|
78
78
|
# Use existing region method if available
|
@@ -82,37 +82,43 @@ def is_element_in_region(
|
|
82
82
|
# Fallback calculation
|
83
83
|
elem_center_x = (element.x0 + element.x1) / 2
|
84
84
|
elem_center_y = (element.top + element.bottom) / 2
|
85
|
-
|
85
|
+
|
86
86
|
# Use region's is_point_inside if available
|
87
87
|
if hasattr(region, "is_point_inside"):
|
88
88
|
return region.is_point_inside(elem_center_x, elem_center_y)
|
89
89
|
else:
|
90
90
|
# Simple bounds check
|
91
|
-
return (
|
92
|
-
|
93
|
-
|
91
|
+
return (
|
92
|
+
region.x0 <= elem_center_x <= region.x1
|
93
|
+
and region.top <= elem_center_y <= region.bottom
|
94
|
+
)
|
95
|
+
|
94
96
|
elif strategy == "intersects":
|
95
97
|
# Use existing region method if available
|
96
98
|
if hasattr(region, "intersects"):
|
97
99
|
return region.intersects(element)
|
98
100
|
else:
|
99
101
|
# Simple bbox overlap check
|
100
|
-
return not (
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
102
|
+
return not (
|
103
|
+
element.x1 < region.x0
|
104
|
+
or element.x0 > region.x1
|
105
|
+
or element.bottom < region.top
|
106
|
+
or element.top > region.bottom
|
107
|
+
)
|
108
|
+
|
105
109
|
elif strategy == "contains":
|
106
110
|
# Use existing region method if available
|
107
111
|
if hasattr(region, "contains"):
|
108
112
|
return region.contains(element)
|
109
113
|
else:
|
110
114
|
# Simple full containment check
|
111
|
-
return (
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
115
|
+
return (
|
116
|
+
region.x0 <= element.x0
|
117
|
+
and element.x1 <= region.x1
|
118
|
+
and region.top <= element.top
|
119
|
+
and element.bottom <= region.bottom
|
120
|
+
)
|
121
|
+
|
116
122
|
else:
|
117
123
|
raise ValueError(f"Unknown inclusion strategy: {strategy}")
|
118
124
|
|
@@ -120,10 +126,10 @@ def is_element_in_region(
|
|
120
126
|
def get_inclusion_strategy() -> InclusionStrategy:
|
121
127
|
"""
|
122
128
|
Get the current global inclusion strategy.
|
123
|
-
|
129
|
+
|
124
130
|
This could be made configurable via environment variable or settings.
|
125
131
|
For now, returns the default strategy.
|
126
|
-
|
132
|
+
|
127
133
|
Returns:
|
128
134
|
The current inclusion strategy (default: "center")
|
129
135
|
"""
|
@@ -132,38 +138,35 @@ def get_inclusion_strategy() -> InclusionStrategy:
|
|
132
138
|
return "center"
|
133
139
|
|
134
140
|
|
135
|
-
def calculate_element_overlap_percentage(
|
136
|
-
element: "Element",
|
137
|
-
region: "Region"
|
138
|
-
) -> float:
|
141
|
+
def calculate_element_overlap_percentage(element: "Element", region: "Region") -> float:
|
139
142
|
"""
|
140
143
|
Calculate what percentage of an element overlaps with a region.
|
141
|
-
|
144
|
+
|
142
145
|
Args:
|
143
146
|
element: The element to check
|
144
147
|
region: The region to check against
|
145
|
-
|
148
|
+
|
146
149
|
Returns:
|
147
150
|
float: Percentage of element area that overlaps with region (0.0 to 1.0)
|
148
151
|
"""
|
149
152
|
if not hasattr(element, "bbox") or not hasattr(region, "bbox"):
|
150
153
|
return 0.0
|
151
|
-
|
154
|
+
|
152
155
|
# Calculate intersection bounds
|
153
156
|
intersect_x0 = max(element.x0, region.x0)
|
154
|
-
intersect_y0 = max(element.top, region.top)
|
157
|
+
intersect_y0 = max(element.top, region.top)
|
155
158
|
intersect_x1 = min(element.x1, region.x1)
|
156
159
|
intersect_y1 = min(element.bottom, region.bottom)
|
157
|
-
|
160
|
+
|
158
161
|
# Check if there's an intersection
|
159
162
|
if intersect_x1 <= intersect_x0 or intersect_y1 <= intersect_y0:
|
160
163
|
return 0.0
|
161
|
-
|
164
|
+
|
162
165
|
# Calculate areas
|
163
166
|
element_area = (element.x1 - element.x0) * (element.bottom - element.top)
|
164
167
|
if element_area == 0:
|
165
168
|
return 0.0
|
166
|
-
|
169
|
+
|
167
170
|
intersect_area = (intersect_x1 - intersect_x0) * (intersect_y1 - intersect_y0)
|
168
|
-
|
169
|
-
return intersect_area / element_area
|
171
|
+
|
172
|
+
return intersect_area / element_area
|
@@ -1,5 +1,6 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=JPuQBMN0mZPnPB4z-RAHm8jPSVLKbgw4gxfSXyEgdX4,4957
|
2
2
|
natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
|
3
|
+
natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
|
3
4
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
5
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
6
|
natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
|
@@ -7,6 +8,13 @@ natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLS
|
|
7
8
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
9
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
9
10
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
11
|
+
natural_pdf/analyzers/checkbox/__init__.py,sha256=2ZWAIUoRqgGlwVeEU0JNMkQ-mO4nxWNFQ6fLOx0jfRQ,243
|
12
|
+
natural_pdf/analyzers/checkbox/base.py,sha256=Hu2WrlaG2gNbTFa8fYSzjyUFmOZvbdTdonnMd9lwl44,9610
|
13
|
+
natural_pdf/analyzers/checkbox/checkbox_analyzer.py,sha256=rDO7YIT_fAd5BmpXMOUnZaSHUmFuXqVXZK-HNyS3Ezw,13647
|
14
|
+
natural_pdf/analyzers/checkbox/checkbox_manager.py,sha256=ZR8yfhWiykxBe6h4smsDuY-So47j0tcGEXhF0FEIorE,5959
|
15
|
+
natural_pdf/analyzers/checkbox/checkbox_options.py,sha256=-2V3_yduBhD4iVjn-EhgK7D6qA2xH9NJorfgDcar6PU,2094
|
16
|
+
natural_pdf/analyzers/checkbox/mixin.py,sha256=KYnr_Xx4U2bp6c35GG2hk6yX_z4NgX7ZW9zT1xmEKEw,3710
|
17
|
+
natural_pdf/analyzers/checkbox/rtdetr.py,sha256=Oxz4XVJKDuVWzBQDqM_hqslCH66n1HJg4_hdXS4aAs4,6944
|
10
18
|
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
11
19
|
natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
|
12
20
|
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
@@ -23,28 +31,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
|
|
23
31
|
natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
|
24
32
|
natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
|
25
33
|
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
26
|
-
natural_pdf/collections/mixins.py,sha256=
|
34
|
+
natural_pdf/collections/mixins.py,sha256=ZsS61WFu6Ipree4O_zFECKWoKHC3pYVwZU7tUP6OTOQ,6145
|
27
35
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
-
natural_pdf/core/element_manager.py,sha256=
|
36
|
+
natural_pdf/core/element_manager.py,sha256=7fy65zzD42LvDJKj8X1pbJAQYL5lk9wGdTtgE0rsPpA,56057
|
29
37
|
natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
|
30
|
-
natural_pdf/core/page.py,sha256
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
38
|
+
natural_pdf/core/page.py,sha256=NiJxBHLx4Otwr7iMza1gsEAfSqTMvTu_6zex4aocZOw,162710
|
39
|
+
natural_pdf/core/page_collection.py,sha256=OjIS9iEtFrHw0liJHGI-CFwZbHHA4Lt7vK69wN76Igg,68255
|
32
40
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
41
|
+
natural_pdf/core/pdf.py,sha256=Cc4A6b49apGfxk7DFcN4oCfoiYmpnH2-jFf_Gb6B5mg,106345
|
34
42
|
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
43
|
natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
|
36
44
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
|
-
natural_pdf/describe/base.py,sha256=
|
45
|
+
natural_pdf/describe/base.py,sha256=pU_fDkWG_hQlne2nNIdOC1xXyTrPc-kmTwd685nZiSk,21024
|
38
46
|
natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
|
39
47
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
48
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
49
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
50
|
+
natural_pdf/elements/base.py,sha256=NunXdrZW53iG-Q4Pe9DHmWpzigHg-JrkjOLZ016I_b0,82679
|
51
|
+
natural_pdf/elements/element_collection.py,sha256=z3gRONShw6MrdTJYXVjBi9uNr3dNQtRXgyYKm-VPB7A,141371
|
44
52
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
53
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
54
|
natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
55
|
+
natural_pdf/elements/region.py,sha256=ql_pZvfjbT0j2zekqMrGBWDzNVo4erNiQ9aK67J7KTw,173382
|
48
56
|
natural_pdf/elements/text.py,sha256=Jo4gnrsJe1PStdoWF2Bt8RSeSmOcfA9DxvMJl7EoAmI,21344
|
49
57
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
58
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -85,7 +93,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
85
93
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
86
94
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
87
95
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
88
|
-
natural_pdf/selectors/parser.py,sha256=
|
96
|
+
natural_pdf/selectors/parser.py,sha256=wXlTL2t05xj47sMoG-vhjQFyEVou8NZie7wKKm60iMA,49063
|
89
97
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
90
98
|
natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
|
91
99
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -101,7 +109,7 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
|
|
101
109
|
natural_pdf/utils/pdfminer_patches.py,sha256=Ob81OMoNUGMUIy9nMw3deSQ_Z6cQmhbRlHUC3EHw2jk,4201
|
102
110
|
natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
|
103
111
|
natural_pdf/utils/sections.py,sha256=HZX7829-fquKgIF7vUN2tL10-aXckEaM25g_2VcgWU4,12941
|
104
|
-
natural_pdf/utils/spatial.py,sha256=
|
112
|
+
natural_pdf/utils/spatial.py,sha256=YjzGO4A013ZDYGYbs7hl4RbJOaKrg8-x__Dl_BamwUA,5908
|
105
113
|
natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
|
106
114
|
natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
|
107
115
|
natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
|
@@ -111,15 +119,30 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
|
|
111
119
|
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
112
120
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
113
121
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
114
|
-
natural_pdf-0.2.
|
122
|
+
natural_pdf-0.2.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
115
123
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
116
124
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
117
125
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
118
126
|
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
119
127
|
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
128
|
+
temp/check_model.py,sha256=rhnqTRUaq2VyyqXHuLBxM7ZEoJwf0ExlSJnvkMDYPRU,1710
|
129
|
+
temp/check_pdf_content.py,sha256=adFIVMI6m36l0R3112ESt9oqX_zM-mhDvTusBcjqBy8,233
|
130
|
+
temp/checkbox_checks.py,sha256=XsR6bmaVNiSH-HsDzthtJcz4vcKOYJ5IbAi6vtfo7P0,20293
|
131
|
+
temp/checkbox_simple.py,sha256=d1NiE1IbGSG2nMtvFPgBgxF6OSZLm7TIC2nkrDSG8fE,3975
|
132
|
+
temp/checkbox_ux_ideas.py,sha256=Pa1NXi-wmtEGAPb1RW9fiQ4mcKf1G88OMm7zIABqGoI,15302
|
133
|
+
temp/context_manager_prototype.py,sha256=uMRO7xrWsbxBUCUaY7xGtEFcIj-QT9j2DQ2JMkinW2M,6150
|
134
|
+
temp/convert_to_hf.py,sha256=DMqZAWvOA_StujfSmkD-hJCnCy4dlvyjIOl2_1l_mOg,1881
|
135
|
+
temp/demo_text_closest.py,sha256=qRnAynLhF-P_q9t_WFaxE_5QLbZMiMp4v9llFipfqZA,2721
|
120
136
|
temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
|
137
|
+
temp/inspect_model.py,sha256=AaRDqhRH9kqcUcfmrSUsNw1xWkxargNA3BWIvzxwHGM,1692
|
138
|
+
temp/rtdetr_dinov2_test.py,sha256=9FUL3hiHweYJIbEeH0AZTrLJSnWatxwymNG9CZEXrGA,1553
|
139
|
+
temp/test_closest_debug.py,sha256=QP53iAEwy2KRSZlwH2eQ07JILxRgfYwBrvro9i2ITXQ,809
|
140
|
+
temp/test_closest_debug2.py,sha256=Hbh0nkG7xS0NfayH2Qg_IzLkeKh6mH-OWo0o2i9777I,740
|
141
|
+
temp/test_context_exploration.py,sha256=DlFXDuKavvUskLjHMwqPVGGrPpYT8zBHErX1uzHnWxw,2611
|
121
142
|
temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
|
122
143
|
temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
|
144
|
+
temp/test_durham.py,sha256=A0J78TiVXCLHP4xy67G6GlOtrE2sgWP7FsLMH6fjBaA,916
|
145
|
+
temp/test_empty_string.py,sha256=FovOW7hDwkShVT7nYVH_UMv3IwQjX0pHhxC9WHAfo2U,470
|
123
146
|
temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
|
124
147
|
temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
|
125
148
|
temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
|
@@ -133,6 +156,7 @@ temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,141
|
|
133
156
|
temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
|
134
157
|
temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
|
135
158
|
temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
|
159
|
+
temp/test_similarity.py,sha256=2Nv8QbSwjaBwMwJsvpZgwOiMIRxPMux5QeZE_rgQ63A,441
|
136
160
|
temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
|
137
161
|
temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
|
138
162
|
temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
|
@@ -148,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
148
172
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
149
173
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
150
174
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
151
|
-
natural_pdf-0.2.
|
152
|
-
natural_pdf-0.2.
|
153
|
-
natural_pdf-0.2.
|
154
|
-
natural_pdf-0.2.
|
155
|
-
natural_pdf-0.2.
|
175
|
+
natural_pdf-0.2.19.dist-info/METADATA,sha256=vtMsWwMW9cR2LdQhdDFhDG4WWIkctrT7_3P7klvyJ-8,6960
|
176
|
+
natural_pdf-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
177
|
+
natural_pdf-0.2.19.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
178
|
+
natural_pdf-0.2.19.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
179
|
+
natural_pdf-0.2.19.dist-info/RECORD,,
|
temp/check_model.py
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
from ultralytics import RTDETR
|
2
|
+
import os
|
3
|
+
|
4
|
+
model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"
|
5
|
+
print(f"Model exists: {os.path.exists(model_path)}")
|
6
|
+
|
7
|
+
try:
|
8
|
+
model = RTDETR(model_path)
|
9
|
+
print(f"Model loaded successfully")
|
10
|
+
print(f"Model names: {model.names}")
|
11
|
+
print(f"Model task: {model.task}")
|
12
|
+
|
13
|
+
# Try to get architecture info
|
14
|
+
if hasattr(model.model, 'yaml'):
|
15
|
+
print(f"Model yaml: {model.model.yaml}")
|
16
|
+
|
17
|
+
# Check the model structure
|
18
|
+
if hasattr(model.model, 'model'):
|
19
|
+
for i, module in enumerate(model.model.model):
|
20
|
+
print(f"Layer {i}: {module}")
|
21
|
+
if i > 5: # Just show first few layers
|
22
|
+
break
|
23
|
+
|
24
|
+
except Exception as e:
|
25
|
+
print(f"Error: {e}")
|
26
|
+
|
27
|
+
# Try loading as generic model to inspect
|
28
|
+
import torch
|
29
|
+
try:
|
30
|
+
checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
|
31
|
+
print(f"\nCheckpoint keys: {list(checkpoint.keys())}")
|
32
|
+
|
33
|
+
# Check for model configuration
|
34
|
+
if 'model' in checkpoint and hasattr(checkpoint['model'], 'yaml'):
|
35
|
+
print(f"Model yaml: {checkpoint['model'].yaml}")
|
36
|
+
|
37
|
+
# Check train args for model info
|
38
|
+
if 'train_args' in checkpoint:
|
39
|
+
args = checkpoint['train_args']
|
40
|
+
print(f"\nTraining args:")
|
41
|
+
print(f" Model: {getattr(args, 'model', 'Unknown')}")
|
42
|
+
print(f" Task: {getattr(args, 'task', 'Unknown')}")
|
43
|
+
|
44
|
+
# Check epoch info
|
45
|
+
if 'epoch' in checkpoint:
|
46
|
+
print(f" Epochs trained: {checkpoint['epoch']}")
|
47
|
+
|
48
|
+
except Exception as e2:
|
49
|
+
print(f"Error loading checkpoint: {e2}")
|