natural-pdf 0.2.17__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +103 -9
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. natural_pdf/utils/spatial.py +42 -39
  21. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  22. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +42 -18
  23. temp/check_model.py +49 -0
  24. temp/check_pdf_content.py +9 -0
  25. temp/checkbox_checks.py +590 -0
  26. temp/checkbox_simple.py +117 -0
  27. temp/checkbox_ux_ideas.py +400 -0
  28. temp/context_manager_prototype.py +177 -0
  29. temp/convert_to_hf.py +60 -0
  30. temp/demo_text_closest.py +66 -0
  31. temp/inspect_model.py +43 -0
  32. temp/rtdetr_dinov2_test.py +49 -0
  33. temp/test_closest_debug.py +26 -0
  34. temp/test_closest_debug2.py +22 -0
  35. temp/test_context_exploration.py +85 -0
  36. temp/test_durham.py +30 -0
  37. temp/test_empty_string.py +16 -0
  38. temp/test_similarity.py +15 -0
  39. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  40. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  41. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  42. {natural_pdf-0.2.17.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
@@ -7,11 +7,12 @@ selectors with extensions for PDF-specific attributes and spatial relationships.
7
7
  The parser handles:
8
8
  - Basic element selectors (text, rect, line, image)
9
9
  - Attribute selectors with comparisons ([size>12], [color="red"])
10
- - Pseudo-selectors for text content (:contains(), :regex())
10
+ - Pseudo-selectors for text content (:contains(), :regex(), :closest())
11
11
  - Spatial relationship selectors (:above(), :below(), :near())
12
12
  - Color matching with Delta E distance calculations
13
13
  - Logical operators (AND, OR) and grouping
14
14
  - Complex nested expressions with proper precedence
15
+ - Fuzzy text matching for OCR errors (:closest())
15
16
 
16
17
  Key features:
17
18
  - Safe value parsing without eval() for security
@@ -25,9 +26,12 @@ This enables powerful document navigation like:
25
26
  - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
26
27
  - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
27
28
  - page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
29
+ - page.find('text:closest("Date(s) of Review")') # Fuzzy match for OCR errors
30
+ - page.find('text:closest("Invoice Date@0.9")') # 90% similarity threshold
28
31
  """
29
32
 
30
33
  import ast
34
+ import difflib
31
35
  import logging
32
36
  import re
33
37
  from collections import Counter
@@ -691,6 +695,9 @@ def _build_filter_list(
691
695
  return getattr(element, "region_type", "").lower().replace(" ", "_")
692
696
  elif name == "model":
693
697
  return getattr(element, "model", None)
698
+ elif name == "checked":
699
+ # Map 'checked' attribute to is_checked for checkboxes
700
+ return getattr(element, "is_checked", None)
694
701
  else:
695
702
  return getattr(element, python_name, None)
696
703
  else:
@@ -724,6 +731,29 @@ def _build_filter_list(
724
731
  ]:
725
732
  op_desc = f"= {value!r} (exact color)"
726
733
  compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
734
+ # For boolean attributes, handle string/bool comparison
735
+ elif name in ["checked", "is_checked", "bold", "italic"]:
736
+
737
+ def bool_compare(el_val, sel_val):
738
+ # Convert both to boolean for comparison
739
+ if isinstance(el_val, bool):
740
+ el_bool = el_val
741
+ else:
742
+ el_bool = str(el_val).lower() in ("true", "1", "yes")
743
+
744
+ if isinstance(sel_val, bool):
745
+ sel_bool = sel_val
746
+ else:
747
+ sel_bool = str(sel_val).lower() in ("true", "1", "yes")
748
+
749
+ # Debug logging
750
+ logger.debug(
751
+ f"Boolean comparison: el_val={el_val} ({type(el_val)}) -> {el_bool}, sel_val={sel_val} ({type(sel_val)}) -> {sel_bool}"
752
+ )
753
+
754
+ return el_bool == sel_bool
755
+
756
+ compare_func = bool_compare
727
757
  else:
728
758
  compare_func = lambda el_val, sel_val: el_val == sel_val
729
759
  elif op == "!=":
@@ -894,6 +924,13 @@ def _build_filter_list(
894
924
 
895
925
  filter_lambda = regex_check
896
926
 
927
+ # --- Handle :closest pseudo-class for fuzzy text matching --- #
928
+ elif name == "closest" and args is not None:
929
+ # Note: :closest is handled specially in the page._apply_selector method
930
+ # It doesn't filter elements here, but marks them for special processing
931
+ # This allows us to first check :contains matches, then sort by similarity
932
+ filter_lambda = lambda el: True # Accept all elements for now
933
+
897
934
  # --- Handle :startswith and :starts-with (alias) --- #
898
935
  elif name in ("starts-with", "startswith") and args is not None:
899
936
  filter_name = f"pseudo-class :{name}({args!r})"
@@ -936,6 +973,10 @@ def _build_filter_list(
936
973
  filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
937
974
  elif name == "vertical":
938
975
  filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
976
+ elif name == "checked":
977
+ filter_lambda = lambda el: hasattr(el, "is_checked") and el.is_checked
978
+ elif name == "unchecked":
979
+ filter_lambda = lambda el: hasattr(el, "is_checked") and not el.is_checked
939
980
 
940
981
  # --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
941
982
  elif name in ("strike", "strikethrough", "strikeout"):
@@ -10,11 +10,11 @@ with include_boundaries='none'.
10
10
 
11
11
  Example:
12
12
  from natural_pdf.utils.spatial import is_element_in_region
13
-
13
+
14
14
  # Check if element is in region using center-based logic (default)
15
15
  if is_element_in_region(element, region):
16
16
  print("Element is in region")
17
-
17
+
18
18
  # Use different strategies
19
19
  if is_element_in_region(element, region, strategy="intersects"):
20
20
  print("Element overlaps with region")
@@ -35,16 +35,16 @@ InclusionStrategy = Literal["center", "intersects", "contains"]
35
35
 
36
36
  def is_element_in_region(
37
37
  element: "Element",
38
- region: "Region",
38
+ region: "Region",
39
39
  strategy: InclusionStrategy = "center",
40
- check_page: bool = True
40
+ check_page: bool = True,
41
41
  ) -> bool:
42
42
  """
43
43
  Unified function to check if an element is inside a region.
44
-
44
+
45
45
  This centralizes the logic used across Region, Page, and Flow to ensure
46
46
  consistent behavior throughout the library.
47
-
47
+
48
48
  Args:
49
49
  element: The element to check
50
50
  region: The region to check against
@@ -53,7 +53,7 @@ def is_element_in_region(
53
53
  - "intersects": Element belongs if any part overlaps
54
54
  - "contains": Element belongs only if fully contained
55
55
  check_page: Whether to verify element and region are on the same page
56
-
56
+
57
57
  Returns:
58
58
  bool: True if element is in region according to the strategy
59
59
  """
@@ -61,18 +61,18 @@ def is_element_in_region(
61
61
  if not hasattr(element, "bbox") or not element.bbox:
62
62
  logger.debug(f"Element lacks bbox attributes: {element}")
63
63
  return False
64
-
64
+
65
65
  if not hasattr(region, "bbox") or not region.bbox:
66
66
  logger.debug(f"Region lacks bbox attributes: {region}")
67
67
  return False
68
-
68
+
69
69
  # Check page membership if requested
70
70
  if check_page:
71
71
  if not hasattr(element, "page") or not hasattr(region, "page"):
72
72
  return False
73
73
  if element.page != region.page:
74
74
  return False
75
-
75
+
76
76
  # Apply the appropriate strategy
77
77
  if strategy == "center":
78
78
  # Use existing region method if available
@@ -82,37 +82,43 @@ def is_element_in_region(
82
82
  # Fallback calculation
83
83
  elem_center_x = (element.x0 + element.x1) / 2
84
84
  elem_center_y = (element.top + element.bottom) / 2
85
-
85
+
86
86
  # Use region's is_point_inside if available
87
87
  if hasattr(region, "is_point_inside"):
88
88
  return region.is_point_inside(elem_center_x, elem_center_y)
89
89
  else:
90
90
  # Simple bounds check
91
- return (region.x0 <= elem_center_x <= region.x1 and
92
- region.top <= elem_center_y <= region.bottom)
93
-
91
+ return (
92
+ region.x0 <= elem_center_x <= region.x1
93
+ and region.top <= elem_center_y <= region.bottom
94
+ )
95
+
94
96
  elif strategy == "intersects":
95
97
  # Use existing region method if available
96
98
  if hasattr(region, "intersects"):
97
99
  return region.intersects(element)
98
100
  else:
99
101
  # Simple bbox overlap check
100
- return not (element.x1 < region.x0 or
101
- element.x0 > region.x1 or
102
- element.bottom < region.top or
103
- element.top > region.bottom)
104
-
102
+ return not (
103
+ element.x1 < region.x0
104
+ or element.x0 > region.x1
105
+ or element.bottom < region.top
106
+ or element.top > region.bottom
107
+ )
108
+
105
109
  elif strategy == "contains":
106
110
  # Use existing region method if available
107
111
  if hasattr(region, "contains"):
108
112
  return region.contains(element)
109
113
  else:
110
114
  # Simple full containment check
111
- return (region.x0 <= element.x0 and
112
- element.x1 <= region.x1 and
113
- region.top <= element.top and
114
- element.bottom <= region.bottom)
115
-
115
+ return (
116
+ region.x0 <= element.x0
117
+ and element.x1 <= region.x1
118
+ and region.top <= element.top
119
+ and element.bottom <= region.bottom
120
+ )
121
+
116
122
  else:
117
123
  raise ValueError(f"Unknown inclusion strategy: {strategy}")
118
124
 
@@ -120,10 +126,10 @@ def is_element_in_region(
120
126
  def get_inclusion_strategy() -> InclusionStrategy:
121
127
  """
122
128
  Get the current global inclusion strategy.
123
-
129
+
124
130
  This could be made configurable via environment variable or settings.
125
131
  For now, returns the default strategy.
126
-
132
+
127
133
  Returns:
128
134
  The current inclusion strategy (default: "center")
129
135
  """
@@ -132,38 +138,35 @@ def get_inclusion_strategy() -> InclusionStrategy:
132
138
  return "center"
133
139
 
134
140
 
135
- def calculate_element_overlap_percentage(
136
- element: "Element",
137
- region: "Region"
138
- ) -> float:
141
+ def calculate_element_overlap_percentage(element: "Element", region: "Region") -> float:
139
142
  """
140
143
  Calculate what percentage of an element overlaps with a region.
141
-
144
+
142
145
  Args:
143
146
  element: The element to check
144
147
  region: The region to check against
145
-
148
+
146
149
  Returns:
147
150
  float: Percentage of element area that overlaps with region (0.0 to 1.0)
148
151
  """
149
152
  if not hasattr(element, "bbox") or not hasattr(region, "bbox"):
150
153
  return 0.0
151
-
154
+
152
155
  # Calculate intersection bounds
153
156
  intersect_x0 = max(element.x0, region.x0)
154
- intersect_y0 = max(element.top, region.top)
157
+ intersect_y0 = max(element.top, region.top)
155
158
  intersect_x1 = min(element.x1, region.x1)
156
159
  intersect_y1 = min(element.bottom, region.bottom)
157
-
160
+
158
161
  # Check if there's an intersection
159
162
  if intersect_x1 <= intersect_x0 or intersect_y1 <= intersect_y0:
160
163
  return 0.0
161
-
164
+
162
165
  # Calculate areas
163
166
  element_area = (element.x1 - element.x0) * (element.bottom - element.top)
164
167
  if element_area == 0:
165
168
  return 0.0
166
-
169
+
167
170
  intersect_area = (intersect_x1 - intersect_x0) * (intersect_y1 - intersect_y0)
168
-
169
- return intersect_area / element_area
171
+
172
+ return intersect_area / element_area
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.17
3
+ Version: 0.2.19
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,6 @@
1
- natural_pdf/__init__.py,sha256=N9ubwsFpmPj7WHA6Uewgn6IbmU2r0BeUGIdIhmTl6nw,4701
1
+ natural_pdf/__init__.py,sha256=JPuQBMN0mZPnPB4z-RAHm8jPSVLKbgw4gxfSXyEgdX4,4957
2
2
  natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
3
+ natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
3
4
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
4
5
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
5
6
  natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
@@ -7,6 +8,13 @@ natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLS
7
8
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
8
9
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
9
10
  natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
11
+ natural_pdf/analyzers/checkbox/__init__.py,sha256=2ZWAIUoRqgGlwVeEU0JNMkQ-mO4nxWNFQ6fLOx0jfRQ,243
12
+ natural_pdf/analyzers/checkbox/base.py,sha256=Hu2WrlaG2gNbTFa8fYSzjyUFmOZvbdTdonnMd9lwl44,9610
13
+ natural_pdf/analyzers/checkbox/checkbox_analyzer.py,sha256=rDO7YIT_fAd5BmpXMOUnZaSHUmFuXqVXZK-HNyS3Ezw,13647
14
+ natural_pdf/analyzers/checkbox/checkbox_manager.py,sha256=ZR8yfhWiykxBe6h4smsDuY-So47j0tcGEXhF0FEIorE,5959
15
+ natural_pdf/analyzers/checkbox/checkbox_options.py,sha256=-2V3_yduBhD4iVjn-EhgK7D6qA2xH9NJorfgDcar6PU,2094
16
+ natural_pdf/analyzers/checkbox/mixin.py,sha256=KYnr_Xx4U2bp6c35GG2hk6yX_z4NgX7ZW9zT1xmEKEw,3710
17
+ natural_pdf/analyzers/checkbox/rtdetr.py,sha256=Oxz4XVJKDuVWzBQDqM_hqslCH66n1HJg4_hdXS4aAs4,6944
10
18
  natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
11
19
  natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
12
20
  natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
@@ -23,28 +31,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
23
31
  natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
24
32
  natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
25
33
  natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
26
- natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
34
+ natural_pdf/collections/mixins.py,sha256=ZsS61WFu6Ipree4O_zFECKWoKHC3pYVwZU7tUP6OTOQ,6145
27
35
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
- natural_pdf/core/element_manager.py,sha256=619R97OtMd7uhaax7fZNJmhy9GxSs9HCNP4OzGgP828,55882
36
+ natural_pdf/core/element_manager.py,sha256=7fy65zzD42LvDJKj8X1pbJAQYL5lk9wGdTtgE0rsPpA,56057
29
37
  natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
30
- natural_pdf/core/page.py,sha256=-0OaIoXz0zjT_jnPjjI2jpb8vvNKh-1W56auA5UBhTA,158791
31
- natural_pdf/core/page_collection.py,sha256=bLZ3TqTQbmP3oYrbfEi7HUoPMbcGplEtUMZ3Z1y7fuw,66728
38
+ natural_pdf/core/page.py,sha256=NiJxBHLx4Otwr7iMza1gsEAfSqTMvTu_6zex4aocZOw,162710
39
+ natural_pdf/core/page_collection.py,sha256=OjIS9iEtFrHw0liJHGI-CFwZbHHA4Lt7vK69wN76Igg,68255
32
40
  natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
33
- natural_pdf/core/pdf.py,sha256=i8dYCimL_k5FV6BmPI1a2Dk7XZfwLP8TziXr2n3O_fI,105639
41
+ natural_pdf/core/pdf.py,sha256=Cc4A6b49apGfxk7DFcN4oCfoiYmpnH2-jFf_Gb6B5mg,106345
34
42
  natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
35
43
  natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
36
44
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
37
- natural_pdf/describe/base.py,sha256=M4TGXR8ppTvznTnA1ZDgMQMkDpgu1pwGMNaOcgHf2iY,20154
45
+ natural_pdf/describe/base.py,sha256=pU_fDkWG_hQlne2nNIdOC1xXyTrPc-kmTwd685nZiSk,21024
38
46
  natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
39
47
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
40
48
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
49
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
- natural_pdf/elements/base.py,sha256=YYdoss63yv3IzQeuHbNypo7VLz2UJDFK5b6lqQe5tR8,76090
43
- natural_pdf/elements/element_collection.py,sha256=dlKoIaqmK_pC_cEcTX9LA2bNbZmc8iXcTTDfpHDlyUM,139812
50
+ natural_pdf/elements/base.py,sha256=NunXdrZW53iG-Q4Pe9DHmWpzigHg-JrkjOLZ016I_b0,82679
51
+ natural_pdf/elements/element_collection.py,sha256=z3gRONShw6MrdTJYXVjBi9uNr3dNQtRXgyYKm-VPB7A,141371
44
52
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
53
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
54
  natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
47
- natural_pdf/elements/region.py,sha256=qJ86iToSjrCUjVrEbO0M0S1nTuZDW9tpI4jF9T5xJKs,168777
55
+ natural_pdf/elements/region.py,sha256=ql_pZvfjbT0j2zekqMrGBWDzNVo4erNiQ9aK67J7KTw,173382
48
56
  natural_pdf/elements/text.py,sha256=Jo4gnrsJe1PStdoWF2Bt8RSeSmOcfA9DxvMJl7EoAmI,21344
49
57
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
58
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -85,7 +93,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
85
93
  natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
86
94
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
87
95
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
88
- natural_pdf/selectors/parser.py,sha256=HbPgmtXXA4lRSAVkCzw6vpCi3oh66e-53yUEPhYLGX8,46909
96
+ natural_pdf/selectors/parser.py,sha256=wXlTL2t05xj47sMoG-vhjQFyEVou8NZie7wKKm60iMA,49063
89
97
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
90
98
  natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
91
99
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -101,7 +109,7 @@ natural_pdf/utils/packaging.py,sha256=TM0jafwS5yVbTGC-RMi4TyWunf9cUUo9h5J6rMzkT-
101
109
  natural_pdf/utils/pdfminer_patches.py,sha256=Ob81OMoNUGMUIy9nMw3deSQ_Z6cQmhbRlHUC3EHw2jk,4201
102
110
  natural_pdf/utils/reading_order.py,sha256=u7XyVZdKMPMK0CL1C7xFogKnZ92b0JKT068KFjQWe18,7437
103
111
  natural_pdf/utils/sections.py,sha256=HZX7829-fquKgIF7vUN2tL10-aXckEaM25g_2VcgWU4,12941
104
- natural_pdf/utils/spatial.py,sha256=JOH2LHnF5WBDcjNQsHQdj458zwUgKtSWW7Tj0motn70,5968
112
+ natural_pdf/utils/spatial.py,sha256=YjzGO4A013ZDYGYbs7hl4RbJOaKrg8-x__Dl_BamwUA,5908
105
113
  natural_pdf/utils/text_extraction.py,sha256=CCwPTmMoTgtQt2P00X_ADIf6ZGNfxvjCO9FO0_HqG40,13900
106
114
  natural_pdf/utils/visualization.py,sha256=zhZEHgYnZFuX7YxTHXF8Y3D97uHp2beTKMaC-JkCFwk,22364
107
115
  natural_pdf/vision/__init__.py,sha256=TkoQtdODlh0n_99dsjLIWKE9dgK0m4jfrui_cQ3gTwU,221
@@ -111,15 +119,30 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
111
119
  natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
112
120
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
113
121
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
114
- natural_pdf-0.2.17.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
122
+ natural_pdf-0.2.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
115
123
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
116
124
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
117
125
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
118
126
  optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
119
127
  optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
128
+ temp/check_model.py,sha256=rhnqTRUaq2VyyqXHuLBxM7ZEoJwf0ExlSJnvkMDYPRU,1710
129
+ temp/check_pdf_content.py,sha256=adFIVMI6m36l0R3112ESt9oqX_zM-mhDvTusBcjqBy8,233
130
+ temp/checkbox_checks.py,sha256=XsR6bmaVNiSH-HsDzthtJcz4vcKOYJ5IbAi6vtfo7P0,20293
131
+ temp/checkbox_simple.py,sha256=d1NiE1IbGSG2nMtvFPgBgxF6OSZLm7TIC2nkrDSG8fE,3975
132
+ temp/checkbox_ux_ideas.py,sha256=Pa1NXi-wmtEGAPb1RW9fiQ4mcKf1G88OMm7zIABqGoI,15302
133
+ temp/context_manager_prototype.py,sha256=uMRO7xrWsbxBUCUaY7xGtEFcIj-QT9j2DQ2JMkinW2M,6150
134
+ temp/convert_to_hf.py,sha256=DMqZAWvOA_StujfSmkD-hJCnCy4dlvyjIOl2_1l_mOg,1881
135
+ temp/demo_text_closest.py,sha256=qRnAynLhF-P_q9t_WFaxE_5QLbZMiMp4v9llFipfqZA,2721
120
136
  temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
137
+ temp/inspect_model.py,sha256=AaRDqhRH9kqcUcfmrSUsNw1xWkxargNA3BWIvzxwHGM,1692
138
+ temp/rtdetr_dinov2_test.py,sha256=9FUL3hiHweYJIbEeH0AZTrLJSnWatxwymNG9CZEXrGA,1553
139
+ temp/test_closest_debug.py,sha256=QP53iAEwy2KRSZlwH2eQ07JILxRgfYwBrvro9i2ITXQ,809
140
+ temp/test_closest_debug2.py,sha256=Hbh0nkG7xS0NfayH2Qg_IzLkeKh6mH-OWo0o2i9777I,740
141
+ temp/test_context_exploration.py,sha256=DlFXDuKavvUskLjHMwqPVGGrPpYT8zBHErX1uzHnWxw,2611
121
142
  temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
122
143
  temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
144
+ temp/test_durham.py,sha256=A0J78TiVXCLHP4xy67G6GlOtrE2sgWP7FsLMH6fjBaA,916
145
+ temp/test_empty_string.py,sha256=FovOW7hDwkShVT7nYVH_UMv3IwQjX0pHhxC9WHAfo2U,470
123
146
  temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
124
147
  temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
125
148
  temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
@@ -133,6 +156,7 @@ temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,141
133
156
  temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
134
157
  temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
135
158
  temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
159
+ temp/test_similarity.py,sha256=2Nv8QbSwjaBwMwJsvpZgwOiMIRxPMux5QeZE_rgQ63A,441
136
160
  temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
137
161
  temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
138
162
  temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
@@ -148,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
148
172
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
149
173
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
150
174
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
151
- natural_pdf-0.2.17.dist-info/METADATA,sha256=8K5PCwh_OuI8vkWRLChHeT-LuEd0sRmigkRm55ZNeDo,6960
152
- natural_pdf-0.2.17.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
153
- natural_pdf-0.2.17.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
154
- natural_pdf-0.2.17.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
155
- natural_pdf-0.2.17.dist-info/RECORD,,
175
+ natural_pdf-0.2.19.dist-info/METADATA,sha256=vtMsWwMW9cR2LdQhdDFhDG4WWIkctrT7_3P7klvyJ-8,6960
176
+ natural_pdf-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
177
+ natural_pdf-0.2.19.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
178
+ natural_pdf-0.2.19.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
179
+ natural_pdf-0.2.19.dist-info/RECORD,,
temp/check_model.py ADDED
@@ -0,0 +1,49 @@
1
+ from ultralytics import RTDETR
2
+ import os
3
+
4
+ model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"
5
+ print(f"Model exists: {os.path.exists(model_path)}")
6
+
7
+ try:
8
+ model = RTDETR(model_path)
9
+ print(f"Model loaded successfully")
10
+ print(f"Model names: {model.names}")
11
+ print(f"Model task: {model.task}")
12
+
13
+ # Try to get architecture info
14
+ if hasattr(model.model, 'yaml'):
15
+ print(f"Model yaml: {model.model.yaml}")
16
+
17
+ # Check the model structure
18
+ if hasattr(model.model, 'model'):
19
+ for i, module in enumerate(model.model.model):
20
+ print(f"Layer {i}: {module}")
21
+ if i > 5: # Just show first few layers
22
+ break
23
+
24
+ except Exception as e:
25
+ print(f"Error: {e}")
26
+
27
+ # Try loading as generic model to inspect
28
+ import torch
29
+ try:
30
+ checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
31
+ print(f"\nCheckpoint keys: {list(checkpoint.keys())}")
32
+
33
+ # Check for model configuration
34
+ if 'model' in checkpoint and hasattr(checkpoint['model'], 'yaml'):
35
+ print(f"Model yaml: {checkpoint['model'].yaml}")
36
+
37
+ # Check train args for model info
38
+ if 'train_args' in checkpoint:
39
+ args = checkpoint['train_args']
40
+ print(f"\nTraining args:")
41
+ print(f" Model: {getattr(args, 'model', 'Unknown')}")
42
+ print(f" Task: {getattr(args, 'task', 'Unknown')}")
43
+
44
+ # Check epoch info
45
+ if 'epoch' in checkpoint:
46
+ print(f" Epochs trained: {checkpoint['epoch']}")
47
+
48
+ except Exception as e2:
49
+ print(f"Error loading checkpoint: {e2}")
@@ -0,0 +1,9 @@
1
+ from natural_pdf import PDF
2
+
3
+ pdf = PDF('pdfs/01-practice.pdf')
4
+ page = pdf.pages[0]
5
+ texts = page.find_all('text')
6
+ print(f'Total text elements: {len(texts)}')
7
+ print('Sample texts:')
8
+ for t in texts[:20]:
9
+ print(f' - {repr(t.text)}')