natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. natural_pdf/__init__.py +8 -0
  2. natural_pdf/analyzers/checkbox/__init__.py +6 -0
  3. natural_pdf/analyzers/checkbox/base.py +265 -0
  4. natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  5. natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  6. natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  7. natural_pdf/analyzers/checkbox/mixin.py +95 -0
  8. natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  9. natural_pdf/collections/mixins.py +14 -5
  10. natural_pdf/core/element_manager.py +5 -1
  11. natural_pdf/core/page.py +61 -0
  12. natural_pdf/core/page_collection.py +41 -1
  13. natural_pdf/core/pdf.py +24 -1
  14. natural_pdf/describe/base.py +20 -0
  15. natural_pdf/elements/base.py +152 -10
  16. natural_pdf/elements/element_collection.py +41 -2
  17. natural_pdf/elements/region.py +115 -2
  18. natural_pdf/judge.py +1509 -0
  19. natural_pdf/selectors/parser.py +42 -1
  20. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
  21. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
  22. temp/check_model.py +49 -0
  23. temp/check_pdf_content.py +9 -0
  24. temp/checkbox_checks.py +590 -0
  25. temp/checkbox_simple.py +117 -0
  26. temp/checkbox_ux_ideas.py +400 -0
  27. temp/context_manager_prototype.py +177 -0
  28. temp/convert_to_hf.py +60 -0
  29. temp/demo_text_closest.py +66 -0
  30. temp/inspect_model.py +43 -0
  31. temp/rtdetr_dinov2_test.py +49 -0
  32. temp/test_closest_debug.py +26 -0
  33. temp/test_closest_debug2.py +22 -0
  34. temp/test_context_exploration.py +85 -0
  35. temp/test_durham.py +30 -0
  36. temp/test_empty_string.py +16 -0
  37. temp/test_similarity.py +15 -0
  38. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
  39. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
  40. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
  41. {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
@@ -7,11 +7,12 @@ selectors with extensions for PDF-specific attributes and spatial relationships.
7
7
  The parser handles:
8
8
  - Basic element selectors (text, rect, line, image)
9
9
  - Attribute selectors with comparisons ([size>12], [color="red"])
10
- - Pseudo-selectors for text content (:contains(), :regex())
10
+ - Pseudo-selectors for text content (:contains(), :regex(), :closest())
11
11
  - Spatial relationship selectors (:above(), :below(), :near())
12
12
  - Color matching with Delta E distance calculations
13
13
  - Logical operators (AND, OR) and grouping
14
14
  - Complex nested expressions with proper precedence
15
+ - Fuzzy text matching for OCR errors (:closest())
15
16
 
16
17
  Key features:
17
18
  - Safe value parsing without eval() for security
@@ -25,9 +26,12 @@ This enables powerful document navigation like:
25
26
  - page.find_all('rect[color~="red"]:above(text:contains("Total"))')
26
27
  - page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
27
28
  - page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
29
+ - page.find('text:closest("Date(s) of Review")') # Fuzzy match for OCR errors
30
+ - page.find('text:closest("Invoice Date@0.9")') # 90% similarity threshold
28
31
  """
29
32
 
30
33
  import ast
34
+ import difflib
31
35
  import logging
32
36
  import re
33
37
  from collections import Counter
@@ -691,6 +695,9 @@ def _build_filter_list(
691
695
  return getattr(element, "region_type", "").lower().replace(" ", "_")
692
696
  elif name == "model":
693
697
  return getattr(element, "model", None)
698
+ elif name == "checked":
699
+ # Map 'checked' attribute to is_checked for checkboxes
700
+ return getattr(element, "is_checked", None)
694
701
  else:
695
702
  return getattr(element, python_name, None)
696
703
  else:
@@ -724,6 +731,29 @@ def _build_filter_list(
724
731
  ]:
725
732
  op_desc = f"= {value!r} (exact color)"
726
733
  compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
734
+ # For boolean attributes, handle string/bool comparison
735
+ elif name in ["checked", "is_checked", "bold", "italic"]:
736
+
737
+ def bool_compare(el_val, sel_val):
738
+ # Convert both to boolean for comparison
739
+ if isinstance(el_val, bool):
740
+ el_bool = el_val
741
+ else:
742
+ el_bool = str(el_val).lower() in ("true", "1", "yes")
743
+
744
+ if isinstance(sel_val, bool):
745
+ sel_bool = sel_val
746
+ else:
747
+ sel_bool = str(sel_val).lower() in ("true", "1", "yes")
748
+
749
+ # Debug logging
750
+ logger.debug(
751
+ f"Boolean comparison: el_val={el_val} ({type(el_val)}) -> {el_bool}, sel_val={sel_val} ({type(sel_val)}) -> {sel_bool}"
752
+ )
753
+
754
+ return el_bool == sel_bool
755
+
756
+ compare_func = bool_compare
727
757
  else:
728
758
  compare_func = lambda el_val, sel_val: el_val == sel_val
729
759
  elif op == "!=":
@@ -894,6 +924,13 @@ def _build_filter_list(
894
924
 
895
925
  filter_lambda = regex_check
896
926
 
927
+ # --- Handle :closest pseudo-class for fuzzy text matching --- #
928
+ elif name == "closest" and args is not None:
929
+ # Note: :closest is handled specially in the page._apply_selector method
930
+ # It doesn't filter elements here, but marks them for special processing
931
+ # This allows us to first check :contains matches, then sort by similarity
932
+ filter_lambda = lambda el: True # Accept all elements for now
933
+
897
934
  # --- Handle :startswith and :starts-with (alias) --- #
898
935
  elif name in ("starts-with", "startswith") and args is not None:
899
936
  filter_name = f"pseudo-class :{name}({args!r})"
@@ -936,6 +973,10 @@ def _build_filter_list(
936
973
  filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
937
974
  elif name == "vertical":
938
975
  filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
976
+ elif name == "checked":
977
+ filter_lambda = lambda el: hasattr(el, "is_checked") and el.is_checked
978
+ elif name == "unchecked":
979
+ filter_lambda = lambda el: hasattr(el, "is_checked") and not el.is_checked
939
980
 
940
981
  # --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
941
982
  elif name in ("strike", "strikethrough", "strikeout"):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.18
3
+ Version: 0.2.19
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,6 @@
1
- natural_pdf/__init__.py,sha256=N9ubwsFpmPj7WHA6Uewgn6IbmU2r0BeUGIdIhmTl6nw,4701
1
+ natural_pdf/__init__.py,sha256=JPuQBMN0mZPnPB4z-RAHm8jPSVLKbgw4gxfSXyEgdX4,4957
2
2
  natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
3
+ natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
3
4
  natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
4
5
  natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
5
6
  natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
@@ -7,6 +8,13 @@ natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLS
7
8
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
8
9
  natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
9
10
  natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
11
+ natural_pdf/analyzers/checkbox/__init__.py,sha256=2ZWAIUoRqgGlwVeEU0JNMkQ-mO4nxWNFQ6fLOx0jfRQ,243
12
+ natural_pdf/analyzers/checkbox/base.py,sha256=Hu2WrlaG2gNbTFa8fYSzjyUFmOZvbdTdonnMd9lwl44,9610
13
+ natural_pdf/analyzers/checkbox/checkbox_analyzer.py,sha256=rDO7YIT_fAd5BmpXMOUnZaSHUmFuXqVXZK-HNyS3Ezw,13647
14
+ natural_pdf/analyzers/checkbox/checkbox_manager.py,sha256=ZR8yfhWiykxBe6h4smsDuY-So47j0tcGEXhF0FEIorE,5959
15
+ natural_pdf/analyzers/checkbox/checkbox_options.py,sha256=-2V3_yduBhD4iVjn-EhgK7D6qA2xH9NJorfgDcar6PU,2094
16
+ natural_pdf/analyzers/checkbox/mixin.py,sha256=KYnr_Xx4U2bp6c35GG2hk6yX_z4NgX7ZW9zT1xmEKEw,3710
17
+ natural_pdf/analyzers/checkbox/rtdetr.py,sha256=Oxz4XVJKDuVWzBQDqM_hqslCH66n1HJg4_hdXS4aAs4,6944
10
18
  natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
11
19
  natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
12
20
  natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
@@ -23,28 +31,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
23
31
  natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
24
32
  natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
25
33
  natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
26
- natural_pdf/collections/mixins.py,sha256=Se2C5AcpP9B5E0d0pIrey6-f_P32tAXTK4M7666MNj0,5688
34
+ natural_pdf/collections/mixins.py,sha256=ZsS61WFu6Ipree4O_zFECKWoKHC3pYVwZU7tUP6OTOQ,6145
27
35
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
28
- natural_pdf/core/element_manager.py,sha256=619R97OtMd7uhaax7fZNJmhy9GxSs9HCNP4OzGgP828,55882
36
+ natural_pdf/core/element_manager.py,sha256=7fy65zzD42LvDJKj8X1pbJAQYL5lk9wGdTtgE0rsPpA,56057
29
37
  natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
30
- natural_pdf/core/page.py,sha256=-78LuCbU9AEd4MGMm7_yoBl9rMAvOvrbPWcVsrMoe0s,159986
31
- natural_pdf/core/page_collection.py,sha256=bLZ3TqTQbmP3oYrbfEi7HUoPMbcGplEtUMZ3Z1y7fuw,66728
38
+ natural_pdf/core/page.py,sha256=NiJxBHLx4Otwr7iMza1gsEAfSqTMvTu_6zex4aocZOw,162710
39
+ natural_pdf/core/page_collection.py,sha256=OjIS9iEtFrHw0liJHGI-CFwZbHHA4Lt7vK69wN76Igg,68255
32
40
  natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
33
- natural_pdf/core/pdf.py,sha256=i8dYCimL_k5FV6BmPI1a2Dk7XZfwLP8TziXr2n3O_fI,105639
41
+ natural_pdf/core/pdf.py,sha256=Cc4A6b49apGfxk7DFcN4oCfoiYmpnH2-jFf_Gb6B5mg,106345
34
42
  natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
35
43
  natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
36
44
  natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
37
- natural_pdf/describe/base.py,sha256=M4TGXR8ppTvznTnA1ZDgMQMkDpgu1pwGMNaOcgHf2iY,20154
45
+ natural_pdf/describe/base.py,sha256=pU_fDkWG_hQlne2nNIdOC1xXyTrPc-kmTwd685nZiSk,21024
38
46
  natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
39
47
  natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
40
48
  natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
41
49
  natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
42
- natural_pdf/elements/base.py,sha256=YYdoss63yv3IzQeuHbNypo7VLz2UJDFK5b6lqQe5tR8,76090
43
- natural_pdf/elements/element_collection.py,sha256=dlKoIaqmK_pC_cEcTX9LA2bNbZmc8iXcTTDfpHDlyUM,139812
50
+ natural_pdf/elements/base.py,sha256=NunXdrZW53iG-Q4Pe9DHmWpzigHg-JrkjOLZ016I_b0,82679
51
+ natural_pdf/elements/element_collection.py,sha256=z3gRONShw6MrdTJYXVjBi9uNr3dNQtRXgyYKm-VPB7A,141371
44
52
  natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
45
53
  natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
46
54
  natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
47
- natural_pdf/elements/region.py,sha256=qJ86iToSjrCUjVrEbO0M0S1nTuZDW9tpI4jF9T5xJKs,168777
55
+ natural_pdf/elements/region.py,sha256=ql_pZvfjbT0j2zekqMrGBWDzNVo4erNiQ9aK67J7KTw,173382
48
56
  natural_pdf/elements/text.py,sha256=Jo4gnrsJe1PStdoWF2Bt8RSeSmOcfA9DxvMJl7EoAmI,21344
49
57
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
50
58
  natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
@@ -85,7 +93,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
85
93
  natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
86
94
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
87
95
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
88
- natural_pdf/selectors/parser.py,sha256=HbPgmtXXA4lRSAVkCzw6vpCi3oh66e-53yUEPhYLGX8,46909
96
+ natural_pdf/selectors/parser.py,sha256=wXlTL2t05xj47sMoG-vhjQFyEVou8NZie7wKKm60iMA,49063
89
97
  natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
90
98
  natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
91
99
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -111,15 +119,30 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
111
119
  natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
112
120
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
113
121
  natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
114
- natural_pdf-0.2.18.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
122
+ natural_pdf-0.2.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
115
123
  optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
116
124
  optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
117
125
  optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
118
126
  optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
119
127
  optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
128
+ temp/check_model.py,sha256=rhnqTRUaq2VyyqXHuLBxM7ZEoJwf0ExlSJnvkMDYPRU,1710
129
+ temp/check_pdf_content.py,sha256=adFIVMI6m36l0R3112ESt9oqX_zM-mhDvTusBcjqBy8,233
130
+ temp/checkbox_checks.py,sha256=XsR6bmaVNiSH-HsDzthtJcz4vcKOYJ5IbAi6vtfo7P0,20293
131
+ temp/checkbox_simple.py,sha256=d1NiE1IbGSG2nMtvFPgBgxF6OSZLm7TIC2nkrDSG8fE,3975
132
+ temp/checkbox_ux_ideas.py,sha256=Pa1NXi-wmtEGAPb1RW9fiQ4mcKf1G88OMm7zIABqGoI,15302
133
+ temp/context_manager_prototype.py,sha256=uMRO7xrWsbxBUCUaY7xGtEFcIj-QT9j2DQ2JMkinW2M,6150
134
+ temp/convert_to_hf.py,sha256=DMqZAWvOA_StujfSmkD-hJCnCy4dlvyjIOl2_1l_mOg,1881
135
+ temp/demo_text_closest.py,sha256=qRnAynLhF-P_q9t_WFaxE_5QLbZMiMp4v9llFipfqZA,2721
120
136
  temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
137
+ temp/inspect_model.py,sha256=AaRDqhRH9kqcUcfmrSUsNw1xWkxargNA3BWIvzxwHGM,1692
138
+ temp/rtdetr_dinov2_test.py,sha256=9FUL3hiHweYJIbEeH0AZTrLJSnWatxwymNG9CZEXrGA,1553
139
+ temp/test_closest_debug.py,sha256=QP53iAEwy2KRSZlwH2eQ07JILxRgfYwBrvro9i2ITXQ,809
140
+ temp/test_closest_debug2.py,sha256=Hbh0nkG7xS0NfayH2Qg_IzLkeKh6mH-OWo0o2i9777I,740
141
+ temp/test_context_exploration.py,sha256=DlFXDuKavvUskLjHMwqPVGGrPpYT8zBHErX1uzHnWxw,2611
121
142
  temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
122
143
  temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
144
+ temp/test_durham.py,sha256=A0J78TiVXCLHP4xy67G6GlOtrE2sgWP7FsLMH6fjBaA,916
145
+ temp/test_empty_string.py,sha256=FovOW7hDwkShVT7nYVH_UMv3IwQjX0pHhxC9WHAfo2U,470
123
146
  temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
124
147
  temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
125
148
  temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
@@ -133,6 +156,7 @@ temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,141
133
156
  temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
134
157
  temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
135
158
  temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
159
+ temp/test_similarity.py,sha256=2Nv8QbSwjaBwMwJsvpZgwOiMIRxPMux5QeZE_rgQ63A,441
136
160
  temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
137
161
  temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
138
162
  temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
@@ -148,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
148
172
  tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
149
173
  tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
150
174
  tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
151
- natural_pdf-0.2.18.dist-info/METADATA,sha256=_hwRZyYPDD_bl-dRHE2KLo8oeo2TPxVxGi66grA-ZIs,6960
152
- natural_pdf-0.2.18.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
153
- natural_pdf-0.2.18.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
154
- natural_pdf-0.2.18.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
155
- natural_pdf-0.2.18.dist-info/RECORD,,
175
+ natural_pdf-0.2.19.dist-info/METADATA,sha256=vtMsWwMW9cR2LdQhdDFhDG4WWIkctrT7_3P7klvyJ-8,6960
176
+ natural_pdf-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
177
+ natural_pdf-0.2.19.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
178
+ natural_pdf-0.2.19.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
179
+ natural_pdf-0.2.19.dist-info/RECORD,,
temp/check_model.py ADDED
@@ -0,0 +1,49 @@
1
+ from ultralytics import RTDETR
2
+ import os
3
+
4
+ model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"
5
+ print(f"Model exists: {os.path.exists(model_path)}")
6
+
7
+ try:
8
+ model = RTDETR(model_path)
9
+ print(f"Model loaded successfully")
10
+ print(f"Model names: {model.names}")
11
+ print(f"Model task: {model.task}")
12
+
13
+ # Try to get architecture info
14
+ if hasattr(model.model, 'yaml'):
15
+ print(f"Model yaml: {model.model.yaml}")
16
+
17
+ # Check the model structure
18
+ if hasattr(model.model, 'model'):
19
+ for i, module in enumerate(model.model.model):
20
+ print(f"Layer {i}: {module}")
21
+ if i > 5: # Just show first few layers
22
+ break
23
+
24
+ except Exception as e:
25
+ print(f"Error: {e}")
26
+
27
+ # Try loading as generic model to inspect
28
+ import torch
29
+ try:
30
+ checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
31
+ print(f"\nCheckpoint keys: {list(checkpoint.keys())}")
32
+
33
+ # Check for model configuration
34
+ if 'model' in checkpoint and hasattr(checkpoint['model'], 'yaml'):
35
+ print(f"Model yaml: {checkpoint['model'].yaml}")
36
+
37
+ # Check train args for model info
38
+ if 'train_args' in checkpoint:
39
+ args = checkpoint['train_args']
40
+ print(f"\nTraining args:")
41
+ print(f" Model: {getattr(args, 'model', 'Unknown')}")
42
+ print(f" Task: {getattr(args, 'task', 'Unknown')}")
43
+
44
+ # Check epoch info
45
+ if 'epoch' in checkpoint:
46
+ print(f" Epochs trained: {checkpoint['epoch']}")
47
+
48
+ except Exception as e2:
49
+ print(f"Error loading checkpoint: {e2}")
@@ -0,0 +1,9 @@
1
+ from natural_pdf import PDF
2
+
3
+ pdf = PDF('pdfs/01-practice.pdf')
4
+ page = pdf.pages[0]
5
+ texts = page.find_all('text')
6
+ print(f'Total text elements: {len(texts)}')
7
+ print('Sample texts:')
8
+ for t in texts[:20]:
9
+ print(f' - {repr(t.text)}')