natural-pdf 0.2.18__py3-none-any.whl → 0.2.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +8 -0
- natural_pdf/analyzers/checkbox/__init__.py +6 -0
- natural_pdf/analyzers/checkbox/base.py +265 -0
- natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
- natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
- natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
- natural_pdf/analyzers/checkbox/mixin.py +95 -0
- natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
- natural_pdf/collections/mixins.py +14 -5
- natural_pdf/core/element_manager.py +5 -1
- natural_pdf/core/page.py +61 -0
- natural_pdf/core/page_collection.py +41 -1
- natural_pdf/core/pdf.py +24 -1
- natural_pdf/describe/base.py +20 -0
- natural_pdf/elements/base.py +152 -10
- natural_pdf/elements/element_collection.py +41 -2
- natural_pdf/elements/region.py +115 -2
- natural_pdf/judge.py +1509 -0
- natural_pdf/selectors/parser.py +42 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/METADATA +1 -1
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/RECORD +41 -17
- temp/check_model.py +49 -0
- temp/check_pdf_content.py +9 -0
- temp/checkbox_checks.py +590 -0
- temp/checkbox_simple.py +117 -0
- temp/checkbox_ux_ideas.py +400 -0
- temp/context_manager_prototype.py +177 -0
- temp/convert_to_hf.py +60 -0
- temp/demo_text_closest.py +66 -0
- temp/inspect_model.py +43 -0
- temp/rtdetr_dinov2_test.py +49 -0
- temp/test_closest_debug.py +26 -0
- temp/test_closest_debug2.py +22 -0
- temp/test_context_exploration.py +85 -0
- temp/test_durham.py +30 -0
- temp/test_empty_string.py +16 -0
- temp/test_similarity.py +15 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/WHEEL +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.2.18.dist-info → natural_pdf-0.2.19.dist-info}/top_level.txt +0 -0
natural_pdf/selectors/parser.py
CHANGED
@@ -7,11 +7,12 @@ selectors with extensions for PDF-specific attributes and spatial relationships.
|
|
7
7
|
The parser handles:
|
8
8
|
- Basic element selectors (text, rect, line, image)
|
9
9
|
- Attribute selectors with comparisons ([size>12], [color="red"])
|
10
|
-
- Pseudo-selectors for text content (:contains(), :regex())
|
10
|
+
- Pseudo-selectors for text content (:contains(), :regex(), :closest())
|
11
11
|
- Spatial relationship selectors (:above(), :below(), :near())
|
12
12
|
- Color matching with Delta E distance calculations
|
13
13
|
- Logical operators (AND, OR) and grouping
|
14
14
|
- Complex nested expressions with proper precedence
|
15
|
+
- Fuzzy text matching for OCR errors (:closest())
|
15
16
|
|
16
17
|
Key features:
|
17
18
|
- Safe value parsing without eval() for security
|
@@ -25,9 +26,12 @@ This enables powerful document navigation like:
|
|
25
26
|
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
27
|
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
27
28
|
- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
|
29
|
+
- page.find('text:closest("Date(s) of Review")') # Fuzzy match for OCR errors
|
30
|
+
- page.find('text:closest("Invoice Date@0.9")') # 90% similarity threshold
|
28
31
|
"""
|
29
32
|
|
30
33
|
import ast
|
34
|
+
import difflib
|
31
35
|
import logging
|
32
36
|
import re
|
33
37
|
from collections import Counter
|
@@ -691,6 +695,9 @@ def _build_filter_list(
|
|
691
695
|
return getattr(element, "region_type", "").lower().replace(" ", "_")
|
692
696
|
elif name == "model":
|
693
697
|
return getattr(element, "model", None)
|
698
|
+
elif name == "checked":
|
699
|
+
# Map 'checked' attribute to is_checked for checkboxes
|
700
|
+
return getattr(element, "is_checked", None)
|
694
701
|
else:
|
695
702
|
return getattr(element, python_name, None)
|
696
703
|
else:
|
@@ -724,6 +731,29 @@ def _build_filter_list(
|
|
724
731
|
]:
|
725
732
|
op_desc = f"= {value!r} (exact color)"
|
726
733
|
compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
|
734
|
+
# For boolean attributes, handle string/bool comparison
|
735
|
+
elif name in ["checked", "is_checked", "bold", "italic"]:
|
736
|
+
|
737
|
+
def bool_compare(el_val, sel_val):
|
738
|
+
# Convert both to boolean for comparison
|
739
|
+
if isinstance(el_val, bool):
|
740
|
+
el_bool = el_val
|
741
|
+
else:
|
742
|
+
el_bool = str(el_val).lower() in ("true", "1", "yes")
|
743
|
+
|
744
|
+
if isinstance(sel_val, bool):
|
745
|
+
sel_bool = sel_val
|
746
|
+
else:
|
747
|
+
sel_bool = str(sel_val).lower() in ("true", "1", "yes")
|
748
|
+
|
749
|
+
# Debug logging
|
750
|
+
logger.debug(
|
751
|
+
f"Boolean comparison: el_val={el_val} ({type(el_val)}) -> {el_bool}, sel_val={sel_val} ({type(sel_val)}) -> {sel_bool}"
|
752
|
+
)
|
753
|
+
|
754
|
+
return el_bool == sel_bool
|
755
|
+
|
756
|
+
compare_func = bool_compare
|
727
757
|
else:
|
728
758
|
compare_func = lambda el_val, sel_val: el_val == sel_val
|
729
759
|
elif op == "!=":
|
@@ -894,6 +924,13 @@ def _build_filter_list(
|
|
894
924
|
|
895
925
|
filter_lambda = regex_check
|
896
926
|
|
927
|
+
# --- Handle :closest pseudo-class for fuzzy text matching --- #
|
928
|
+
elif name == "closest" and args is not None:
|
929
|
+
# Note: :closest is handled specially in the page._apply_selector method
|
930
|
+
# It doesn't filter elements here, but marks them for special processing
|
931
|
+
# This allows us to first check :contains matches, then sort by similarity
|
932
|
+
filter_lambda = lambda el: True # Accept all elements for now
|
933
|
+
|
897
934
|
# --- Handle :startswith and :starts-with (alias) --- #
|
898
935
|
elif name in ("starts-with", "startswith") and args is not None:
|
899
936
|
filter_name = f"pseudo-class :{name}({args!r})"
|
@@ -936,6 +973,10 @@ def _build_filter_list(
|
|
936
973
|
filter_lambda = lambda el: hasattr(el, "is_horizontal") and el.is_horizontal
|
937
974
|
elif name == "vertical":
|
938
975
|
filter_lambda = lambda el: hasattr(el, "is_vertical") and el.is_vertical
|
976
|
+
elif name == "checked":
|
977
|
+
filter_lambda = lambda el: hasattr(el, "is_checked") and el.is_checked
|
978
|
+
elif name == "unchecked":
|
979
|
+
filter_lambda = lambda el: hasattr(el, "is_checked") and not el.is_checked
|
939
980
|
|
940
981
|
# --- New: :strike / :strikethrough / :strikeout pseudo-classes --- #
|
941
982
|
elif name in ("strike", "strikethrough", "strikeout"):
|
@@ -1,5 +1,6 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=JPuQBMN0mZPnPB4z-RAHm8jPSVLKbgw4gxfSXyEgdX4,4957
|
2
2
|
natural_pdf/cli.py,sha256=0zO9ZoRiP8JmyGBaVavrMATnvbARWTl7WD2PEefu9BM,4061
|
3
|
+
natural_pdf/judge.py,sha256=mRPJfdIkkL_Y6uQXnb3Wtrna04XlhPrDvxPrDiVevH4,58838
|
3
4
|
natural_pdf/text_mixin.py,sha256=eFCiHj6Okcw3aum4955BepcI2NPRalkf9UFFVTc_H30,4012
|
4
5
|
natural_pdf/analyzers/__init__.py,sha256=3XGoNq3OgiVkZP7tOdeP5XVUl7fDgyztdA8DlOcMLXg,1138
|
5
6
|
natural_pdf/analyzers/guides.py,sha256=BqFgt-bRSOkEoFCvNsYyY8j__00X-8DJ_TLb2Hx9qsQ,202430
|
@@ -7,6 +8,13 @@ natural_pdf/analyzers/shape_detection_mixin.py,sha256=mgpyJ4jIulz9l9HCqThabJIsLS
|
|
7
8
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
8
9
|
natural_pdf/analyzers/text_structure.py,sha256=3WWusi-BI0krUnJxB05DD6XmKj5qRNvQBqH7zOQGm1M,28451
|
9
10
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
11
|
+
natural_pdf/analyzers/checkbox/__init__.py,sha256=2ZWAIUoRqgGlwVeEU0JNMkQ-mO4nxWNFQ6fLOx0jfRQ,243
|
12
|
+
natural_pdf/analyzers/checkbox/base.py,sha256=Hu2WrlaG2gNbTFa8fYSzjyUFmOZvbdTdonnMd9lwl44,9610
|
13
|
+
natural_pdf/analyzers/checkbox/checkbox_analyzer.py,sha256=rDO7YIT_fAd5BmpXMOUnZaSHUmFuXqVXZK-HNyS3Ezw,13647
|
14
|
+
natural_pdf/analyzers/checkbox/checkbox_manager.py,sha256=ZR8yfhWiykxBe6h4smsDuY-So47j0tcGEXhF0FEIorE,5959
|
15
|
+
natural_pdf/analyzers/checkbox/checkbox_options.py,sha256=-2V3_yduBhD4iVjn-EhgK7D6qA2xH9NJorfgDcar6PU,2094
|
16
|
+
natural_pdf/analyzers/checkbox/mixin.py,sha256=KYnr_Xx4U2bp6c35GG2hk6yX_z4NgX7ZW9zT1xmEKEw,3710
|
17
|
+
natural_pdf/analyzers/checkbox/rtdetr.py,sha256=Oxz4XVJKDuVWzBQDqM_hqslCH66n1HJg4_hdXS4aAs4,6944
|
10
18
|
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
11
19
|
natural_pdf/analyzers/layout/base.py,sha256=F5xPOJcI65N4nxwm0szvhtbDD6lVMqWDut8PSkTCobU,8349
|
12
20
|
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
@@ -23,28 +31,28 @@ natural_pdf/analyzers/layout/yolo.py,sha256=2Iz2-WsMy--ftkZQ8j5PGqp_1fTD7Mskl2kN
|
|
23
31
|
natural_pdf/classification/manager.py,sha256=BaqBL9GeMvYgoJsiQeI2J8aUKQ5Qxu_ELRvmCWquld8,22172
|
24
32
|
natural_pdf/classification/mixin.py,sha256=CXygXXhe_qx1563SmIjiu4uSnZkxCkuRR4fGvLokS2w,9416
|
25
33
|
natural_pdf/classification/results.py,sha256=5ha77CxK0GYwkBMJbvUBZkBjsL5GpOveIZDK9nO4j8I,3239
|
26
|
-
natural_pdf/collections/mixins.py,sha256=
|
34
|
+
natural_pdf/collections/mixins.py,sha256=ZsS61WFu6Ipree4O_zFECKWoKHC3pYVwZU7tUP6OTOQ,6145
|
27
35
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
28
|
-
natural_pdf/core/element_manager.py,sha256=
|
36
|
+
natural_pdf/core/element_manager.py,sha256=7fy65zzD42LvDJKj8X1pbJAQYL5lk9wGdTtgE0rsPpA,56057
|
29
37
|
natural_pdf/core/highlighting_service.py,sha256=wEV-koqHoHf7S3wZ3j8D2L-ucGp3Nd0YhhStz9yqeLc,70406
|
30
|
-
natural_pdf/core/page.py,sha256
|
31
|
-
natural_pdf/core/page_collection.py,sha256=
|
38
|
+
natural_pdf/core/page.py,sha256=NiJxBHLx4Otwr7iMza1gsEAfSqTMvTu_6zex4aocZOw,162710
|
39
|
+
natural_pdf/core/page_collection.py,sha256=OjIS9iEtFrHw0liJHGI-CFwZbHHA4Lt7vK69wN76Igg,68255
|
32
40
|
natural_pdf/core/page_groupby.py,sha256=V2e_RNlHaasUzYm2h2vNJI7_aV_fl3_pg7kU3F2j0z8,8218
|
33
|
-
natural_pdf/core/pdf.py,sha256=
|
41
|
+
natural_pdf/core/pdf.py,sha256=Cc4A6b49apGfxk7DFcN4oCfoiYmpnH2-jFf_Gb6B5mg,106345
|
34
42
|
natural_pdf/core/pdf_collection.py,sha256=s3ogu4CEHrHMTRqQMJUKJZ-9Ii8b_B9dWbVLTFj0s7g,34992
|
35
43
|
natural_pdf/core/render_spec.py,sha256=y9QkMiIvWaKiEBlV0TjyldADIEUY3YfWLQXxStHu1S4,15480
|
36
44
|
natural_pdf/describe/__init__.py,sha256=kIV7ORmWWB1SAur7nK2aAwR-wHqSedhKfUsaUl4hG0A,586
|
37
|
-
natural_pdf/describe/base.py,sha256=
|
45
|
+
natural_pdf/describe/base.py,sha256=pU_fDkWG_hQlne2nNIdOC1xXyTrPc-kmTwd685nZiSk,21024
|
38
46
|
natural_pdf/describe/elements.py,sha256=3Y541z5TQ2obrfZFiFi1YQMsCt3oYrhMHpD5j1tuppw,12639
|
39
47
|
natural_pdf/describe/mixin.py,sha256=rkX14aGrSz7Jvxx8Rbxv3eSfbO-_29DipwpstrV2pDQ,3109
|
40
48
|
natural_pdf/describe/summary.py,sha256=cfT4ZQkeatCDAOwWPwhtEVXisNgk6E57fAXAnoRysSU,7645
|
41
49
|
natural_pdf/elements/__init__.py,sha256=ICNikmLeIEuSYypz-KnkBn8xR1hR7rge4hsa1KLkyWY,42
|
42
|
-
natural_pdf/elements/base.py,sha256=
|
43
|
-
natural_pdf/elements/element_collection.py,sha256=
|
50
|
+
natural_pdf/elements/base.py,sha256=NunXdrZW53iG-Q4Pe9DHmWpzigHg-JrkjOLZ016I_b0,82679
|
51
|
+
natural_pdf/elements/element_collection.py,sha256=z3gRONShw6MrdTJYXVjBi9uNr3dNQtRXgyYKm-VPB7A,141371
|
44
52
|
natural_pdf/elements/image.py,sha256=zu-P2Y8fRoEXf6IeZU0EYRWsgZ6I_a5vy1FA3VXTGkQ,1424
|
45
53
|
natural_pdf/elements/line.py,sha256=TFn7KXjPT_jUQyQyabU0F7XYU4dC-qadwodJMZF4DCU,3844
|
46
54
|
natural_pdf/elements/rect.py,sha256=kmUmhwnihd-aTweAO-LsngRDo5Iqmx7lcSa8ZBlE_2E,4544
|
47
|
-
natural_pdf/elements/region.py,sha256=
|
55
|
+
natural_pdf/elements/region.py,sha256=ql_pZvfjbT0j2zekqMrGBWDzNVo4erNiQ9aK67J7KTw,173382
|
48
56
|
natural_pdf/elements/text.py,sha256=Jo4gnrsJe1PStdoWF2Bt8RSeSmOcfA9DxvMJl7EoAmI,21344
|
49
57
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
50
58
|
natural_pdf/exporters/__init__.py,sha256=QffoARekR6WzXEd05oxOytly4qPdBizuIF-SUkeFpig,643
|
@@ -85,7 +93,7 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
85
93
|
natural_pdf/search/search_service_protocol.py,sha256=u8pbuWP96fnQEe6mnreY9DrdiDAHP6ZCY7phvSbFlP8,6697
|
86
94
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
87
95
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
88
|
-
natural_pdf/selectors/parser.py,sha256=
|
96
|
+
natural_pdf/selectors/parser.py,sha256=wXlTL2t05xj47sMoG-vhjQFyEVou8NZie7wKKm60iMA,49063
|
89
97
|
natural_pdf/tables/__init__.py,sha256=sCvCGbGsL6BiqlNxAYfVv003bIDLI11FmjHhaWfcU6w,104
|
90
98
|
natural_pdf/tables/result.py,sha256=-8ctA-jCJYSHtlfAoqTvhUwO5zSP2BQxxetAjqEsNyg,8665
|
91
99
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -111,15 +119,30 @@ natural_pdf/vision/similarity.py,sha256=HWmXDBNLSOlRWH-_1K3FVR7tSsRuMFqXZwrVhhg2
|
|
111
119
|
natural_pdf/vision/template_matching.py,sha256=91XQt5tp-vmcMX_4b2Bz-YwIAlb-hc8E5ih_qAHQuCk,7145
|
112
120
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
113
121
|
natural_pdf/widgets/viewer.py,sha256=KW3JogdR2TMg2ECUMYp8hwd060hfg8EsYBWxb5IEzBY,24942
|
114
|
-
natural_pdf-0.2.
|
122
|
+
natural_pdf-0.2.19.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
115
123
|
optimization/memory_comparison.py,sha256=0i_foFSRmppj-fY069qjwH36s_zkx-1L2ASAAlepWzA,6541
|
116
124
|
optimization/pdf_analyzer.py,sha256=HjrmTgu2qchxPeDckc5kjgxppGwd40UESrYS9Myj7pY,19352
|
117
125
|
optimization/performance_analysis.py,sha256=JBXnR9hc7Ix7YCnt3EJPSpsyqIUgKsc7GEffQ_TDCBk,13033
|
118
126
|
optimization/test_cleanup_methods.py,sha256=PmLOL4MRgvV0j_DW9W1TS8MsGGgu57QCuq6_5y7zK3s,6209
|
119
127
|
optimization/test_memory_fix.py,sha256=A3knK74fNhvHknDbLhbTmA276x1ifl-3ivJ_7BhVSTI,6170
|
128
|
+
temp/check_model.py,sha256=rhnqTRUaq2VyyqXHuLBxM7ZEoJwf0ExlSJnvkMDYPRU,1710
|
129
|
+
temp/check_pdf_content.py,sha256=adFIVMI6m36l0R3112ESt9oqX_zM-mhDvTusBcjqBy8,233
|
130
|
+
temp/checkbox_checks.py,sha256=XsR6bmaVNiSH-HsDzthtJcz4vcKOYJ5IbAi6vtfo7P0,20293
|
131
|
+
temp/checkbox_simple.py,sha256=d1NiE1IbGSG2nMtvFPgBgxF6OSZLm7TIC2nkrDSG8fE,3975
|
132
|
+
temp/checkbox_ux_ideas.py,sha256=Pa1NXi-wmtEGAPb1RW9fiQ4mcKf1G88OMm7zIABqGoI,15302
|
133
|
+
temp/context_manager_prototype.py,sha256=uMRO7xrWsbxBUCUaY7xGtEFcIj-QT9j2DQ2JMkinW2M,6150
|
134
|
+
temp/convert_to_hf.py,sha256=DMqZAWvOA_StujfSmkD-hJCnCy4dlvyjIOl2_1l_mOg,1881
|
135
|
+
temp/demo_text_closest.py,sha256=qRnAynLhF-P_q9t_WFaxE_5QLbZMiMp4v9llFipfqZA,2721
|
120
136
|
temp/fix_page_exclusions.py,sha256=YIj62zF38TdoBARAuSIvEbetl_JfXG-mp4v9p355qmo,1358
|
137
|
+
temp/inspect_model.py,sha256=AaRDqhRH9kqcUcfmrSUsNw1xWkxargNA3BWIvzxwHGM,1692
|
138
|
+
temp/rtdetr_dinov2_test.py,sha256=9FUL3hiHweYJIbEeH0AZTrLJSnWatxwymNG9CZEXrGA,1553
|
139
|
+
temp/test_closest_debug.py,sha256=QP53iAEwy2KRSZlwH2eQ07JILxRgfYwBrvro9i2ITXQ,809
|
140
|
+
temp/test_closest_debug2.py,sha256=Hbh0nkG7xS0NfayH2Qg_IzLkeKh6mH-OWo0o2i9777I,740
|
141
|
+
temp/test_context_exploration.py,sha256=DlFXDuKavvUskLjHMwqPVGGrPpYT8zBHErX1uzHnWxw,2611
|
121
142
|
temp/test_draw_guides.py,sha256=_eSSBElGHQkd2QD_KA_Okw70v0dlY5m-1-C5SQwKAJw,642
|
122
143
|
temp/test_draw_guides_interactive.py,sha256=FsH-2ZQGsGx_8QfVCWUAkLbOcJz-VfiwROzQD4AD7kQ,926
|
144
|
+
temp/test_durham.py,sha256=A0J78TiVXCLHP4xy67G6GlOtrE2sgWP7FsLMH6fjBaA,916
|
145
|
+
temp/test_empty_string.py,sha256=FovOW7hDwkShVT7nYVH_UMv3IwQjX0pHhxC9WHAfo2U,470
|
123
146
|
temp/test_exclusion_with_debug.py,sha256=CScxHvb43KrB5dzXuTOhuzjcBXZBdfYB5ygiKkEW26g,1393
|
124
147
|
temp/test_find_exclusions_fix.py,sha256=1l5aEqnElcl3kiykdtmJFlVxQ1xMKGm1UckGYEQg--c,2103
|
125
148
|
temp/test_find_exclusions_fix_no_recursion.py,sha256=qZspTBwxunRM93N_-fZ2fR5Lodj0ArQX3h10HlTXhfc,3592
|
@@ -133,6 +156,7 @@ temp/test_marker_order.py,sha256=TFZkMxRiNoZGVcdDivYnkIDNvwHaiyKUdYoy2rTTIiI,141
|
|
133
156
|
temp/test_original_exclusions_now_work.py,sha256=G6LmaF-P9Qhj0j4lT_4ncfCddllfP6L8F_x2prUBr9w,1904
|
134
157
|
temp/test_pdf_exclusions_with_guides.py,sha256=QaMl0frgKC8kCPQ2BUI8kqyvqsIjQPXKV_St1rK3zxg,2754
|
135
158
|
temp/test_region_exclusions_detailed.py,sha256=EftdW3JY3JH_LX5QlWKt-4drM-joPggK2fKUZRXVTMA,814
|
159
|
+
temp/test_similarity.py,sha256=2Nv8QbSwjaBwMwJsvpZgwOiMIRxPMux5QeZE_rgQ63A,441
|
136
160
|
temp/test_stripes_real_pdf.py,sha256=FIvDoJrnuioOMw1A0aTCCfZLeg99lusfe0Fb0MiqnhQ,2618
|
137
161
|
temp/test_vertical_stripes.py,sha256=Yf3TJfb_faqAFzlgb7i5u6dDHjF4UMSHIGM99vangRk,1877
|
138
162
|
temp/test_widget_functionality.py,sha256=jsEGHYK1dWWa8uEcfGRRj1ReHRMzNoIaMZU4d-o-Djs,2448
|
@@ -148,8 +172,8 @@ tools/bad_pdf_eval/llm_enrich.py,sha256=mCh4KGi1HmIkzGjj5rrHz1Osd7sEX1IZ_FW08H1t
|
|
148
172
|
tools/bad_pdf_eval/llm_enrich_with_retry.py,sha256=XUtPF1hUvqd3frDXT0wDTXoonuAivhjM5vgFdZ-tm0A,9373
|
149
173
|
tools/bad_pdf_eval/reporter.py,sha256=e1g__mkSB4q02p3mGWOwMhvFs7F2HJosNBxup0-LkyU,400
|
150
174
|
tools/bad_pdf_eval/utils.py,sha256=hR95XQ7qf7Cu6BdyX0L7ggGVx-ah5sK0jHWblTJUUic,4896
|
151
|
-
natural_pdf-0.2.
|
152
|
-
natural_pdf-0.2.
|
153
|
-
natural_pdf-0.2.
|
154
|
-
natural_pdf-0.2.
|
155
|
-
natural_pdf-0.2.
|
175
|
+
natural_pdf-0.2.19.dist-info/METADATA,sha256=vtMsWwMW9cR2LdQhdDFhDG4WWIkctrT7_3P7klvyJ-8,6960
|
176
|
+
natural_pdf-0.2.19.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
177
|
+
natural_pdf-0.2.19.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
178
|
+
natural_pdf-0.2.19.dist-info/top_level.txt,sha256=ZDKhxE_tg508o9BpagsjCGcI8GY4cF_8bg0e0IaLsPI,41
|
179
|
+
natural_pdf-0.2.19.dist-info/RECORD,,
|
temp/check_model.py
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
from ultralytics import RTDETR
|
2
|
+
import os
|
3
|
+
|
4
|
+
model_path = "/Users/soma/Development/natural-pdf/model-weights/checkbox-nano.pt"
|
5
|
+
print(f"Model exists: {os.path.exists(model_path)}")
|
6
|
+
|
7
|
+
try:
|
8
|
+
model = RTDETR(model_path)
|
9
|
+
print(f"Model loaded successfully")
|
10
|
+
print(f"Model names: {model.names}")
|
11
|
+
print(f"Model task: {model.task}")
|
12
|
+
|
13
|
+
# Try to get architecture info
|
14
|
+
if hasattr(model.model, 'yaml'):
|
15
|
+
print(f"Model yaml: {model.model.yaml}")
|
16
|
+
|
17
|
+
# Check the model structure
|
18
|
+
if hasattr(model.model, 'model'):
|
19
|
+
for i, module in enumerate(model.model.model):
|
20
|
+
print(f"Layer {i}: {module}")
|
21
|
+
if i > 5: # Just show first few layers
|
22
|
+
break
|
23
|
+
|
24
|
+
except Exception as e:
|
25
|
+
print(f"Error: {e}")
|
26
|
+
|
27
|
+
# Try loading as generic model to inspect
|
28
|
+
import torch
|
29
|
+
try:
|
30
|
+
checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
|
31
|
+
print(f"\nCheckpoint keys: {list(checkpoint.keys())}")
|
32
|
+
|
33
|
+
# Check for model configuration
|
34
|
+
if 'model' in checkpoint and hasattr(checkpoint['model'], 'yaml'):
|
35
|
+
print(f"Model yaml: {checkpoint['model'].yaml}")
|
36
|
+
|
37
|
+
# Check train args for model info
|
38
|
+
if 'train_args' in checkpoint:
|
39
|
+
args = checkpoint['train_args']
|
40
|
+
print(f"\nTraining args:")
|
41
|
+
print(f" Model: {getattr(args, 'model', 'Unknown')}")
|
42
|
+
print(f" Task: {getattr(args, 'task', 'Unknown')}")
|
43
|
+
|
44
|
+
# Check epoch info
|
45
|
+
if 'epoch' in checkpoint:
|
46
|
+
print(f" Epochs trained: {checkpoint['epoch']}")
|
47
|
+
|
48
|
+
except Exception as e2:
|
49
|
+
print(f"Error loading checkpoint: {e2}")
|