natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,11 @@
|
|
1
|
+
bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
|
2
|
+
bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
|
3
|
+
bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
|
4
|
+
bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
|
1
5
|
natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
|
2
6
|
natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
|
3
7
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
4
|
-
natural_pdf/analyzers/shape_detection_mixin.py,sha256=
|
8
|
+
natural_pdf/analyzers/shape_detection_mixin.py,sha256=0a4uuKQ4Z1Ta_UVuUtX7mVhlwmXdAkoHTyC5wZyp5do,94455
|
5
9
|
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
6
10
|
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
7
11
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
@@ -9,37 +13,38 @@ natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTL
|
|
9
13
|
natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
|
10
14
|
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
11
15
|
natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
|
12
|
-
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=
|
13
|
-
natural_pdf/analyzers/layout/layout_manager.py,sha256=
|
16
|
+
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=1v23FVCIGzkoiyRqiLZBwGZssBFKphtMossMENMuMxE,15519
|
17
|
+
natural_pdf/analyzers/layout/layout_manager.py,sha256=vDXBAaNwvp68CRcEPH58MGLxx01OdVgzOh7Uv53L6fs,10319
|
14
18
|
natural_pdf/analyzers/layout/layout_options.py,sha256=-Nv6bcu4_pqSCN6uNhCZ9mvoCBtRDZIUkO6kjkuLXsg,7703
|
15
19
|
natural_pdf/analyzers/layout/paddle.py,sha256=tX2bI1yayAdmRhvsfZ_Ygs7zAG5e9eW-pLJkw4NUpBQ,21325
|
16
20
|
natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
|
17
|
-
natural_pdf/analyzers/layout/surya.py,sha256=
|
21
|
+
natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
|
18
22
|
natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
|
19
23
|
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
20
|
-
natural_pdf/analyzers/layout/yolo.py,sha256=
|
21
|
-
natural_pdf/classification/manager.py,sha256=
|
24
|
+
natural_pdf/analyzers/layout/yolo.py,sha256=ruchj28sxar0DWDALwUz1j30z0CLIEp2QAs0gLVvC4E,8346
|
25
|
+
natural_pdf/classification/manager.py,sha256=pOP2LvJpTBGItvdIODnk735DXq7F2qqxN4AKmBORM3c,21775
|
22
26
|
natural_pdf/classification/mixin.py,sha256=nYpmHQ4BlrealdPtIJt-_idME5o-xKLKNuAdIHzWL6c,7580
|
23
27
|
natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZHWdWJzmsU,3239
|
24
28
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
25
29
|
natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
|
26
30
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
27
|
-
natural_pdf/core/element_manager.py,sha256=
|
28
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
29
|
-
natural_pdf/core/page.py,sha256=
|
30
|
-
natural_pdf/core/pdf.py,sha256=
|
31
|
+
natural_pdf/core/element_manager.py,sha256=96v_w3kXhSUqRsJlX5Bl6O6hJzpYRqDn4xoyRsdqZ7o,49260
|
32
|
+
natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
|
33
|
+
natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
|
34
|
+
natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
|
31
35
|
natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
|
32
|
-
natural_pdf/describe/base.py,sha256=
|
33
|
-
natural_pdf/describe/elements.py,sha256=
|
36
|
+
natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
|
37
|
+
natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
|
34
38
|
natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
|
35
39
|
natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
|
36
40
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
37
|
-
natural_pdf/elements/base.py,sha256=
|
38
|
-
natural_pdf/elements/collections.py,sha256=
|
41
|
+
natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
|
42
|
+
natural_pdf/elements/collections.py,sha256=52Oac96svzm_QMJcVaItnCG9P7d6JMNiGEx9lHgDEQg,125915
|
43
|
+
natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
|
39
44
|
natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
|
40
45
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
41
|
-
natural_pdf/elements/region.py,sha256=
|
42
|
-
natural_pdf/elements/text.py,sha256=
|
46
|
+
natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
|
47
|
+
natural_pdf/elements/text.py,sha256=2neapKplef0FsAMYWr4OdICt-TmrZ3z9z0YBrX8FrSk,17738
|
43
48
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
44
49
|
natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
|
45
50
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
@@ -55,18 +60,18 @@ natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6
|
|
55
60
|
natural_pdf/extraction/mixin.py,sha256=_5wGnzOCEuRWhqdSUV1Lqo9HIi56YC4MWzbBxOkOEKU,23160
|
56
61
|
natural_pdf/extraction/result.py,sha256=D5DhjxLW7IvhEkvsAP7Zs2YA8K4hyuoTg681CSn5qA0,1825
|
57
62
|
natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
|
58
|
-
natural_pdf/flows/collections.py,sha256=
|
63
|
+
natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
|
59
64
|
natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
|
60
65
|
natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
|
61
|
-
natural_pdf/flows/region.py,sha256=
|
66
|
+
natural_pdf/flows/region.py,sha256=s_YAT_0KsrwUs73hhU9xr_35Ufr__XNhRjHSQkxcfYU,27647
|
62
67
|
natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
|
63
68
|
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
64
69
|
natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
|
65
70
|
natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdFUtzo,9888
|
66
|
-
natural_pdf/ocr/engine_paddle.py,sha256=
|
71
|
+
natural_pdf/ocr/engine_paddle.py,sha256=9tQZl1VqN6d_KEWUY_S9tfrDLiR4FCHMjgSRNwPlsu8,16152
|
67
72
|
natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
|
68
|
-
natural_pdf/ocr/ocr_factory.py,sha256=
|
69
|
-
natural_pdf/ocr/ocr_manager.py,sha256=
|
73
|
+
natural_pdf/ocr/ocr_factory.py,sha256=qjGL3hm_nTzxjwYWP0JE7dCFXZjKN8Z7f9c0oqasb9M,5262
|
74
|
+
natural_pdf/ocr/ocr_manager.py,sha256=jFJI8v3coapKpERoUlP-ptwguZG_Dl4VlclD0xQ6Us8,16192
|
70
75
|
natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
|
71
76
|
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
72
77
|
natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
|
@@ -79,9 +84,12 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
|
|
79
84
|
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
80
85
|
natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
|
81
86
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
82
|
-
natural_pdf/selectors/parser.py,sha256=
|
87
|
+
natural_pdf/selectors/parser.py,sha256=T9r7XZhM1cGSYQrc9amUHbFtX-zBqd9_YPK0scwCjAQ,34231
|
88
|
+
natural_pdf/tables/__init__.py,sha256=y65LM2wnu81yzvOX-J_5NXiIK4vEUtHa3EM1xv-0ttQ,105
|
89
|
+
natural_pdf/tables/result.py,sha256=OYc-MjnP-VRTVaY-pBt84E-d8N3AaqzwAud0hHt5sVY,3979
|
83
90
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
84
91
|
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
92
|
+
natural_pdf/utils/bidi_mirror.py,sha256=SAe5SnL-xG5Wyo3LtkMttLdsnQqZhzAebLc7BAe6LhQ,1150
|
85
93
|
natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
|
86
94
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
87
95
|
natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
|
@@ -89,12 +97,26 @@ natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,21
|
|
89
97
|
natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
|
90
98
|
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
91
99
|
natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
|
92
|
-
natural_pdf/utils/visualization.py,sha256=
|
100
|
+
natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
|
93
101
|
natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
|
94
102
|
natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
|
95
|
-
natural_pdf-0.1.
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
103
|
+
natural_pdf-0.1.30.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
104
|
+
optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
|
105
|
+
optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
|
106
|
+
optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
|
107
|
+
optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
|
108
|
+
optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
|
109
|
+
tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
|
110
|
+
tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
|
111
|
+
tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
|
112
|
+
tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
|
113
|
+
tools/bad_pdf_eval/eval_suite.py,sha256=-MK-XLqBo1025sccwYL6tnf7mZ1ZEpxu6EsTYv2ppmU,4294
|
114
|
+
tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrOw1q6ARMl-EazIU,1906
|
115
|
+
tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
|
116
|
+
tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
|
117
|
+
tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
|
118
|
+
natural_pdf-0.1.30.dist-info/METADATA,sha256=4Jg-iXXt6zGNE4gSYE_nMF395JDzv1Dierh93x1Lklo,6711
|
119
|
+
natural_pdf-0.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
120
|
+
natural_pdf-0.1.30.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
|
121
|
+
natural_pdf-0.1.30.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
|
122
|
+
natural_pdf-0.1.30.dist-info/RECORD,,
|
@@ -0,0 +1,172 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""
|
3
|
+
Memory comparison script to measure the effectiveness of the character duplication fix.
|
4
|
+
|
5
|
+
This script compares memory usage before and after the optimization by:
|
6
|
+
1. Testing with a text-heavy PDF
|
7
|
+
2. Measuring detailed memory usage patterns
|
8
|
+
3. Calculating memory savings
|
9
|
+
"""
|
10
|
+
|
11
|
+
import gc
|
12
|
+
import os
|
13
|
+
import psutil
|
14
|
+
import sys
|
15
|
+
from pathlib import Path
|
16
|
+
|
17
|
+
import natural_pdf as npdf
|
18
|
+
|
19
|
+
|
20
|
+
def get_detailed_memory_info():
    """Take a snapshot of the current process's memory footprint.

    Returns:
        dict: Three entries, in this order:
            ``rss_mb`` -- ``memory_info().rss`` divided by 1024 twice
            (bytes to MiB),
            ``vms_mb`` -- ``memory_info().vms`` converted the same way,
            ``python_objects`` -- number of objects returned by
            ``gc.get_objects()`` at the time of the call.
    """
    usage = psutil.Process().memory_info()

    # Compute every value before building the result so the returned dict
    # itself is not part of the object count.
    resident_mb = usage.rss / 1024 / 1024
    virtual_mb = usage.vms / 1024 / 1024
    tracked_objects = len(gc.get_objects())

    return {
        'rss_mb': resident_mb,
        'vms_mb': virtual_mb,
        'python_objects': tracked_objects,
    }
|
29
|
+
|
30
|
+
|
31
|
+
def analyze_character_storage(page):
    """Summarise how the text elements of ``page`` hold their characters.

    Each element returned by ``page.find_all("text")`` is inspected for two
    optional attributes: ``_char_indices`` and ``_char_dicts``. An element
    with a non-empty ``_char_indices`` counts as "memory efficient"; an
    element with a non-empty ``_char_dicts`` but no usable ``_char_indices``
    counts as "legacy".

    Args:
        page: Object exposing ``find_all("text")`` and, optionally, an
            ``_element_mgr`` with ``get_elements('chars')``.

    Returns:
        dict: Counters keyed by ``total_words``, ``memory_efficient_words``,
        ``legacy_words``, ``total_char_elements``, ``total_char_indices``,
        ``total_char_dicts``, ``total_chars_in_words`` and
        ``estimated_duplication_ratio`` (char dicts divided by the number of
        char elements, with the divisor clamped to at least 1).
    """
    # The text query doubles as the trigger for element loading.
    words = page.find_all("text")

    index_total = 0
    dict_total = 0
    referenced_total = 0
    indexed_words = 0
    legacy_word_count = 0

    for word in words:
        indices = getattr(word, '_char_indices', None)
        dicts = getattr(word, '_char_dicts', None)

        if indices:
            indexed_words += 1
            index_total += len(indices)
            referenced_total += len(indices)

        if not dicts:
            continue

        dict_total += len(dicts)
        # Only words without index data contribute their dicts to the
        # "characters referenced by words" total, so nothing is counted twice.
        if not indices:
            legacy_word_count += 1
            referenced_total += len(dicts)

    # Individual character elements live on the element manager, if present.
    if hasattr(page, '_element_mgr'):
        individual_chars = page._element_mgr.get_elements('chars')
    else:
        individual_chars = []
    char_count = len(individual_chars)

    summary = {
        'total_words': len(words),
        'memory_efficient_words': indexed_words,
        'legacy_words': legacy_word_count,
        'total_char_elements': char_count,
        'total_char_indices': index_total,
        'total_char_dicts': dict_total,
        'total_chars_in_words': referenced_total,
        'estimated_duplication_ratio': dict_total / max(char_count, 1),
    }
    return summary
|
69
|
+
|
70
|
+
|
71
|
+
def test_memory_optimization():
    """Print a memory and character-storage report for the practice PDF.

    Steps performed:
        1. Record a baseline memory snapshot after ``gc.collect()``.
        2. Open ``pdfs/01-practice.pdf`` and take its first page.
        3. Run ``analyze_character_storage`` on that page and snapshot
           memory again.
        4. Print storage counters, derived optimization metrics and a
           rough estimate of the memory saved.
        5. Print per-word details for the first three text elements.

    If the PDF is missing, a message is printed and the function returns
    immediately without doing any measurement.

    Returns:
        None. All results are written to standard output.
    """
    # Test with the practice PDF
    test_pdf = Path("pdfs/01-practice.pdf")
    if not test_pdf.exists():
        print(f"Test PDF not found: {test_pdf}")
        return

    print("=" * 60)
    print("MEMORY OPTIMIZATION ANALYSIS")
    print("=" * 60)

    # Baseline memory
    gc.collect()
    baseline_memory = get_detailed_memory_info()
    print(f"Baseline memory: {baseline_memory['rss_mb']:.2f} MB RSS, {baseline_memory['python_objects']:,} objects")

    # Load PDF
    pdf = npdf.PDF(str(test_pdf))
    page = pdf.pages[0]

    post_load_memory = get_detailed_memory_info()
    print(f"After PDF load: {post_load_memory['rss_mb']:.2f} MB RSS, {post_load_memory['python_objects']:,} objects")

    # Analyze character storage
    storage_analysis = analyze_character_storage(page)

    final_memory = get_detailed_memory_info()
    print(f"After element load: {final_memory['rss_mb']:.2f} MB RSS, {final_memory['python_objects']:,} objects")

    print("\n" + "=" * 40)
    print("CHARACTER STORAGE ANALYSIS")
    print("=" * 40)

    print(f"Total words: {storage_analysis['total_words']}")
    print(f"Memory-efficient words: {storage_analysis['memory_efficient_words']}")
    print(f"Legacy words: {storage_analysis['legacy_words']}")
    print(f"Total character elements: {storage_analysis['total_char_elements']}")
    print(f"Character indices used: {storage_analysis['total_char_indices']}")
    print(f"Character dicts stored: {storage_analysis['total_char_dicts']}")
    print(f"Characters referenced by words: {storage_analysis['total_chars_in_words']}")

    # Calculate optimization metrics
    duplication_ratio = storage_analysis['estimated_duplication_ratio']
    optimization_percentage = (
        storage_analysis['memory_efficient_words']
        / max(storage_analysis['total_words'], 1)
        * 100
    )

    print("\nOptimization metrics:")
    print(f"- Duplication ratio: {duplication_ratio:.2f}x")
    print(f"- Words using optimization: {optimization_percentage:.1f}%")

    # Memory savings estimation
    memory_used = final_memory['rss_mb'] - baseline_memory['rss_mb']
    chars_total = storage_analysis['total_char_elements']

    if chars_total > 0:
        memory_per_char = memory_used / chars_total * 1024  # KB per char
        print(f"- Memory per character: {memory_per_char:.2f} KB")

        # Estimate savings from eliminating _char_dicts duplication
        duplicated_chars = storage_analysis['total_char_dicts']
        if duplicated_chars > 0:
            estimated_wasted_memory = duplicated_chars * memory_per_char / 1024  # MB
            print(f"- Estimated memory saved by optimization: {estimated_wasted_memory:.2f} MB")
            # RSS may not grow (or may even shrink) between the two
            # snapshots; a ratio against a non-positive delta would divide
            # by zero or be meaningless, so report it as unavailable.
            if memory_used > 0:
                print(f"- Memory efficiency improvement: {estimated_wasted_memory / memory_used * 100:.1f}%")
            else:
                print("- Memory efficiency improvement: n/a (no measurable RSS growth)")

    print(f"\nTotal memory used for page processing: {memory_used:.2f} MB")

    # Test functionality
    print("\n" + "=" * 40)
    print("FUNCTIONALITY VERIFICATION")
    print("=" * 40)

    # Test character access
    test_elements = page.find_all("text")[:3]
    for i, element in enumerate(test_elements):
        # Guard against elements whose text is None so the preview cannot
        # fail on slicing / len().
        text = element.text or ""
        print(f"\nWord {i+1}: '{text[:30]}{'...' if len(text) > 30 else ''}'")

        if hasattr(element, '_char_indices') and element._char_indices:
            chars = element.chars
            print(f" - Uses character indices: {len(element._char_indices)} indices -> {len(chars)} chars")
            print(" - Memory optimization: ACTIVE")

            # Verify character access works
            if chars:
                first_char = chars[0]
                print(f" - First char: '{first_char.text}' at ({first_char.x0:.1f}, {first_char.top:.1f})")

        elif hasattr(element, '_char_dicts') and element._char_dicts:
            print(f" - Uses character dicts: {len(element._char_dicts)} dicts")
            print(" - Memory optimization: LEGACY MODE")

        else:
            print(" - No character data available")

    print("\n" + "=" * 60)
    print("✅ MEMORY OPTIMIZATION ANALYSIS COMPLETE")
    print("=" * 60)


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    test_memory_optimization()
|