natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -1,7 +1,11 @@
1
+ bad_pdf_analysis/analyze_10_more.py,sha256=UjsTuHE1GUMoVjkX3afy3x6DfpXyfZXHgS2W1GQqUmw,11906
2
+ bad_pdf_analysis/analyze_final_10.py,sha256=xYkIId0nF9LpWHRLDP1_nlJfJfC0b0Tu4mLu-3mim-0,25170
3
+ bad_pdf_analysis/analyze_specific_pages.py,sha256=wzq3_ZWR28hFdT7GEkayHPYgsk20OpD476LYmy2rAEk,13725
4
+ bad_pdf_analysis/analyze_specific_pages_direct.py,sha256=307gSNplwOtNTR9a0lEQWxlAKGeoZIcDe5z1pROKUXY,14846
1
5
  natural_pdf/__init__.py,sha256=qDFJNF8sbEDO-2WSFAxoWEM8updOUP6dB-ckya0kxfs,3275
2
6
  natural_pdf/cli.py,sha256=IXrP2lCHihr-ed-CFiDbMTnSsutQa1j1PYALOLGbpsc,4019
3
7
  natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
4
- natural_pdf/analyzers/shape_detection_mixin.py,sha256=aHn4EMdbwOe8VWECPceGs5wN7gJP_kIxyAbmbNlNPSs,83634
8
+ natural_pdf/analyzers/shape_detection_mixin.py,sha256=0a4uuKQ4Z1Ta_UVuUtX7mVhlwmXdAkoHTyC5wZyp5do,94455
5
9
  natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
6
10
  natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
7
11
  natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
@@ -9,37 +13,38 @@ natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTL
9
13
  natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
10
14
  natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
11
15
  natural_pdf/analyzers/layout/gemini.py,sha256=ldECVCQ5HNQA3Omjg2NOsTrJXslyYb0vErDncmLIiuE,10510
12
- natural_pdf/analyzers/layout/layout_analyzer.py,sha256=n327Zjuf7aSzKQKChPHeiCVHinzeDGaWNyKiwQ-DkJk,15571
13
- natural_pdf/analyzers/layout/layout_manager.py,sha256=i887PY1vdlx7Hnkyf7JSAPAbnozT9hGK_RmYAoCRpyA,8576
16
+ natural_pdf/analyzers/layout/layout_analyzer.py,sha256=1v23FVCIGzkoiyRqiLZBwGZssBFKphtMossMENMuMxE,15519
17
+ natural_pdf/analyzers/layout/layout_manager.py,sha256=vDXBAaNwvp68CRcEPH58MGLxx01OdVgzOh7Uv53L6fs,10319
14
18
  natural_pdf/analyzers/layout/layout_options.py,sha256=-Nv6bcu4_pqSCN6uNhCZ9mvoCBtRDZIUkO6kjkuLXsg,7703
15
19
  natural_pdf/analyzers/layout/paddle.py,sha256=tX2bI1yayAdmRhvsfZ_Ygs7zAG5e9eW-pLJkw4NUpBQ,21325
16
20
  natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh6RoezjdepTnMl90SaNIrP29Pwc,5902
17
- natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
21
+ natural_pdf/analyzers/layout/surya.py,sha256=ugRXPIHiLoh65lfbbiXO317TbgdtQ-5kVN1nonEf4ws,9778
18
22
  natural_pdf/analyzers/layout/table_structure_utils.py,sha256=nISZDBd46RPYkFHxbQyIHwg9WweG4DslpoYJ31OMJYA,2768
19
23
  natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
20
24
  natural_pdf/analyzers/layout/yolo.py,sha256=ruchj28sxar0DWDALwUz1j30z0CLIEp2QAs0gLVvC4E,8346
21
- natural_pdf/classification/manager.py,sha256=7HOyHdjMJtC9DfzI8OXAREnGDpXaAgSfTFVC42n3tVQ,18889
25
+ natural_pdf/classification/manager.py,sha256=pOP2LvJpTBGItvdIODnk735DXq7F2qqxN4AKmBORM3c,21775
22
26
  natural_pdf/classification/mixin.py,sha256=nYpmHQ4BlrealdPtIJt-_idME5o-xKLKNuAdIHzWL6c,7580
23
27
  natural_pdf/classification/results.py,sha256=Mcay-xLBHbYoZ8U7f4gMj2IhhH_yORNEkZHWdWJzmsU,3239
24
28
  natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
25
29
  natural_pdf/collections/pdf_collection.py,sha256=HLlyakM--23ZOeHDPucoM6Tw3yUyMXm0SSoqJwxRc2E,30744
26
30
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
27
- natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
28
- natural_pdf/core/highlighting_service.py,sha256=DKoaxiiuQsWgtf6wSroMAIcFiqJOOF7dXhciYdQKdCw,38223
29
- natural_pdf/core/page.py,sha256=GqYfYiVkuL1M_GoPTcLL0yWFXISN38BUCdQIKyF6vJ0,122721
30
- natural_pdf/core/pdf.py,sha256=qsSW4RxOJRmCnweLPMs0NhzkRfiAVdghTgnh4D_wuO4,74295
31
+ natural_pdf/core/element_manager.py,sha256=96v_w3kXhSUqRsJlX5Bl6O6hJzpYRqDn4xoyRsdqZ7o,49260
32
+ natural_pdf/core/highlighting_service.py,sha256=WKDqRpex1yS8CWhkNitWtKhxbyRRCLu3Xsct_HTPsD4,40774
33
+ natural_pdf/core/page.py,sha256=kQKKqsbOaNeLhW3ark6mueDS-4tsopJcGcoMmKPK6B8,125624
34
+ natural_pdf/core/pdf.py,sha256=YfniZp54AyptzMyr7ZP8n617n4wlV28SPrajt32nNBk,74233
31
35
  natural_pdf/describe/__init__.py,sha256=B3zjuHjFI_dFuBLgXR1Q4v7c72fVDyk84d2hs0H4KV8,561
32
- natural_pdf/describe/base.py,sha256=mUvEydumXXPJ2FkWAYm1BbWrRWY81I0dMyQrEU32rmc,17256
33
- natural_pdf/describe/elements.py,sha256=xD8wwR1z5IKat7RIwoAwQRUEL6zJTEwcOKorF4F-xPg,12717
36
+ natural_pdf/describe/base.py,sha256=HaWlHltb-dw6ug4mfR_iBLHWxr1OdPwLaUshXRxO7gg,18462
37
+ natural_pdf/describe/elements.py,sha256=COvKF3B_RbAxXl5ORJDubV4C5PsiuSfuzD0ufPIJTFM,12983
34
38
  natural_pdf/describe/mixin.py,sha256=U0x6v8r57KQb8qC3VVo64hvhfXQWsti8vdKBM7AXnMo,3116
35
39
  natural_pdf/describe/summary.py,sha256=7FIF3zF6bzNx-gx4pCJr2XQFKiVzOEDnWsAYQ_mr9L0,7982
36
40
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
37
- natural_pdf/elements/base.py,sha256=iw-Ab0o7eI69npt0gAxQvA14GPWHAAhkLrJ_JeKvIos,43309
38
- natural_pdf/elements/collections.py,sha256=JrM42VPRtDOJ9Q9KIR3SrcbamiiCHXI4nzTq2BBkeEk,124223
41
+ natural_pdf/elements/base.py,sha256=VshU4RstdzONJFq_8UVIjT_lVOai0MwMFsSFrCN-IO8,47299
42
+ natural_pdf/elements/collections.py,sha256=52Oac96svzm_QMJcVaItnCG9P7d6JMNiGEx9lHgDEQg,125915
43
+ natural_pdf/elements/image.py,sha256=UjHNzCgDzOseQmLpkKshcxg51DPmWNIAVYxZ0TAMyUI,1423
39
44
  natural_pdf/elements/line.py,sha256=aQm4pDdlQSDAAXqrdg4AU-oTl9JCXgYuaJN0EYls6E0,4920
40
45
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
41
- natural_pdf/elements/region.py,sha256=63rdyjOnbmsgTN1WMSOyQWQnvJRUYco9qTWLqBi3TBk,125498
42
- natural_pdf/elements/text.py,sha256=x163dnr2ZDEIE_WZXWH5hXJtoO-6cvTdA2BABcZd69U,14575
46
+ natural_pdf/elements/region.py,sha256=v1PzWvQoGHGdn7SQiPf4Oq3hIGueIfYGwcZ05ZU6XPE,127692
47
+ natural_pdf/elements/text.py,sha256=2neapKplef0FsAMYWr4OdICt-TmrZ3z9z0YBrX8FrSk,17738
43
48
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
44
49
  natural_pdf/exporters/__init__.py,sha256=g1WRPCDVzceaUUsm8dchPhzdHFSjYM0NfRyc8iN0mtE,644
45
50
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
@@ -55,10 +60,10 @@ natural_pdf/extraction/manager.py,sha256=sASPJZ5cWFsl8A4PyTjg2yqkyC00tRl6glfoFA6
55
60
  natural_pdf/extraction/mixin.py,sha256=_5wGnzOCEuRWhqdSUV1Lqo9HIi56YC4MWzbBxOkOEKU,23160
56
61
  natural_pdf/extraction/result.py,sha256=D5DhjxLW7IvhEkvsAP7Zs2YA8K4hyuoTg681CSn5qA0,1825
57
62
  natural_pdf/flows/__init__.py,sha256=cUN4A8hTDLZSRr4PO2W_lR4z6hWpbNG8Seox-IIcrLU,277
58
- natural_pdf/flows/collections.py,sha256=qGuSPFSPQF-wiYquG6STiSzg_o951MSsFEq_B44Jef8,28441
63
+ natural_pdf/flows/collections.py,sha256=iF8SsfKKb-YVIGi3m-yMRnfKgo_0n_EGhojnYK24h-Q,28493
59
64
  natural_pdf/flows/element.py,sha256=mKzk3B7A7sWNvu4CDvAjLr3_ZFLt--ktrSNoLfLpFxU,23940
60
65
  natural_pdf/flows/flow.py,sha256=I61BpFVDQyo6ORsmoqoYiOEP1DBRp0vgDJjm_V8frhc,10562
61
- natural_pdf/flows/region.py,sha256=4U3S7pLEa3oCyPfS-hpD0lSXf8MWT-MdF9AsVvMJbWU,26670
66
+ natural_pdf/flows/region.py,sha256=s_YAT_0KsrwUs73hhU9xr_35Ufr__XNhRjHSQkxcfYU,27647
62
67
  natural_pdf/ocr/__init__.py,sha256=VY8hhvDPf7Gh2lB-d2QRmghLLyTy6ydxlgo1cS4dOSk,2482
63
68
  natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
64
69
  natural_pdf/ocr/engine_doctr.py,sha256=ptKrupMWoulZb-R93zr9btoe94JPWU7vlJuN7OBJEIM,17740
@@ -66,7 +71,7 @@ natural_pdf/ocr/engine_easyocr.py,sha256=bWz6kHUgAJfe3rqdnZBAF-IPvw3B35DlvX5KDdF
66
71
  natural_pdf/ocr/engine_paddle.py,sha256=9tQZl1VqN6d_KEWUY_S9tfrDLiR4FCHMjgSRNwPlsu8,16152
67
72
  natural_pdf/ocr/engine_surya.py,sha256=lOvSbZk53VKFVxRmqcQzM_0dHVdwTkRGiDZ9AWCgL1Q,5951
68
73
  natural_pdf/ocr/ocr_factory.py,sha256=qjGL3hm_nTzxjwYWP0JE7dCFXZjKN8Z7f9c0oqasb9M,5262
69
- natural_pdf/ocr/ocr_manager.py,sha256=bLR2IowLKWaSrgR86SfOEbmcbn-DZUEXK9-fl5PiNM8,14256
74
+ natural_pdf/ocr/ocr_manager.py,sha256=jFJI8v3coapKpERoUlP-ptwguZG_Dl4VlclD0xQ6Us8,16192
70
75
  natural_pdf/ocr/ocr_options.py,sha256=l33QKu_93r-uwi3t_v8UH8pEgHo6HTVzP4tfmQFRF1w,5488
71
76
  natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
72
77
  natural_pdf/qa/__init__.py,sha256=2u2KJcA71g1I0HnLD-j6yvDw1moAjo9kkLhhfoYRURM,166
@@ -79,9 +84,12 @@ natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzP
79
84
  natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
80
85
  natural_pdf/search/searchable_mixin.py,sha256=hqQ_AuID5eTGRCtKYdFLZ1zF35y73uk3x1M1VW9Il8U,23514
81
86
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
82
- natural_pdf/selectors/parser.py,sha256=nJJPACzFKBMomAZMXzI_L3EMz9ji1agkmxJ5eaij310,32880
87
+ natural_pdf/selectors/parser.py,sha256=T9r7XZhM1cGSYQrc9amUHbFtX-zBqd9_YPK0scwCjAQ,34231
88
+ natural_pdf/tables/__init__.py,sha256=y65LM2wnu81yzvOX-J_5NXiIK4vEUtHa3EM1xv-0ttQ,105
89
+ natural_pdf/tables/result.py,sha256=OYc-MjnP-VRTVaY-pBt84E-d8N3AaqzwAud0hHt5sVY,3979
83
90
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
84
91
  natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
92
+ natural_pdf/utils/bidi_mirror.py,sha256=SAe5SnL-xG5Wyo3LtkMttLdsnQqZhzAebLc7BAe6LhQ,1150
85
93
  natural_pdf/utils/debug.py,sha256=RN7H3E6ph-GtxubCW6psW7TO8o2BxcNLiEzByTVR9fk,995
86
94
  natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
87
95
  natural_pdf/utils/identifiers.py,sha256=P7n6owcubnF8oAMa_UfYtENmIaJQdH_AMC9Jbs2bWXo,1117
@@ -89,12 +97,26 @@ natural_pdf/utils/locks.py,sha256=7HJqV0VsNcOfISnbw8goCKWP5ck11uSJo6T_x9XIPKI,21
89
97
  natural_pdf/utils/packaging.py,sha256=e7U2wWvpunlAWpPFexNkD_c4dYbPp5LcKo7og4bNGvk,22411
90
98
  natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
91
99
  natural_pdf/utils/text_extraction.py,sha256=mDeN1_VevNi3RwvFe48PM5vBh-A5WeBlYgP6lSjBaOk,10854
92
- natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
100
+ natural_pdf/utils/visualization.py,sha256=n3IZpbY5cf9LItzGavBcNyVZZrrUVxjYnmqZHYPa7NU,9386
93
101
  natural_pdf/widgets/__init__.py,sha256=QTVaUmsw__FCweFYZebwPssQxxUFUMd0wpm_cUbGZJY,181
94
102
  natural_pdf/widgets/viewer.py,sha256=2VUY1TzWMDe9I-IVNOosKZ2LaqpjLB62ftMAdk-s6_8,24952
95
- natural_pdf-0.1.28.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
96
- natural_pdf-0.1.28.dist-info/METADATA,sha256=7zn_ijkJlLhA7CwrsRBT2FememV4ZLxne-FbNDBzV84,6684
97
- natural_pdf-0.1.28.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
98
- natural_pdf-0.1.28.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
99
- natural_pdf-0.1.28.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
100
- natural_pdf-0.1.28.dist-info/RECORD,,
103
+ natural_pdf-0.1.30.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
104
+ optimization/memory_comparison.py,sha256=XEHtjduSmzXzxnsJMvemTcq-OAlvGUBAm5wwnOXq8TY,6524
105
+ optimization/pdf_analyzer.py,sha256=G3XWhsEqIYbohEgTqz6wzxkAnOx4MkbvbSspx577-8w,19145
106
+ optimization/performance_analysis.py,sha256=vVlFDywEXxhJLd9n2KVVqqQnS6rwWoHV_jlogboGF2k,13784
107
+ optimization/test_cleanup_methods.py,sha256=B_zHiJr1hI8q-tdfBoFi0Jf5lj2PURjA_6teRBGoz8o,6277
108
+ optimization/test_memory_fix.py,sha256=CWc0OSvFfKE0-nxqJOi_HAQc0GXUPKzkQbTeJp5UqxU,6364
109
+ tools/rtl_smoke_test.py,sha256=-ogcbvNzumJasICP0NNQHk4Zb4M1VRx0TnGkJUQC7SM,3043
110
+ tools/bad_pdf_eval/__init__.py,sha256=Nqnn8clbgv-5l0PgxcTOldg8mkMKrFn4TvPL-rYUUGg,1
111
+ tools/bad_pdf_eval/analyser.py,sha256=sR31aVVmTXRHS8uwLZXlPefTH2_lskxtAzuZwlhsyOo,13391
112
+ tools/bad_pdf_eval/collate_summaries.py,sha256=Mcmf1OvVn0S0efj5ypk0syXKSrfUf6L5dowoGvOTgjU,5047
113
+ tools/bad_pdf_eval/eval_suite.py,sha256=-MK-XLqBo1025sccwYL6tnf7mZ1ZEpxu6EsTYv2ppmU,4294
114
+ tools/bad_pdf_eval/export_enrichment_csv.py,sha256=SMEm9WxFUN_RIf8AGfZfjGEmvBvrOw1q6ARMl-EazIU,1906
115
+ tools/bad_pdf_eval/llm_enrich.py,sha256=PsFMymPc8BNck21T3vupTN18pLdum-A_OLoJEKr6f80,12234
116
+ tools/bad_pdf_eval/reporter.py,sha256=LIhcguDZ5XKgb0WeJsyA7m0kcliebOohzveShvt_KmY,400
117
+ tools/bad_pdf_eval/utils.py,sha256=FuxaPX6f26IjQXu1vP0a2i9h1jgJNbASb8mRyj5-elE,4849
118
+ natural_pdf-0.1.30.dist-info/METADATA,sha256=4Jg-iXXt6zGNE4gSYE_nMF395JDzv1Dierh93x1Lklo,6711
119
+ natural_pdf-0.1.30.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
120
+ natural_pdf-0.1.30.dist-info/entry_points.txt,sha256=1R_KMv7g60UBBpRqGfw7bppsMNGdayR-iJlb9ohEk_8,81
121
+ natural_pdf-0.1.30.dist-info/top_level.txt,sha256=oZlRzSc3nZ9sV3L6kD_Di734Pp62ANrm46imFVa51qQ,58
122
+ natural_pdf-0.1.30.dist-info/RECORD,,
@@ -0,0 +1,6 @@
1
+ bad_pdf_analysis
2
+ natural_pdf
3
+ optimization
4
+ pdfs
5
+ todo
6
+ tools
@@ -0,0 +1,172 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Memory comparison script to measure the effectiveness of the character duplication fix.
4
+
5
+ This script compares memory usage before and after the optimization by:
6
+ 1. Testing with a text-heavy PDF
7
+ 2. Measuring detailed memory usage patterns
8
+ 3. Calculating memory savings
9
+ """
10
+
11
+ import gc
12
+ import os
13
+ import psutil
14
+ import sys
15
+ from pathlib import Path
16
+
17
+ import natural_pdf as npdf
18
+
19
+
20
def get_detailed_memory_info():
    """Take a snapshot of the current process's memory footprint.

    Returns:
        dict: three entries --
            ``rss_mb``: the ``rss`` figure reported by ``psutil`` divided by 1024 twice,
            ``vms_mb``: the ``vms`` figure reported by ``psutil`` divided by 1024 twice,
            ``python_objects``: how many objects ``gc.get_objects()`` returns.
    """
    current_process = psutil.Process()
    usage = current_process.memory_info()

    # Two successive divisions by 1024: bytes -> KiB -> MiB.
    resident_mb = usage.rss / 1024 / 1024
    virtual_mb = usage.vms / 1024 / 1024

    # Counted last, after both size figures have been read.
    live_object_count = len(gc.get_objects())

    return {
        'rss_mb': resident_mb,
        'vms_mb': virtual_mb,
        'python_objects': live_object_count,
    }
29
+
30
+
31
def analyze_character_storage(page):
    """Summarise how the text elements of *page* hold their character data.

    Every element returned by ``page.find_all("text")`` is inspected for two
    optional attributes:

    * ``_char_indices`` -- when present and non-empty the element is counted as
      a "memory-efficient" word.
    * ``_char_dicts`` -- when present and non-empty its length is added to the
      stored-dict total; the element is counted as a "legacy" word only if it
      has no usable ``_char_indices``.

    Args:
        page: Object exposing ``find_all(selector)`` and, optionally, an
            ``_element_mgr`` attribute with ``get_elements('chars')``.

    Returns:
        dict: counters keyed by ``total_words``, ``memory_efficient_words``,
        ``legacy_words``, ``total_char_elements``, ``total_char_indices``,
        ``total_char_dicts``, ``total_chars_in_words`` and
        ``estimated_duplication_ratio`` (stored char dicts divided by the
        number of character elements, with the divisor clamped to at least 1).
    """
    # Force element loading
    text_elements = page.find_all("text")

    total_char_indices = 0
    total_char_dicts = 0
    total_chars_in_words = 0
    memory_efficient_words = 0
    legacy_words = 0

    for element in text_elements:
        # Read each optional attribute exactly once; a missing attribute and
        # an empty container are treated the same way.
        char_indices = getattr(element, '_char_indices', None)
        char_dicts = getattr(element, '_char_dicts', None)

        if char_indices:
            memory_efficient_words += 1
            total_char_indices += len(char_indices)
            total_chars_in_words += len(char_indices)

        if char_dicts:
            total_char_dicts += len(char_dicts)
            # Only words without index-based storage count as legacy, so that
            # their characters are not added to the word total twice.
            if not char_indices:
                legacy_words += 1
                total_chars_in_words += len(char_dicts)

    # Get individual character elements; a page with no (or a None) element
    # manager simply contributes zero character elements.
    element_mgr = getattr(page, '_element_mgr', None)
    char_elements = element_mgr.get_elements('chars') if element_mgr is not None else []

    return {
        'total_words': len(text_elements),
        'memory_efficient_words': memory_efficient_words,
        'legacy_words': legacy_words,
        'total_char_elements': len(char_elements),
        'total_char_indices': total_char_indices,
        'total_char_dicts': total_char_dicts,
        'total_chars_in_words': total_chars_in_words,
        # max(..., 1) keeps the ratio defined when there are no char elements.
        'estimated_duplication_ratio': total_char_dicts / max(len(char_elements), 1),
    }
69
+
70
+
71
def _print_banner(title, width, *, leading_blank=True):
    """Print *title* framed above and below by a rule of ``=`` *width* wide."""
    rule = "=" * width
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def test_memory_optimization():
    """Print a memory and character-storage report for ``pdfs/01-practice.pdf``.

    The function prints a message and returns early when the PDF is missing.
    Otherwise it records memory snapshots (baseline, after opening the PDF,
    after loading the first page's text elements), prints the counters from
    ``analyze_character_storage``, derives a few ratios from them, and finally
    prints character details for the first three text elements of the page.

    Returns:
        None. All results are written to standard output.
    """
    # Test with the practice PDF
    test_pdf = Path("pdfs/01-practice.pdf")
    if not test_pdf.exists():
        print(f"Test PDF not found: {test_pdf}")
        return

    _print_banner("MEMORY OPTIMIZATION ANALYSIS", 60, leading_blank=False)

    # Baseline memory (collect first so garbage does not inflate the figure)
    gc.collect()
    baseline_memory = get_detailed_memory_info()
    print(f"Baseline memory: {baseline_memory['rss_mb']:.2f} MB RSS, {baseline_memory['python_objects']:,} objects")

    # Load PDF
    pdf = npdf.PDF(str(test_pdf))
    page = pdf.pages[0]

    post_load_memory = get_detailed_memory_info()
    print(f"After PDF load: {post_load_memory['rss_mb']:.2f} MB RSS, {post_load_memory['python_objects']:,} objects")

    # Analyze character storage
    storage_analysis = analyze_character_storage(page)

    final_memory = get_detailed_memory_info()
    print(f"After element load: {final_memory['rss_mb']:.2f} MB RSS, {final_memory['python_objects']:,} objects")

    _print_banner("CHARACTER STORAGE ANALYSIS", 40)

    print(f"Total words: {storage_analysis['total_words']}")
    print(f"Memory-efficient words: {storage_analysis['memory_efficient_words']}")
    print(f"Legacy words: {storage_analysis['legacy_words']}")
    print(f"Total character elements: {storage_analysis['total_char_elements']}")
    print(f"Character indices used: {storage_analysis['total_char_indices']}")
    print(f"Character dicts stored: {storage_analysis['total_char_dicts']}")
    print(f"Characters referenced by words: {storage_analysis['total_chars_in_words']}")

    # Calculate optimization metrics
    duplication_ratio = storage_analysis['estimated_duplication_ratio']
    optimization_percentage = storage_analysis['memory_efficient_words'] / max(storage_analysis['total_words'], 1) * 100

    print("\nOptimization metrics:")
    print(f"- Duplication ratio: {duplication_ratio:.2f}x")
    print(f"- Words using optimization: {optimization_percentage:.1f}%")

    # Memory savings estimation
    memory_used = final_memory['rss_mb'] - baseline_memory['rss_mb']
    chars_total = storage_analysis['total_char_elements']

    if chars_total > 0:
        memory_per_char = memory_used / chars_total * 1024  # KB per char
        print(f"- Memory per character: {memory_per_char:.2f} KB")

        # Estimate savings from eliminating _char_dicts duplication
        duplicated_chars = storage_analysis['total_char_dicts']
        if duplicated_chars > 0:
            estimated_wasted_memory = duplicated_chars * memory_per_char / 1024  # MB
            print(f"- Estimated memory saved by optimization: {estimated_wasted_memory:.2f} MB")

            # RSS can be unchanged between the two snapshots, which made the
            # original ratio 0 / 0.  The ratio reduces algebraically to
            # duplicated_chars / chars_total, so use that form in that case.
            if memory_used:
                improvement = estimated_wasted_memory / memory_used * 100
            else:
                improvement = duplicated_chars / chars_total * 100
            print(f"- Memory efficiency improvement: {improvement:.1f}%")

    print(f"\nTotal memory used for page processing: {memory_used:.2f} MB")

    # Test functionality
    _print_banner("FUNCTIONALITY VERIFICATION", 40)

    # Test character access on the first three text elements
    test_elements = page.find_all("text")[:3]
    for i, element in enumerate(test_elements):
        text = element.text
        print(f"\nWord {i+1}: '{text[:30]}{'...' if len(text) > 30 else ''}'")

        char_indices = getattr(element, '_char_indices', None)
        char_dicts = getattr(element, '_char_dicts', None)

        if char_indices:
            chars = element.chars
            print(f" - Uses character indices: {len(char_indices)} indices -> {len(chars)} chars")
            print(" - Memory optimization: ACTIVE")

            # Verify character access works
            if chars:
                first_char = chars[0]
                print(f" - First char: '{first_char.text}' at ({first_char.x0:.1f}, {first_char.top:.1f})")

        elif char_dicts:
            print(f" - Uses character dicts: {len(char_dicts)} dicts")
            print(" - Memory optimization: LEGACY MODE")

        else:
            print(" - No character data available")

    _print_banner("✅ MEMORY OPTIMIZATION ANALYSIS COMPLETE", 60)


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    test_memory_optimization()