natural-pdf 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docs/finetuning/index.md +176 -0
- docs/tutorials/01-loading-and-extraction.ipynb +34 -1550
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/gemini.py +63 -47
- natural_pdf/collections/pdf_collection.py +5 -2
- natural_pdf/core/element_manager.py +6 -4
- natural_pdf/core/page.py +36 -27
- natural_pdf/core/pdf.py +25 -16
- natural_pdf/elements/base.py +1 -3
- natural_pdf/elements/collections.py +13 -14
- natural_pdf/elements/region.py +7 -6
- natural_pdf/exporters/__init__.py +4 -0
- natural_pdf/exporters/base.py +61 -0
- natural_pdf/exporters/paddleocr.py +345 -0
- natural_pdf/ocr/__init__.py +16 -8
- natural_pdf/ocr/engine.py +46 -30
- natural_pdf/ocr/engine_easyocr.py +81 -40
- natural_pdf/ocr/engine_paddle.py +39 -28
- natural_pdf/ocr/engine_surya.py +32 -16
- natural_pdf/ocr/ocr_factory.py +34 -23
- natural_pdf/ocr/ocr_manager.py +15 -11
- natural_pdf/ocr/ocr_options.py +5 -0
- natural_pdf/ocr/utils.py +46 -31
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +415 -0
- natural_pdf/utils/debug.py +4 -2
- natural_pdf/utils/identifiers.py +9 -5
- natural_pdf/utils/packaging.py +172 -105
- natural_pdf/utils/text_extraction.py +44 -64
- natural_pdf/utils/visualization.py +1 -1
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/METADATA +5 -3
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/RECORD +34 -30
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.6.dist-info → natural_pdf-0.1.7.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.7
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -23,7 +23,6 @@ Provides-Extra: haystack
|
|
23
23
|
Requires-Dist: haystack-ai; extra == "haystack"
|
24
24
|
Requires-Dist: chroma-haystack; extra == "haystack"
|
25
25
|
Requires-Dist: sentence-transformers; extra == "haystack"
|
26
|
-
Requires-Dist: protobuf<4; extra == "haystack"
|
27
26
|
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
28
27
|
Provides-Extra: easyocr
|
29
28
|
Requires-Dist: easyocr; extra == "easyocr"
|
@@ -75,11 +74,14 @@ Requires-Dist: natural-pdf[test]; extra == "all"
|
|
75
74
|
Provides-Extra: core-ml
|
76
75
|
Requires-Dist: torch; extra == "core-ml"
|
77
76
|
Requires-Dist: torchvision; extra == "core-ml"
|
78
|
-
Requires-Dist: transformers; extra == "core-ml"
|
77
|
+
Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
|
79
78
|
Requires-Dist: huggingface_hub; extra == "core-ml"
|
80
79
|
Provides-Extra: ocr-export
|
81
80
|
Requires-Dist: ocrmypdf; extra == "ocr-export"
|
82
81
|
Requires-Dist: pikepdf; extra == "ocr-export"
|
82
|
+
Provides-Extra: export-extras
|
83
|
+
Requires-Dist: jupytext; extra == "export-extras"
|
84
|
+
Requires-Dist: nbformat; extra == "export-extras"
|
83
85
|
Dynamic: license-file
|
84
86
|
|
85
87
|
# Natural PDF
|
@@ -12,6 +12,7 @@ docs/document-qa/index.ipynb,sha256=MXJoFhi8TUKK6ZnRFiUBglLGpMbzwdb7LJYfzw8Gp48,
|
|
12
12
|
docs/document-qa/index.md,sha256=mwuO4tothg0OzBXewnj73QEJu46Udq7f1pQBYrKOHwM,2131
|
13
13
|
docs/element-selection/index.ipynb,sha256=-7PwKw1RbPlZ4stzN1Rd1GJ8mwjOD4ySsLcpqVX7chc,1193628
|
14
14
|
docs/element-selection/index.md,sha256=_1P8vI64Y0aSVwUzdRJD4ayb80BJWBLED9TvVpveFx8,6979
|
15
|
+
docs/finetuning/index.md,sha256=Ur3zqSaR0X8PvBCSyI7cFiDv5qZ6Jtv4omBKXCKAzEk,9200
|
15
16
|
docs/installation/index.md,sha256=nd4RZrQFR8_vv7Xm3xAzp7z-CQQr9ffAcGa7yuEYn2U,1594
|
16
17
|
docs/interactive-widget/index.ipynb,sha256=zY1rz5N34OUW-OtgcbI6iiOjlIJqXjVcx9OoNWMjuyU,293111
|
17
18
|
docs/interactive-widget/index.md,sha256=tZbq0uYI7Zwo9mLbhXpqeBriuAjazkIyEJeP-jasJ-Q,259
|
@@ -28,7 +29,7 @@ docs/text-analysis/index.ipynb,sha256=iaup8pcQXGp0ZK3IWi-HHssQLdIzWYGYfvZK5i8yjj
|
|
28
29
|
docs/text-analysis/index.md,sha256=02pfZemOgV37izV7H-XzKmHu7AedDKLidQ-sKhYaMVw,3527
|
29
30
|
docs/text-extraction/index.ipynb,sha256=809y9ZamXT3bc3GhwwFyoDnlyEpO-kUZ3tIsZZWyrj8,2537087
|
30
31
|
docs/text-extraction/index.md,sha256=b1KfQpvIEelc8cPbFETUnK92az7iB4b7-LqK2DRH8vw,6985
|
31
|
-
docs/tutorials/01-loading-and-extraction.ipynb,sha256
|
32
|
+
docs/tutorials/01-loading-and-extraction.ipynb,sha256=SCW26hxW9PhOspiR-2X5CD6L1EiJRfXouO-OF_Nc718,4548
|
32
33
|
docs/tutorials/01-loading-and-extraction.md,sha256=g40J8GhKz-ikM2URj5MqIatKKj4l5kTFozHeVjxDJQA,2191
|
33
34
|
docs/tutorials/02-finding-elements.ipynb,sha256=k1CSz47_atA9D6DXfQzVS64t5-L-KjssU2VuFvdy7oU,524374
|
34
35
|
docs/tutorials/02-finding-elements.md,sha256=qOkjcWUzem05of54aKzKvy-MMzRX_S4CyZisVV-73QM,4162
|
@@ -59,7 +60,7 @@ docs/tutorials/13-semantic-search.md,sha256=nsNjv0ipYUC3YPSqT5d6dga9ZjObEc04Mc8c
|
|
59
60
|
docs/visual-debugging/index.ipynb,sha256=MJ92u3Q9sfRCyDAQM4KWmCrs4QhKwIagbn6ytPF83L4,2175800
|
60
61
|
docs/visual-debugging/index.md,sha256=ueGD2kNFhEAgIHt7qxCfrLRLjHcR7NTD3AU9okBhX9k,4176
|
61
62
|
docs/visual-debugging/region.png,sha256=ULAJs3ZTxMjpD9F4w1DKaZXmhxga3KRq3NrUsXgw28s,67835
|
62
|
-
natural_pdf/__init__.py,sha256=
|
63
|
+
natural_pdf/__init__.py,sha256=UdS-I3d7MzSvpxL-QMQUSUO5IGhh8c5of34BIs49TaU,2670
|
63
64
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
64
65
|
natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
|
65
66
|
natural_pdf/analyzers/text_structure.py,sha256=9h8hKRz0JWnr13xQr3b4FFr_-hDIjue07WvG7LmT8nc,12827
|
@@ -67,7 +68,7 @@ natural_pdf/analyzers/utils.py,sha256=Lgub1kYSTOnNxeLO1klStHLwH-GIuT4vpdqyVRF-Mc
|
|
67
68
|
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
68
69
|
natural_pdf/analyzers/layout/base.py,sha256=9dCR758mAuz7ExlHJ-gwnPnETaM4GZV3W1IRei_t13s,6815
|
69
70
|
natural_pdf/analyzers/layout/docling.py,sha256=4BJYyNVR6VegZGxyisvNIBBRvVk6YKPyDVs7ZdVfzEU,12676
|
70
|
-
natural_pdf/analyzers/layout/gemini.py,sha256=
|
71
|
+
natural_pdf/analyzers/layout/gemini.py,sha256=CzJPWyyEghuCNpu2CMb6OA6FtBGdGhXspHjsjy6I4JE,11195
|
71
72
|
natural_pdf/analyzers/layout/layout_analyzer.py,sha256=6aed1qz5jpndOiakXCBRZAcnyG_waeXi3WPuP5fRvh4,14046
|
72
73
|
natural_pdf/analyzers/layout/layout_manager.py,sha256=Vh8EKiszKqjELofxQ1eiVLKVjibyjBsZpLFzTf0_21E,11179
|
73
74
|
natural_pdf/analyzers/layout/layout_options.py,sha256=s7xr4brE3OutE6aYNAi2PniRy1p2w8a342C2xGpvX2s,3777
|
@@ -75,30 +76,32 @@ natural_pdf/analyzers/layout/paddle.py,sha256=gTI9ZqNd5-t4H5IByGfL32WgcE6JrdchW6
|
|
75
76
|
natural_pdf/analyzers/layout/surya.py,sha256=vhji6ynHPMyQLHuYRPQcplNi7m_lG4P4NYtWv6MzcME,13556
|
76
77
|
natural_pdf/analyzers/layout/tatr.py,sha256=-GJhMy4d0yx6egkO9-ULAIdQkkQRyAKExoIta-b256U,12971
|
77
78
|
natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
|
78
|
-
natural_pdf/collections/pdf_collection.py,sha256=
|
79
|
+
natural_pdf/collections/pdf_collection.py,sha256=afE0tNIfwA7IRCc8g0EGgiBgJz3TuJbEzZ5meDNAnQw,13272
|
79
80
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
80
|
-
natural_pdf/core/element_manager.py,sha256=
|
81
|
+
natural_pdf/core/element_manager.py,sha256=RjLCzeHDRJCoCx1W_6jGg8KsiCTuXz7Uc2BoSY4M7mE,22144
|
81
82
|
natural_pdf/core/highlighting_service.py,sha256=CTVd7y-fpIreFSe70cTpMu1Pwl6HKMtTHp0bh2U7VXk,32609
|
82
|
-
natural_pdf/core/page.py,sha256=
|
83
|
-
natural_pdf/core/pdf.py,sha256=
|
83
|
+
natural_pdf/core/page.py,sha256=emS6jJdb-J7xnK8Uo8Hs1n0plbIAGA_YH6kmp36wVgM,84955
|
84
|
+
natural_pdf/core/pdf.py,sha256=hOR1i3bJjfJCBCI2m4pBNAMEYpmbtG905QbFe-l8gZU,46525
|
84
85
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
85
|
-
natural_pdf/elements/base.py,sha256=
|
86
|
-
natural_pdf/elements/collections.py,sha256=
|
86
|
+
natural_pdf/elements/base.py,sha256=UtoSD-c_s0yiLpWZrIIJjeJ9MgGz_4R0UHYcsFWH6bc,35157
|
87
|
+
natural_pdf/elements/collections.py,sha256=w0JqLwn57Je00Aq4Ay8SeYmxPjPJvUOtkLbgfGM2-nM,68882
|
87
88
|
natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
|
88
89
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
89
|
-
natural_pdf/elements/region.py,sha256
|
90
|
+
natural_pdf/elements/region.py,sha256=9E21LYQWB98coi_73Kpf9mQ60p9ElzGOJzxdtgOUfh4,69662
|
90
91
|
natural_pdf/elements/text.py,sha256=8PNKSLUgXUhEu9IFfbNbSSpuu0Slm11T6UH8jn4O6hQ,11078
|
91
|
-
natural_pdf/exporters/__init__.py,sha256=
|
92
|
+
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
93
|
+
natural_pdf/exporters/base.py,sha256=s-NpHoH81x80GQxs0oqjdhPGrzbUa8npjnK8apKOsHQ,2115
|
94
|
+
natural_pdf/exporters/paddleocr.py,sha256=1G2bS2-CcuAtS78JZYRczO3r5k8fdO9jrExH0Kr9r7M,16249
|
92
95
|
natural_pdf/exporters/searchable_pdf.py,sha256=qsaPsnbOOaZHA_aplfZbwQnBoK9KghWm-wzbyRRomeY,16859
|
93
|
-
natural_pdf/ocr/__init__.py,sha256=
|
94
|
-
natural_pdf/ocr/engine.py,sha256=
|
95
|
-
natural_pdf/ocr/engine_easyocr.py,sha256=
|
96
|
-
natural_pdf/ocr/engine_paddle.py,sha256=
|
97
|
-
natural_pdf/ocr/engine_surya.py,sha256=
|
98
|
-
natural_pdf/ocr/ocr_factory.py,sha256=
|
99
|
-
natural_pdf/ocr/ocr_manager.py,sha256=
|
100
|
-
natural_pdf/ocr/ocr_options.py,sha256=
|
101
|
-
natural_pdf/ocr/utils.py,sha256=
|
96
|
+
natural_pdf/ocr/__init__.py,sha256=jKaDbo13CdCDcas1WiBmg5gjBvVeG-Z9uaeYxyzvaNY,2464
|
97
|
+
natural_pdf/ocr/engine.py,sha256=ZBC1tZNM5EDbGDJJmZI9mNHr4nCMLEZvUFhiJq8GdF4,8741
|
98
|
+
natural_pdf/ocr/engine_easyocr.py,sha256=rnDXLNa-keymonR3qbLEbbxA6bqk4QUAVCHKUDixqKg,9045
|
99
|
+
natural_pdf/ocr/engine_paddle.py,sha256=2nIrvLBBAiZG1BxVo3eFVJulA6YGoOTXw_RN98p_BUk,6184
|
100
|
+
natural_pdf/ocr/engine_surya.py,sha256=iySjG-Dahgh0cLICfbMtOcwUpRFcZjo-5Ed5Zwz-o5Y,4805
|
101
|
+
natural_pdf/ocr/ocr_factory.py,sha256=IFccj0BB75YGV4hjcy4ECtGQX_JQzdptpvDFfeGxxgI,4391
|
102
|
+
natural_pdf/ocr/ocr_manager.py,sha256=PqF1z1ET8emSw19r7jtEkC9_LZJXY7C5zK5cFklo57I,9238
|
103
|
+
natural_pdf/ocr/ocr_options.py,sha256=MIH7cOe8esuiGcVe4AtArSeQdaIpUu9RaUZbuwwvKQw,3294
|
104
|
+
natural_pdf/ocr/utils.py,sha256=kdO4sCBqCb5qB-9iPqdPN8_5t1jWwijpT-ci5UHnz6A,3867
|
102
105
|
natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
|
103
106
|
natural_pdf/qa/document_qa.py,sha256=W4E4vS_Eox_IBsYpVb0ifQbJb0FP-PYEIG93CU3rUkE,15246
|
104
107
|
natural_pdf/search/__init__.py,sha256=EB_HRwlktJn5WGPVtSaRbOQNjLAZTxujeYf_eN-zd2U,4191
|
@@ -110,22 +113,23 @@ natural_pdf/search/searchable_mixin.py,sha256=M2a6FaFVM0vcfh7FgjDH6BLhS-7ggeVpcf
|
|
110
113
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
111
114
|
natural_pdf/selectors/parser.py,sha256=59_GSsTApM6MFvtqhrrmbKaBfODPbGXMluvvQJcrqhE,15754
|
112
115
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
116
|
+
natural_pdf/templates/finetune/fine_tune_paddleocr.md,sha256=AGt6kQWSTJZ8F28iN1D4p_Q6f1bvFML9gyUk6QcSHDc,14517
|
113
117
|
natural_pdf/templates/spa/index.html,sha256=6hLTp07OeV5Q4jUMp5Sgl-dwfBs3oPzBxqphG4kEs24,787
|
114
118
|
natural_pdf/templates/spa/words.txt,sha256=vkGtl5Y7-Nq-3Vhx1daRWWF1Jp1UCVaw-ZZaiFwrurk,2493885
|
115
119
|
natural_pdf/templates/spa/css/style.css,sha256=Qdl0U3L5HMyhBDNzyRPklfb3OxW6rMxCfQbzO8i8IW4,7643
|
116
120
|
natural_pdf/templates/spa/js/app.js,sha256=Efb7NmcTN9RLdLwKpDcU6CG5Ix0laHtzRHmfUlDMJXw,19679
|
117
121
|
natural_pdf/utils/__init__.py,sha256=s3M8FggaK1P3EBYn6R_-HgSDjNc9C73gyKe1hihtNWg,43
|
118
|
-
natural_pdf/utils/debug.py,sha256=
|
122
|
+
natural_pdf/utils/debug.py,sha256=lk_6qzxan8NagjEtJEZpZ2MS30SO8ce6iznBxmA0xgk,995
|
119
123
|
natural_pdf/utils/highlighting.py,sha256=EIY6ihVGtUTS_DjWyxpnr_UXpcR4btC1KhSGQ9VUfKg,698
|
120
|
-
natural_pdf/utils/identifiers.py,sha256=
|
121
|
-
natural_pdf/utils/packaging.py,sha256=
|
124
|
+
natural_pdf/utils/identifiers.py,sha256=n61viCQiMlf5-E_jsPLe-FkPBdKkMKv-gfs5tGqlKiw,1117
|
125
|
+
natural_pdf/utils/packaging.py,sha256=HSgpubpHICU75L4ZAZPU8iOjium055XWnklV9_YqoCA,21579
|
122
126
|
natural_pdf/utils/reading_order.py,sha256=s3DsYq_3g_1YA07qhd4BGEjeIRTeyGtnwc_hNtSzwBY,7290
|
123
|
-
natural_pdf/utils/text_extraction.py,sha256=
|
124
|
-
natural_pdf/utils/visualization.py,sha256=
|
127
|
+
natural_pdf/utils/text_extraction.py,sha256=ujhqU2C9y2YwzGDBfT9oiGPUvSz6mVqq72ttd3Ksskg,7712
|
128
|
+
natural_pdf/utils/visualization.py,sha256=5GbhxtvZW-77ONVnICupg-s2D-OaxLZNqkKlOrQESK4,8593
|
125
129
|
natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
|
126
130
|
natural_pdf/widgets/viewer.py,sha256=Aiw6kuBc0WkhcZrPNKyLNzzWbmtmU6rvOmHV0IuXCBk,40862
|
127
131
|
natural_pdf/widgets/frontend/viewer.js,sha256=w8ywfz_IOAAv2nP_qaf2VBUkF1KhjT3zorhJxM1-CfU,4371
|
128
|
-
natural_pdf-0.1.
|
132
|
+
natural_pdf-0.1.7.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
129
133
|
notebooks/Examples.ipynb,sha256=l4YMtMEx_DWBzWIjl9CmBkWTo0g_nK8l_XWOyzYooQM,4275170
|
130
134
|
pdfs/.gitkeep,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
131
135
|
pdfs/01-practice.pdf,sha256=dxWyJIa2cm7bALE3BWDJ2dg3inyFlo1n8ntVyy0hkTo,7906
|
@@ -135,7 +139,7 @@ pdfs/2014 Statistics.pdf,sha256=B-30OQVjqj_3718-G9cGUefNddnz-MosPdHAzfGfkcc,9559
|
|
135
139
|
pdfs/2019 Statistics.pdf,sha256=reuSJxvAlx9_P-pW7IPqzox0jFCxSPbK1i1-WFu-uGA,511439
|
136
140
|
pdfs/Atlanta_Public_Schools_GA_sample.pdf,sha256=PLBh_uWJQH0MnBaSm5ng5Ima63_m6Mi11CjdravB_S8,137689
|
137
141
|
pdfs/needs-ocr.pdf,sha256=vusKiLxSOlELUTetfZfaotNU54RtMj9PCzGfLc2cuNs,139305
|
138
|
-
natural_pdf-0.1.
|
139
|
-
natural_pdf-0.1.
|
140
|
-
natural_pdf-0.1.
|
141
|
-
natural_pdf-0.1.
|
142
|
+
natural_pdf-0.1.7.dist-info/METADATA,sha256=BMzSroqVMlbJrti_56ilNFZkSEH2-hJc8vUVrjk3OZU,6766
|
143
|
+
natural_pdf-0.1.7.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
|
144
|
+
natural_pdf-0.1.7.dist-info/top_level.txt,sha256=7nDKUnpkN7B8cBI7DEpW5JM8S7OcOgHw3jXH-1iCX2o,32
|
145
|
+
natural_pdf-0.1.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|