natural-pdf 0.1.8__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203) hide show
  1. natural_pdf-0.1.10/MANIFEST.in +48 -0
  2. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/PKG-INFO +12 -3
  3. natural_pdf-0.1.10/audit_packaging.py +56 -0
  4. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/categorizing-documents/index.md +20 -23
  5. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/data-extraction/index.md +41 -19
  6. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/index.md +4 -4
  7. natural_pdf-0.1.10/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
  8. natural_pdf-0.1.10/docs/tutorials/02-finding-elements.ipynb +374 -0
  9. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/02-finding-elements.md +3 -3
  10. natural_pdf-0.1.10/docs/tutorials/03-extracting-blocks.ipynb +152 -0
  11. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.ipynb +12 -12
  12. natural_pdf-0.1.10/docs/tutorials/05-excluding-content.ipynb +275 -0
  13. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.ipynb +28 -28
  14. natural_pdf-0.1.10/docs/tutorials/07-layout-analysis.ipynb +269 -0
  15. natural_pdf-0.1.10/docs/tutorials/07-working-with-regions.ipynb +414 -0
  16. natural_pdf-0.1.10/docs/tutorials/08-spatial-navigation.ipynb +513 -0
  17. natural_pdf-0.1.10/docs/tutorials/09-section-extraction.ipynb +2439 -0
  18. natural_pdf-0.1.10/docs/tutorials/10-form-field-extraction.ipynb +503 -0
  19. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  20. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.ipynb +976 -976
  21. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.ipynb +410 -717
  22. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.md +8 -7
  23. natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.ipynb +2365 -0
  24. natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.md +99 -0
  25. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/mkdocs.yml +1 -0
  26. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/__init__.py +1 -0
  27. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/base.py +1 -5
  28. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/gemini.py +61 -51
  29. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  30. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_manager.py +26 -84
  31. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_options.py +7 -0
  32. natural_pdf-0.1.10/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  33. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/surya.py +46 -123
  34. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/tatr.py +51 -4
  35. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_structure.py +3 -5
  36. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/utils.py +3 -3
  37. natural_pdf-0.1.10/natural_pdf/classification/manager.py +426 -0
  38. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/classification/mixin.py +52 -38
  39. natural_pdf-0.1.10/natural_pdf/classification/results.py +88 -0
  40. natural_pdf-0.1.10/natural_pdf/collections/mixins.py +128 -0
  41. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/collections/pdf_collection.py +245 -100
  42. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/core/element_manager.py +30 -14
  43. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/core/highlighting_service.py +13 -22
  44. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/core/page.py +423 -101
  45. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/core/pdf.py +694 -195
  46. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/base.py +134 -40
  47. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/collections.py +610 -134
  48. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/region.py +659 -90
  49. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/text.py +1 -1
  50. natural_pdf-0.1.10/natural_pdf/export/mixin.py +137 -0
  51. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/exporters/base.py +3 -3
  52. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/exporters/paddleocr.py +4 -3
  53. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/extraction/manager.py +50 -49
  54. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/extraction/mixin.py +90 -57
  55. natural_pdf-0.1.10/natural_pdf/extraction/result.py +23 -0
  56. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/__init__.py +5 -5
  57. natural_pdf-0.1.10/natural_pdf/ocr/engine_doctr.py +346 -0
  58. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_factory.py +24 -4
  59. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_manager.py +61 -25
  60. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_options.py +70 -10
  61. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/utils.py +6 -4
  62. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/search/__init__.py +20 -34
  63. natural_pdf-0.1.10/natural_pdf/search/haystack_search_service.py +687 -0
  64. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/search/haystack_utils.py +99 -75
  65. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/search/search_service_protocol.py +11 -12
  66. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/selectors/parser.py +219 -143
  67. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/debug.py +3 -3
  68. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/locks.py +1 -1
  69. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/packaging.py +8 -6
  70. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/text_extraction.py +24 -16
  71. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/tqdm_utils.py +18 -10
  72. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/visualization.py +18 -0
  73. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/widgets/viewer.py +4 -25
  74. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf.egg-info/PKG-INFO +12 -3
  75. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf.egg-info/SOURCES.txt +7 -1
  76. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf.egg-info/requires.txt +13 -2
  77. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf.egg-info/top_level.txt +0 -2
  78. natural_pdf-0.1.10/pdfs/cia-doc.pdf +0 -0
  79. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pyproject.toml +28 -12
  80. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/tests/exporters/test_paddleocr_exporter.py +4 -3
  81. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/tests/test_optional_deps.py +43 -17
  82. natural_pdf-0.1.8/MANIFEST.in +0 -8
  83. natural_pdf-0.1.8/docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  84. natural_pdf-0.1.8/docs/tutorials/02-finding-elements.ipynb +0 -417
  85. natural_pdf-0.1.8/docs/tutorials/03-extracting-blocks.ipynb +0 -152
  86. natural_pdf-0.1.8/docs/tutorials/05-excluding-content.ipynb +0 -275
  87. natural_pdf-0.1.8/docs/tutorials/07-layout-analysis.ipynb +0 -293
  88. natural_pdf-0.1.8/docs/tutorials/07-working-with-regions.ipynb +0 -414
  89. natural_pdf-0.1.8/docs/tutorials/08-spatial-navigation.ipynb +0 -513
  90. natural_pdf-0.1.8/docs/tutorials/09-section-extraction.ipynb +0 -2439
  91. natural_pdf-0.1.8/docs/tutorials/10-form-field-extraction.ipynb +0 -517
  92. natural_pdf-0.1.8/natural_pdf/classification/manager.py +0 -343
  93. natural_pdf-0.1.8/natural_pdf/classification/results.py +0 -62
  94. natural_pdf-0.1.8/natural_pdf/collections/mixins.py +0 -63
  95. natural_pdf-0.1.8/natural_pdf/extraction/result.py +0 -37
  96. natural_pdf-0.1.8/natural_pdf/search/haystack_search_service.py +0 -643
  97. natural_pdf-0.1.8/notebooks/Examples.ipynb +0 -1293
  98. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/analysis_framework.mdc +0 -0
  99. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/coding-style.mdc +0 -0
  100. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  101. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/minimal-comments.mdc +0 -0
  102. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  103. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  104. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.github/workflows/docs.yml +0 -0
  105. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/.gitignore +0 -0
  106. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/01-execute_notebooks.py +0 -0
  107. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/02-run_all_tutorials.sh +0 -0
  108. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/CLAUDE.md +0 -0
  109. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/LICENSE +0 -0
  110. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/README.md +0 -0
  111. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/check_run_md.sh +0 -0
  112. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/api/index.md +0 -0
  113. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/favicon.png +0 -0
  114. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/favicon.svg +0 -0
  115. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/javascripts/custom.js +0 -0
  116. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/logo.svg +0 -0
  117. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/sample-screen.png +0 -0
  118. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/social-preview.png +0 -0
  119. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/social-preview.svg +0 -0
  120. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/assets/stylesheets/custom.css +0 -0
  121. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/document-qa/index.ipynb +0 -0
  122. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/document-qa/index.md +0 -0
  123. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/element-selection/index.ipynb +0 -0
  124. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/element-selection/index.md +0 -0
  125. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/finetuning/index.md +0 -0
  126. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/installation/index.md +0 -0
  127. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/interactive-widget/index.ipynb +0 -0
  128. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/interactive-widget/index.md +0 -0
  129. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/layout-analysis/index.ipynb +0 -0
  130. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/layout-analysis/index.md +0 -0
  131. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/ocr/index.md +0 -0
  132. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/pdf-navigation/index.ipynb +0 -0
  133. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/pdf-navigation/index.md +0 -0
  134. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/regions/index.ipynb +0 -0
  135. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/regions/index.md +0 -0
  136. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tables/index.ipynb +0 -0
  137. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tables/index.md +0 -0
  138. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/text-analysis/index.ipynb +0 -0
  139. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/text-analysis/index.md +0 -0
  140. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/text-extraction/index.ipynb +0 -0
  141. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/text-extraction/index.md +0 -0
  142. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/01-loading-and-extraction.md +0 -0
  143. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/03-extracting-blocks.md +0 -0
  144. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.md +0 -0
  145. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/05-excluding-content.md +0 -0
  146. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.md +0 -0
  147. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/07-layout-analysis.md +0 -0
  148. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/07-working-with-regions.md +0 -0
  149. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/08-spatial-navigation.md +0 -0
  150. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/09-section-extraction.md +0 -0
  151. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/10-form-field-extraction.md +0 -0
  152. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  153. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.md +0 -0
  154. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/visual-debugging/index.ipynb +0 -0
  155. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/visual-debugging/index.md +0 -0
  156. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/docs/visual-debugging/region.png +0 -0
  157. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/__init__.py +0 -0
  158. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/__init__.py +0 -0
  159. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/docling.py +0 -0
  160. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/paddle.py +0 -0
  161. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/yolo.py +0 -0
  162. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_options.py +0 -0
  163. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/core/__init__.py +0 -0
  164. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/__init__.py +0 -0
  165. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/line.py +0 -0
  166. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/elements/rect.py +0 -0
  167. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/exporters/__init__.py +0 -0
  168. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/exporters/searchable_pdf.py +0 -0
  169. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/engine.py +0 -0
  170. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_easyocr.py +0 -0
  171. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_paddle.py +0 -0
  172. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_surya.py +0 -0
  173. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/qa/__init__.py +0 -0
  174. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/qa/document_qa.py +0 -0
  175. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/search/search_options.py +0 -0
  176. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/search/searchable_mixin.py +0 -0
  177. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/selectors/__init__.py +0 -0
  178. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/__init__.py +0 -0
  179. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  180. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/spa/css/style.css +0 -0
  181. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/spa/index.html +0 -0
  182. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/spa/js/app.js +0 -0
  183. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/templates/spa/words.txt +0 -0
  184. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/__init__.py +0 -0
  185. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/highlighting.py +0 -0
  186. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/identifiers.py +1 -1
  187. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/utils/reading_order.py +0 -0
  188. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/widgets/__init__.py +0 -0
  189. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf/widgets/frontend/viewer.js +0 -0
  190. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/natural_pdf.egg-info/dependency_links.txt +0 -0
  191. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/noxfile.py +0 -0
  192. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/.gitkeep +0 -0
  193. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/01-practice.pdf +0 -0
  194. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/0500000US42001.pdf +0 -0
  195. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/0500000US42007.pdf +0 -0
  196. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/2014 Statistics.pdf +0 -0
  197. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/2019 Statistics.pdf +0 -0
  198. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  199. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/pdfs/needs-ocr.pdf +0 -0
  200. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/publish.sh +0 -0
  201. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/sample-screen.png +0 -0
  202. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/setup.cfg +0 -0
  203. {natural_pdf-0.1.8 → natural_pdf-0.1.10}/tests/test_loading.py +0 -0
@@ -0,0 +1,48 @@
1
+ include README.md
2
+ include LICENSE
3
+
4
+ # HTML templates
5
+ recursive-include natural_pdf/templates *.html
6
+
7
+ # Documentation assets
8
+ recursive-include docs *.md *.png *.jpg *.gif
9
+
10
+ # Remove common build garbage
11
+ global-exclude __pycache__ *.py[cod] *.so .DS_Store
12
+ global-exclude *hidden*
13
+
14
+ # 💣 Critical: prevent recursion bugs
15
+ prune build
16
+ prune dist
17
+ prune .nox
18
+ prune .venv
19
+ prune env
20
+ prune venv
21
+
22
+ # General junk
23
+ exclude .notebook_cache.json
24
+ exclude Untitled.ipynb
25
+ exclude conversation.md
26
+ exclude transcript.md
27
+ exclude sample.py
28
+ exclude sample2.py
29
+ exclude requirements.lock
30
+ exclude install.sh
31
+
32
+ # Directories to exclude
33
+ prune .venv
34
+ prune output
35
+ prune results
36
+ prune natural_pdf_index
37
+ prune hidden
38
+ prune pdfs/hidden
39
+ prune my_paddleocr_finetune_data
40
+ prune notebooks
41
+ prune docs/tutorials/pdfs
42
+
43
+ # Individual files in nested directories
44
+ exclude docs/tutorials/needs-ocr-searchable.pdf
45
+ exclude notebooks/Examples.md
46
+
47
+ # File patterns
48
+ global-exclude *.hocr
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.8
3
+ Version: 0.1.10
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -17,11 +17,13 @@ Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
19
  Requires-Dist: tqdm
20
+ Requires-Dist: pydantic
20
21
  Provides-Extra: interactive
21
22
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
22
23
  Provides-Extra: haystack
23
24
  Requires-Dist: haystack-ai; extra == "haystack"
24
- Requires-Dist: chroma-haystack; extra == "haystack"
25
+ Requires-Dist: lancedb-haystack; extra == "haystack"
26
+ Requires-Dist: lancedb; extra == "haystack"
25
27
  Requires-Dist: sentence-transformers; extra == "haystack"
26
28
  Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
27
29
  Provides-Extra: easyocr
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
36
38
  Provides-Extra: surya
37
39
  Requires-Dist: surya-ocr; extra == "surya"
38
40
  Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
+ Provides-Extra: doctr
42
+ Requires-Dist: python-doctr[torch]; extra == "doctr"
43
+ Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
39
44
  Provides-Extra: qa
40
45
  Requires-Dist: natural-pdf[core-ml]; extra == "qa"
41
46
  Provides-Extra: docling
@@ -43,7 +48,6 @@ Requires-Dist: docling; extra == "docling"
43
48
  Requires-Dist: natural-pdf[core-ml]; extra == "docling"
44
49
  Provides-Extra: llm
45
50
  Requires-Dist: openai>=1.0; extra == "llm"
46
- Requires-Dist: pydantic; extra == "llm"
47
51
  Provides-Extra: classification
48
52
  Requires-Dist: sentence-transformers; extra == "classification"
49
53
  Requires-Dist: timm; extra == "classification"
@@ -63,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
63
67
  Requires-Dist: nbformat; extra == "dev"
64
68
  Requires-Dist: jupytext; extra == "dev"
65
69
  Requires-Dist: nbclient; extra == "dev"
70
+ Provides-Extra: deskew
71
+ Requires-Dist: deskew>=1.5; extra == "deskew"
72
+ Requires-Dist: img2pdf; extra == "deskew"
66
73
  Provides-Extra: all
67
74
  Requires-Dist: natural-pdf[interactive]; extra == "all"
68
75
  Requires-Dist: natural-pdf[haystack]; extra == "all"
@@ -70,11 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
70
77
  Requires-Dist: natural-pdf[paddle]; extra == "all"
71
78
  Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
72
79
  Requires-Dist: natural-pdf[surya]; extra == "all"
80
+ Requires-Dist: natural-pdf[doctr]; extra == "all"
73
81
  Requires-Dist: natural-pdf[qa]; extra == "all"
74
82
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
75
83
  Requires-Dist: natural-pdf[docling]; extra == "all"
76
84
  Requires-Dist: natural-pdf[llm]; extra == "all"
77
85
  Requires-Dist: natural-pdf[classification]; extra == "all"
86
+ Requires-Dist: natural-pdf[deskew]; extra == "all"
78
87
  Requires-Dist: natural-pdf[test]; extra == "all"
79
88
  Provides-Extra: core-ml
80
89
  Requires-Dist: torch; extra == "core-ml"
@@ -0,0 +1,56 @@
1
+ import subprocess
2
+ import tarfile
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ DIST_DIR = Path("dist")
7
+
8
+
9
+ def build_package():
10
+ subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
11
+
12
+
13
+ def get_sdist_files():
14
+ sdist_path = next(DIST_DIR.glob("*.tar.gz"))
15
+ with tarfile.open(sdist_path, "r:gz") as tar:
16
+ return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
17
+
18
+
19
+ def get_wheel_files():
20
+ wheel_path = next(DIST_DIR.glob("*.whl"))
21
+ with zipfile.ZipFile(wheel_path, "r") as zipf:
22
+ return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
23
+
24
+
25
+ def get_gitignored_files():
26
+ proc = subprocess.run(
27
+ ["git", "ls-files", "--others", "-i", "--exclude-standard"],
28
+ check=True,
29
+ capture_output=True,
30
+ text=True,
31
+ )
32
+ return sorted(proc.stdout.strip().splitlines())
33
+
34
+
35
+ def diff_lists(packaged, ignored):
36
+ return sorted(set(packaged) & set(ignored))
37
+
38
+
39
+ def main():
40
+ build_package()
41
+
42
+ sdist_files = get_sdist_files()
43
+ wheel_files = get_wheel_files()
44
+ ignored_files = get_gitignored_files()
45
+
46
+ print("\n🚫 Files in *sdist* that are also .gitignored:")
47
+ for f in diff_lists(sdist_files, ignored_files):
48
+ print(" •", f)
49
+
50
+ print("\n🚫 Files in *wheel* that are also .gitignored:")
51
+ for f in diff_lists(wheel_files, ignored_files):
52
+ print(" •", f)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
@@ -22,32 +22,29 @@ from natural_pdf import PDF
22
22
  # Example: Classify a Page
23
23
  pdf = PDF("pdfs/01-practice.pdf")
24
24
  page = pdf.pages[0]
25
- categories = ["invoice", "letter", "report cover", "data table"]
26
- results = page.classify(categories=categories, model="text")
25
+ labels = ["invoice", "letter", "report cover", "data table"]
26
+ page.classify(labels, using="text")
27
27
 
28
28
  # Access the top result
29
29
  print(f"Top Category: {page.category}")
30
30
  print(f"Confidence: {page.category_confidence:.3f}")
31
-
32
- # Access all results
33
- # print(page.classification_results)
34
31
  ```
35
32
 
36
33
  **Key Arguments:**
37
34
 
38
- * `categories` (required): A list of strings representing the potential categories you want to classify the item into.
39
- * `model` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
35
+ * `labels` (required): A list of strings representing the potential labels you want to classify the item into.
36
+ * `using` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
40
37
  * `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
41
38
  * `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
42
39
  * Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether it's text or vision, but you may need to set `using` explicitly if that inference fails or is ambiguous.
43
- * `using` (optional): Explicitly set to `"text"` or `"vision"` if the automatic inference based on the `model` ID fails or is ambiguous.
44
- * `min_confidence` (optional): A float between 0.0 and 1.0. Only categories with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
40
+ * `model` (optional): An explicit model ID (Hugging Face repo name) to use.
41
+ * `min_confidence` (optional): A float between 0.0 and 1.0. Only labels with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
45
42
 
46
43
  ## Text vs. Vision Classification
47
44
 
48
45
  Choosing the right model type depends on your goal:
49
46
 
50
- ### Text Classification (`model="text"`)
47
+ ### Text Classification (`using="text"`)
51
48
 
52
49
  * **How it works:** Extracts the text from the page or region and analyzes the language content.
53
50
  * **Best for:**
@@ -57,12 +54,12 @@ Choosing the right model type depends on your goal:
57
54
 
58
55
  ```python
59
56
  # Find pages related to finance
60
- financial_categories = ["budget", "revenue", "expenditure", "forecast"]
61
- pdf.classify_pages(categories=financial_categories, model="text")
57
+ financial_labels = ["budget", "revenue", "expenditure", "forecast"]
58
+ pdf.classify_pages(financial_labels, using="text")
62
59
  budget_pages = [p for p in pdf.pages if p.category == "budget"]
63
60
  ```
64
61
 
65
- ### Vision Classification (`model="vision"`)
62
+ ### Vision Classification (`using="vision"`)
66
63
 
67
64
  * **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
68
65
  * **Best for:**
@@ -72,8 +69,8 @@ budget_pages = [p for p in pdf.pages if p.category == "budget"]
72
69
 
73
70
  ```python
74
71
  # Find pages that look like invoices or receipts
75
- visual_categories = ["invoice", "receipt", "letter", "form"]
76
- page.classify(categories=visual_categories, model="vision")
72
+ visual_labels = ["invoice", "receipt", "letter", "form"]
73
+ page.classify(visual_labels, using="vision")
77
74
  if page.category in ["invoice", "receipt"]:
78
75
  print(f"Page {page.number} looks like an invoice or receipt.")
79
76
  ```
@@ -88,7 +85,7 @@ Classifying a whole page is useful for sorting documents or identifying the over
88
85
  # Classify the first page
89
86
  page = pdf.pages[0]
90
87
  page_types = ["cover page", "table of contents", "chapter start", "appendix"]
91
- page.classify(categories=page_types, model="vision") # Vision often good for page structure
88
+ page.classify(page_types, using="vision") # Vision often good for page structure
92
89
  print(f"Page 1 Type: {page.category}")
93
90
  ```
94
91
 
@@ -101,9 +98,9 @@ Classifying a specific region allows for more granular analysis within a page. Y
101
98
  paragraphs = page.find_all("region[type=paragraph]")
102
99
  if paragraphs:
103
100
  # Classify the topic of the first paragraph
104
- topic_categories = ["introduction", "methodology", "results", "conclusion"]
101
+ topic_labels = ["introduction", "methodology", "results", "conclusion"]
105
102
  # Use text model for topic
106
- paragraphs[0].classify(categories=topic_categories, model="text")
103
+ paragraphs[0].classify(topic_labels, using="text")
107
104
  print(f"First paragraph category: {paragraphs[0].category}")
108
105
  ```
109
106
 
@@ -113,10 +110,10 @@ After running `.classify()`, you can access the results:
113
110
 
114
111
  * `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
115
112
  * `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` otherwise.
116
- * `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, categories provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
113
+ * `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, labels provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
117
114
 
118
115
  ```python
119
- results = page.classify(categories=["invoice", "letter"], model="text", min_confidence=0.5)
116
+ results = page.classify(["invoice", "letter"], using="text", min_confidence=0.5)
120
117
 
121
118
  if page.category == "invoice":
122
119
  print(f"Found an invoice with confidence {page.category_confidence:.2f}")
@@ -135,10 +132,10 @@ Classifies pages across all PDFs in the collection. Use `max_workers` for parall
135
132
 
136
133
  ```python
137
134
  collection = natural_pdf.PDFCollection.from_directory("./documents/")
138
- categories = ["form", "datasheet", "image", "text document"]
135
+ labels = ["form", "datasheet", "image", "text document"]
139
136
 
140
137
  # Classify all pages using vision model, processing 4 PDFs concurrently
141
- collection.classify_all(categories=categories, model="vision", max_workers=4)
138
+ collection.classify_all(labels, using="vision", max_workers=4)
142
139
 
143
140
  # Filter PDFs containing forms
144
141
  form_pdfs = []
@@ -160,7 +157,7 @@ layout_regions = pdf.find_all("region")
160
157
  region_types = ["paragraph", "list", "table", "figure", "caption"]
161
158
 
162
159
  # Classify all detected regions based on vision
163
- layout_regions.classify_all(categories=region_types, model="vision")
160
+ layout_regions.classify_all(region_types, model="vision")
164
161
 
165
162
  # Count table regions
166
163
  table_count = sum(1 for r in layout_regions if r.category == "table")
@@ -1,42 +1,56 @@
1
1
  # Structured Data Extraction
2
2
 
3
- Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with Large Language Models (LLMs) via Pydantic schemas to achieve this.
3
+ Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
4
+
5
+ You need to install more than just the tiny baby default `natural_pdf` for this:
6
+ ```
7
+ # Install just the LLM portions
8
+ pip install "natural_pdf[llm]"
9
+
10
+ # Install eeeeeverything
11
+ pip install "natural_pdf[all]"
12
+ ```
4
13
 
5
14
  ## Introduction
6
15
 
7
16
  This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
8
17
 
18
+ > Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
19
+
9
20
  ## Basic Extraction
10
21
 
11
22
  1. **Define a Schema:** Create a Pydantic model for your desired data.
12
- 2. **Extract:** Use the `.extract()` method on a `PDF`, `Page`, or `Region` object.
13
- 3. **Access:** Use the `.extracted()` method to retrieve the results.
23
+ 2. **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
24
+ 3. **Access:** Use `.extracted()` to retrieve the results.
14
25
 
15
26
  ```python
16
27
  from typing import Optional
  from natural_pdf import PDF
17
28
  from pydantic import BaseModel, Field
18
- from openai import OpenAI # Example client
29
+ from openai import OpenAI
19
30
 
20
- # Example: Initialize your LLM client
21
- client = OpenAI()
31
+ # Initialize your LLM client
32
+ # Anything OpenAI-compatible works!
33
+ client = OpenAI(
34
+ api_key="ANTHROPIC_API_KEY", # Your Anthropic API key
35
+ base_url="https://api.anthropic.com/v1/" # Anthropic's API endpoint
36
+ )
22
37
 
23
38
  # Load the PDF
24
39
  pdf = PDF("path/to/your/document.pdf")
25
40
  page = pdf.pages[0]
26
41
 
27
- # 1. Define your schema
42
+ # Define your schema
28
43
  class InvoiceInfo(BaseModel):
29
44
  invoice_number: str = Field(description="The main invoice identifier")
30
45
  total_amount: float = Field(description="The final amount due")
31
46
  company_name: Optional[str] = Field(None, description="The name of the issuing company")
32
47
 
33
- # 2. Extract data (using default analysis_key="default-structured")
48
+ # Extract data
34
49
  page.extract(schema=InvoiceInfo, client=client)
35
50
 
36
- # 3. Access the results
37
51
  # Access the full result object
38
52
  full_data = page.extracted()
39
- print(full_data)
53
+ print(full_data)
40
54
 
41
55
  # Access a single field
42
56
  inv_num = page.extracted('invoice_number')
@@ -51,16 +65,23 @@ print(f"Invoice Number: {inv_num}")
51
65
 
52
66
  ```python
53
67
  # Extract using a specific key
54
- page.extract(InvoiceInfo, client, analysis_key="invoice_header")
68
+ page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
55
69
 
56
70
  # Access using the specific key
57
71
  header_data = page.extracted(analysis_key="invoice_header")
58
72
  company = page.extracted('company_name', analysis_key="invoice_header")
59
73
  ```
60
74
 
61
- ## Applying to Regions and Collections
75
+ ## Text vs vision
76
+
77
+ When sending a page (or a region, or any other element) to an LLM, you can choose either `using='text'` (default) or `using='vision'`.
78
+
79
+ - `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
80
+ - `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
81
+
82
+ ## Batch and bulk extraction
62
83
 
63
- The `.extract()` and `.extracted()` methods work identically on `Region` objects, allowing you to target specific areas of a page for structured data extraction.
84
+ If you have a lot of pages, a lot of PDFs, or a lot of anything, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF - regions, pages, collections of PDFs, etc. - giving you a lot of flexibility in what you analyze.
64
85
 
65
86
  ```python
66
87
  # Assuming 'header_region' is a Region object you defined
@@ -73,15 +94,16 @@ Furthermore, you can apply extraction to collections of elements (like `pdf.page
73
94
  ```python
74
95
  # Example: Extract InvoiceInfo from the first 5 pages
75
96
  results = pdf.pages[:5].apply(
76
- 'extract',
77
- schema=InvoiceInfo,
78
- client=client,
79
- analysis_key="page_invoice_info", # Use a specific key for batch results
80
- overwrite=True # Allow overwriting if run multiple times
97
+ lambda page: page.extract(
98
+ client=client,
99
+ schema=InvoiceInfo,
101
+ analysis_key="page_invoice_info",
102
+ )
81
103
  )
82
104
 
83
105
  # Access results for the first page in the collection
84
- first_page_company = results[0].extracted('company_name', analysis_key="page_invoice_info")
106
+ pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
85
107
  ```
86
108
 
87
109
  This provides a powerful way to turn unstructured PDF content into structured, usable data.
@@ -140,14 +140,14 @@ Categorize pages or specific regions based on their content using text or vision
140
140
 
141
141
  ```python
142
142
  # Classify a page based on text
143
- categories = ["invoice", "scientific article", "presentation"]
144
- page.classify(categories=categories, model="text")
143
+ labels = ["invoice", "scientific article", "presentation"]
144
+ page.classify(labels, using="text")
145
145
  print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
146
146
 
147
147
 
148
148
  # Classify a page based on what it looks like
149
- categories = ["invoice", "scientific article", "presentation"]
150
- page.classify(categories=categories, model="vision")
149
+ labels = ["invoice", "scientific article", "presentation"]
150
+ page.classify(labels, using="vision")
151
151
  print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
152
152
  ```
153
153