natural-pdf 0.1.11__tar.gz → 0.1.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. natural_pdf-0.1.12/.github/workflows/test.yml +55 -0
  2. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/01-execute_notebooks.py +2 -2
  3. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/MANIFEST.in +3 -0
  4. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/PKG-INFO +54 -49
  5. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/README.md +29 -29
  6. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/check_run_md.sh +2 -2
  7. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/categorizing-documents/index.md +1 -1
  8. natural_pdf-0.1.12/docs/element-selection/index.ipynb +1111 -0
  9. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/element-selection/index.md +31 -0
  10. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/index.md +3 -3
  11. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/installation/index.md +32 -14
  12. natural_pdf-0.1.12/docs/loops-and-groups/index.ipynb +476 -0
  13. natural_pdf-0.1.12/docs/loops-and-groups/index.md +84 -0
  14. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/text-extraction/index.ipynb +234 -220
  15. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/text-extraction/index.md +2 -2
  16. natural_pdf-0.1.12/docs/tutorials/01-loading-and-extraction.ipynb +2980 -0
  17. natural_pdf-0.1.12/docs/tutorials/02-finding-elements.ipynb +352 -0
  18. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/02-finding-elements.md +1 -1
  19. natural_pdf-0.1.12/docs/tutorials/03-extracting-blocks.ipynb +159 -0
  20. natural_pdf-0.1.12/docs/tutorials/04-table-extraction.ipynb +209 -0
  21. natural_pdf-0.1.12/docs/tutorials/05-excluding-content.ipynb +8402 -0
  22. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/06-document-qa.ipynb +45 -31
  23. natural_pdf-0.1.12/docs/tutorials/07-layout-analysis.ipynb +262 -0
  24. natural_pdf-0.1.12/docs/tutorials/07-working-with-regions.ipynb +477 -0
  25. natural_pdf-0.1.12/docs/tutorials/08-spatial-navigation.ipynb +520 -0
  26. natural_pdf-0.1.12/docs/tutorials/09-section-extraction.ipynb +2474 -0
  27. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/09-section-extraction.md +1 -1
  28. natural_pdf-0.1.12/docs/tutorials/10-form-field-extraction.ipynb +496 -0
  29. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/11-enhanced-table-processing.ipynb +9 -9
  30. natural_pdf-0.1.12/docs/tutorials/12-ocr-integration.ipynb +3448 -0
  31. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/12-ocr-integration.md +1 -1
  32. natural_pdf-0.1.12/docs/tutorials/13-semantic-search.ipynb +706 -0
  33. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/13-semantic-search.md +4 -3
  34. natural_pdf-0.1.12/docs/tutorials/14-categorizing-documents.ipynb +2142 -0
  35. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/14-categorizing-documents.md +21 -29
  36. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/mkdocs.yml +2 -0
  37. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/__init__.py +7 -2
  38. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/text_options.py +9 -1
  39. natural_pdf-0.1.12/natural_pdf/analyzers/text_structure.py +627 -0
  40. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/classification/manager.py +1 -1
  41. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/core/element_manager.py +11 -1
  42. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/core/highlighting_service.py +120 -40
  43. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/core/page.py +4 -2
  44. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/core/pdf.py +53 -38
  45. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/base.py +17 -0
  46. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/collections.py +203 -59
  47. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/region.py +43 -11
  48. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/hocr.py +40 -61
  49. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/hocr_font.py +7 -13
  50. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/original_pdf.py +10 -13
  51. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/searchable_pdf.py +0 -10
  52. natural_pdf-0.1.12/natural_pdf/search/__init__.py +99 -0
  53. natural_pdf-0.1.12/natural_pdf/search/lancedb_search_service.py +325 -0
  54. natural_pdf-0.1.12/natural_pdf/search/numpy_search_service.py +255 -0
  55. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/search/searchable_mixin.py +25 -71
  56. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/widgets/viewer.py +22 -31
  57. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf.egg-info/PKG-INFO +54 -49
  58. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf.egg-info/SOURCES.txt +16 -4
  59. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf.egg-info/requires.txt +25 -21
  60. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf.egg-info/top_level.txt +0 -1
  61. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/noxfile.py +39 -8
  62. natural_pdf-0.1.12/pdfs/.gitkeep +0 -0
  63. natural_pdf-0.1.12/pdfs/geometry.pdf +0 -0
  64. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pyproject.toml +36 -27
  65. natural_pdf-0.1.12/tests/conftest.py +140 -0
  66. natural_pdf-0.1.12/tests/test_core/test_containment_geometry.py +26 -0
  67. natural_pdf-0.1.12/tests/test_core/test_elements.py +169 -0
  68. natural_pdf-0.1.12/tests/test_core/test_loading.py +86 -0
  69. natural_pdf-0.1.12/tests/test_core/test_spatial.py +201 -0
  70. natural_pdf-0.1.12/tests/test_core/test_text_extraction.py +118 -0
  71. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/tests/test_optional_deps.py +1 -66
  72. natural_pdf-0.1.12/uv.lock +56 -0
  73. natural_pdf-0.1.11/docs/element-selection/index.ipynb +0 -957
  74. natural_pdf-0.1.11/docs/tutorials/01-loading-and-extraction.ipynb +0 -1628
  75. natural_pdf-0.1.11/docs/tutorials/02-finding-elements.ipynb +0 -374
  76. natural_pdf-0.1.11/docs/tutorials/03-extracting-blocks.ipynb +0 -152
  77. natural_pdf-0.1.11/docs/tutorials/04-table-extraction.ipynb +0 -195
  78. natural_pdf-0.1.11/docs/tutorials/05-excluding-content.ipynb +0 -275
  79. natural_pdf-0.1.11/docs/tutorials/07-layout-analysis.ipynb +0 -269
  80. natural_pdf-0.1.11/docs/tutorials/07-working-with-regions.ipynb +0 -470
  81. natural_pdf-0.1.11/docs/tutorials/08-spatial-navigation.ipynb +0 -513
  82. natural_pdf-0.1.11/docs/tutorials/09-section-extraction.ipynb +0 -2439
  83. natural_pdf-0.1.11/docs/tutorials/10-form-field-extraction.ipynb +0 -503
  84. natural_pdf-0.1.11/docs/tutorials/12-ocr-integration.ipynb +0 -3556
  85. natural_pdf-0.1.11/docs/tutorials/13-semantic-search.ipynb +0 -1411
  86. natural_pdf-0.1.11/docs/tutorials/14-categorizing-documents.ipynb +0 -2399
  87. natural_pdf-0.1.11/natural_pdf/analyzers/text_structure.py +0 -314
  88. natural_pdf-0.1.11/natural_pdf/search/__init__.py +0 -86
  89. natural_pdf-0.1.11/natural_pdf/search/haystack_search_service.py +0 -687
  90. natural_pdf-0.1.11/natural_pdf/search/haystack_utils.py +0 -474
  91. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/analysis_framework.mdc +0 -0
  92. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/coding-style.mdc +0 -0
  93. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  94. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/minimal-comments.mdc +0 -0
  95. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  96. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  97. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.github/workflows/docs.yml +0 -0
  98. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/.gitignore +0 -0
  99. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/02-run_all_tutorials.sh +0 -0
  100. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/CLAUDE.md +0 -0
  101. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/LICENSE +0 -0
  102. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/audit_packaging.py +0 -0
  103. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/api/index.md +0 -0
  104. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/favicon.png +0 -0
  105. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/favicon.svg +0 -0
  106. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/javascripts/custom.js +0 -0
  107. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/logo.svg +0 -0
  108. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/sample-screen.png +0 -0
  109. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/social-preview.png +0 -0
  110. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/social-preview.svg +0 -0
  111. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/assets/stylesheets/custom.css +0 -0
  112. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/data-extraction/index.md +0 -0
  113. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/document-qa/index.ipynb +0 -0
  114. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/document-qa/index.md +0 -0
  115. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/finetuning/index.md +0 -0
  116. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/interactive-widget/index.ipynb +0 -0
  117. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/interactive-widget/index.md +0 -0
  118. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/layout-analysis/index.ipynb +0 -0
  119. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/layout-analysis/index.md +0 -0
  120. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/ocr/index.md +0 -0
  121. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/pdf-navigation/index.ipynb +0 -0
  122. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/pdf-navigation/index.md +0 -0
  123. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/regions/index.ipynb +0 -0
  124. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/regions/index.md +0 -0
  125. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tables/index.ipynb +0 -0
  126. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tables/index.md +0 -0
  127. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/text-analysis/index.ipynb +0 -0
  128. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/text-analysis/index.md +0 -0
  129. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/01-loading-and-extraction.md +0 -0
  130. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/03-extracting-blocks.md +0 -0
  131. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/04-table-extraction.md +0 -0
  132. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/05-excluding-content.md +0 -0
  133. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/06-document-qa.md +0 -0
  134. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/07-layout-analysis.md +0 -0
  135. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/07-working-with-regions.md +0 -0
  136. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/08-spatial-navigation.md +0 -0
  137. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/10-form-field-extraction.md +0 -0
  138. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  139. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/visual-debugging/index.ipynb +0 -0
  140. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/visual-debugging/index.md +0 -0
  141. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/docs/visual-debugging/region.png +0 -0
  142. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/__init__.py +0 -0
  143. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/__init__.py +0 -0
  144. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/base.py +0 -0
  145. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/docling.py +0 -0
  146. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/gemini.py +0 -0
  147. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  148. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  149. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  150. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/paddle.py +0 -0
  151. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  152. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/surya.py +0 -0
  153. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/tatr.py +0 -0
  154. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/layout/yolo.py +0 -0
  155. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/analyzers/utils.py +0 -0
  156. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/classification/mixin.py +0 -0
  157. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/classification/results.py +0 -0
  158. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/collections/mixins.py +0 -0
  159. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/collections/pdf_collection.py +0 -0
  160. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/core/__init__.py +0 -0
  161. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/__init__.py +0 -0
  162. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/line.py +0 -0
  163. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/rect.py +0 -0
  164. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/elements/text.py +0 -0
  165. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/export/mixin.py +0 -0
  166. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/__init__.py +0 -0
  167. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/base.py +0 -0
  168. /natural_pdf-0.1.11/pdfs/.gitkeep → /natural_pdf-0.1.12/natural_pdf/exporters/data/__init__.py +0 -0
  169. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/data/pdf.ttf +0 -0
  170. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/data/sRGB.icc +0 -0
  171. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/exporters/paddleocr.py +0 -0
  172. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/extraction/manager.py +0 -0
  173. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/extraction/mixin.py +0 -0
  174. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/extraction/result.py +0 -0
  175. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/__init__.py +0 -0
  176. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/engine.py +0 -0
  177. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/engine_doctr.py +0 -0
  178. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/engine_easyocr.py +0 -0
  179. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/engine_paddle.py +0 -0
  180. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/engine_surya.py +0 -0
  181. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/ocr_factory.py +0 -0
  182. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/ocr_manager.py +0 -0
  183. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/ocr_options.py +0 -0
  184. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/ocr/utils.py +0 -0
  185. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/qa/__init__.py +0 -0
  186. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/qa/document_qa.py +0 -0
  187. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/search/search_options.py +0 -0
  188. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/search/search_service_protocol.py +0 -0
  189. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/selectors/__init__.py +0 -0
  190. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/selectors/parser.py +0 -0
  191. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/__init__.py +0 -0
  192. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  193. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/spa/css/style.css +0 -0
  194. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/spa/index.html +0 -0
  195. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/spa/js/app.js +0 -0
  196. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/templates/spa/words.txt +0 -0
  197. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/__init__.py +0 -0
  198. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/debug.py +0 -0
  199. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/highlighting.py +0 -0
  200. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/identifiers.py +0 -0
  201. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/locks.py +0 -0
  202. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/packaging.py +0 -0
  203. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/reading_order.py +0 -0
  204. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/text_extraction.py +0 -0
  205. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/tqdm_utils.py +0 -0
  206. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/utils/visualization.py +0 -0
  207. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/widgets/__init__.py +0 -0
  208. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf/widgets/frontend/viewer.js +0 -0
  209. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/natural_pdf.egg-info/dependency_links.txt +0 -0
  210. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/01-practice.pdf +0 -0
  211. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/0500000US42001.pdf +0 -0
  212. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/0500000US42007.pdf +0 -0
  213. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/2014 Statistics.pdf +0 -0
  214. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/2019 Statistics.pdf +0 -0
  215. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  216. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/cia-doc.pdf +0 -0
  217. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/pdfs/needs-ocr.pdf +0 -0
  218. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/publish.sh +0 -0
  219. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/sample-screen.png +0 -0
  220. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/setup.cfg +0 -0
  221. {natural_pdf-0.1.11 → natural_pdf-0.1.12}/tests/exporters/test_paddleocr_exporter.py +0 -0
  222. /natural_pdf-0.1.11/tests/test_loading.py → /natural_pdf-0.1.12/tests/test_loading_original.py +0 -0
@@ -0,0 +1,55 @@
1
+ name: Python Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ['3.9', '3.10', '3.11']
15
+
16
+ steps:
17
+ - uses: actions/checkout@v3
18
+ with:
19
+ lfs: true # Enable Git LFS if you're using it for PDFs
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@v4
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+ cache: 'pip'
26
+
27
+ - name: Install dependencies
28
+ run: |
29
+ python -m pip install --upgrade pip
30
+ python -m pip install nox nox-uv
31
+ python -m pip install -e .[dev]
32
+
33
+ - name: Test with nox
34
+ run: |
35
+ nox -s test_core
36
+ nox -s test_favorites
37
+
38
+ lint:
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - uses: actions/checkout@v3
42
+
43
+ - name: Set up Python
44
+ uses: actions/setup-python@v4
45
+ with:
46
+ python-version: '3.11'
47
+ cache: 'pip'
48
+
49
+ - name: Install dependencies
50
+ run: |
51
+ python -m pip install --upgrade pip
52
+ python -m pip install nox nox-uv
53
+
54
+ - name: Lint with nox
55
+ run: nox -s lint
@@ -30,7 +30,7 @@ EXCLUDE_PATTERNS = [
30
30
  "finetuning/index.md",
31
31
  "categorizing-documents/index.md",
32
32
  "data-extraction/index.md",
33
- "*.ipynb_checkpoints*"
33
+ "*.ipynb_checkpoints*",
34
34
  ]
35
35
  MAX_WORKERS = os.cpu_count()
36
36
 
@@ -178,7 +178,7 @@ def process_notebook(md_file_path_str: str, log_level: int) -> Dict[str, Any]:
178
178
  client = NotebookClient(
179
179
  notebook,
180
180
  timeout=600,
181
- kernel_name="natural-pdf",
181
+ kernel_name="natural-pdf-project-venv",
182
182
  resources={"metadata": {"path": str(cwd)}},
183
183
  )
184
184
  client.execute() # Modifies 'notebook' object
@@ -4,6 +4,9 @@ include LICENSE
4
4
  # HTML templates
5
5
  recursive-include natural_pdf/templates *.html
6
6
 
7
+ # Data files
8
+ recursive-include natural_pdf/exporters/data *
9
+
7
10
  # Documentation assets
8
11
  recursive-include docs *.md *.png *.jpg *.gif
9
12
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -12,20 +12,16 @@ Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pdfplumber
15
- Requires-Dist: Pillow
15
+ Requires-Dist: pillow
16
16
  Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
19
  Requires-Dist: tqdm
20
20
  Requires-Dist: pydantic
21
- Provides-Extra: interactive
22
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
23
- Provides-Extra: haystack
24
- Requires-Dist: haystack-ai; extra == "haystack"
25
- Requires-Dist: lancedb-haystack; extra == "haystack"
26
- Requires-Dist: lancedb; extra == "haystack"
27
- Requires-Dist: sentence-transformers; extra == "haystack"
28
- Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
21
+ Requires-Dist: jenkspy
22
+ Requires-Dist: pikepdf>=9.7.0
23
+ Provides-Extra: viewer
24
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "viewer"
29
25
  Provides-Extra: easyocr
30
26
  Requires-Dist: easyocr; extra == "easyocr"
31
27
  Requires-Dist: natural-pdf[core-ml]; extra == "easyocr"
@@ -41,19 +37,25 @@ Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
37
  Provides-Extra: doctr
42
38
  Requires-Dist: python-doctr[torch]; extra == "doctr"
43
39
  Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
44
- Provides-Extra: qa
45
- Requires-Dist: natural-pdf[core-ml]; extra == "qa"
46
40
  Provides-Extra: docling
47
41
  Requires-Dist: docling; extra == "docling"
48
42
  Requires-Dist: natural-pdf[core-ml]; extra == "docling"
49
43
  Provides-Extra: llm
50
44
  Requires-Dist: openai>=1.0; extra == "llm"
51
- Provides-Extra: classification
52
- Requires-Dist: sentence-transformers; extra == "classification"
53
- Requires-Dist: timm; extra == "classification"
54
- Requires-Dist: natural-pdf[core-ml]; extra == "classification"
55
45
  Provides-Extra: test
56
46
  Requires-Dist: pytest; extra == "test"
47
+ Provides-Extra: search
48
+ Requires-Dist: lancedb; extra == "search"
49
+ Requires-Dist: pyarrow; extra == "search"
50
+ Provides-Extra: favorites
51
+ Requires-Dist: natural-pdf[deskew]; extra == "favorites"
52
+ Requires-Dist: natural-pdf[llm]; extra == "favorites"
53
+ Requires-Dist: natural-pdf[surya]; extra == "favorites"
54
+ Requires-Dist: natural-pdf[easyocr]; extra == "favorites"
55
+ Requires-Dist: natural-pdf[layout_yolo]; extra == "favorites"
56
+ Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
57
+ Requires-Dist: natural-pdf[viewer]; extra == "favorites"
58
+ Requires-Dist: natural-pdf[search]; extra == "favorites"
57
59
  Provides-Extra: dev
58
60
  Requires-Dist: black; extra == "dev"
59
61
  Requires-Dist: isort; extra == "dev"
@@ -67,29 +69,32 @@ Requires-Dist: pipdeptree; extra == "dev"
67
69
  Requires-Dist: nbformat; extra == "dev"
68
70
  Requires-Dist: jupytext; extra == "dev"
69
71
  Requires-Dist: nbclient; extra == "dev"
72
+ Requires-Dist: ipykernel; extra == "dev"
70
73
  Provides-Extra: deskew
71
74
  Requires-Dist: deskew>=1.5; extra == "deskew"
72
75
  Requires-Dist: img2pdf; extra == "deskew"
73
76
  Provides-Extra: all
74
- Requires-Dist: natural-pdf[interactive]; extra == "all"
75
- Requires-Dist: natural-pdf[haystack]; extra == "all"
77
+ Requires-Dist: natural-pdf[viewer]; extra == "all"
76
78
  Requires-Dist: natural-pdf[easyocr]; extra == "all"
77
79
  Requires-Dist: natural-pdf[paddle]; extra == "all"
78
80
  Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
79
81
  Requires-Dist: natural-pdf[surya]; extra == "all"
80
82
  Requires-Dist: natural-pdf[doctr]; extra == "all"
81
- Requires-Dist: natural-pdf[qa]; extra == "all"
82
83
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
83
84
  Requires-Dist: natural-pdf[docling]; extra == "all"
84
85
  Requires-Dist: natural-pdf[llm]; extra == "all"
85
- Requires-Dist: natural-pdf[classification]; extra == "all"
86
+ Requires-Dist: natural-pdf[core-ml]; extra == "all"
86
87
  Requires-Dist: natural-pdf[deskew]; extra == "all"
87
88
  Requires-Dist: natural-pdf[test]; extra == "all"
89
+ Requires-Dist: natural-pdf[search]; extra == "all"
88
90
  Provides-Extra: core-ml
89
91
  Requires-Dist: torch; extra == "core-ml"
90
92
  Requires-Dist: torchvision; extra == "core-ml"
91
93
  Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
92
94
  Requires-Dist: huggingface_hub; extra == "core-ml"
95
+ Requires-Dist: sentence-transformers; extra == "core-ml"
96
+ Requires-Dist: numpy; extra == "core-ml"
97
+ Requires-Dist: timm; extra == "core-ml"
93
98
  Provides-Extra: ocr-export
94
99
  Requires-Dist: pikepdf; extra == "ocr-export"
95
100
  Provides-Extra: export-extras
@@ -114,26 +119,11 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
114
119
  pip install natural-pdf
115
120
  ```
116
121
 
117
- For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
122
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install one to two million different extras. If you just want the greatest hits:
118
123
 
119
124
  ```bash
120
- # Example: Install with EasyOCR support
121
- pip install natural-pdf[easyocr]
122
- pip install natural-pdf[surya]
123
- pip install natural-pdf[paddle]
124
-
125
- # Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
126
- pip install natural-pdf[llm]
127
- # (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
128
-
129
- # Example: Install with interactive viewer support
130
- pip install natural-pdf[interactive]
131
-
132
- # Example: Install with semantic search support (Haystack)
133
- pip install natural-pdf[haystack]
134
-
135
- # Install everything
136
- pip install natural-pdf[all]
125
+ # deskewing, OCR (surya) + layout analysis (yolo), interactive browsing
126
+ pip install natural-pdf[favorites]
137
127
  ```
138
128
 
139
129
  See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
@@ -147,25 +137,26 @@ from natural_pdf import PDF
147
137
  pdf = PDF('document.pdf')
148
138
  page = pdf.pages[0]
149
139
 
140
+ # Extract all of the text on the page
141
+ page.extract_text()
142
+
150
143
  # Find elements using CSS-like selectors
151
144
  heading = page.find('text:contains("Summary"):bold')
152
145
 
153
146
  # Extract content below the heading
154
147
  content = heading.below().extract_text()
155
- print("Content below Summary:", content[:100] + "...")
156
148
 
157
- # Exclude headers/footers automatically (example)
158
- # You might define these based on common text or position
159
- page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
160
- page.add_exclusion(page.find_all('line')[-1].below())
149
+ # Examine all the bold text on the page
150
+ page.find_all('text:bold').show()
161
151
 
162
- # Extract clean text from the page
163
- clean_text = page.extract_text()
164
- print("\nClean page text:", clean_text[:200] + "...")
152
+ # Exclude parts of the page from selectors/extractors
153
+ header = page.find('text:contains("CONFIDENTIAL")').above()
154
+ footer = page.find_all('line')[-1].below()
155
+ page.add_exclusion(header)
156
+ page.add_exclusion(footer)
165
157
 
166
- # Highlight the heading and view the page
167
- heading.highlight(color='red')
168
- page.to_image()
158
+ # Extract clean text from the page ignoring exclusions
159
+ clean_text = page.extract_text()
169
160
  ```
170
161
 
171
162
  And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
@@ -186,3 +177,17 @@ Natural PDF offers a range of features for working with PDFs:
186
177
  ## Learn More
187
178
 
188
179
  Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
180
+
181
+ ## Best friends
182
+
183
+ Natural PDF sits on top of a *lot* of fantastic tools and models, some of which are:
184
+
185
+ - [pdfplumber](https://github.com/jsvine/pdfplumber)
186
+ - [EasyOCR](https://www.jaided.ai/easyocr/)
187
+ - [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
188
+ - [Surya](https://github.com/VikParuchuri/surya)
189
+ - A specific [YOLO](https://github.com/opendatalab/DocLayout-YOLO)
190
+ - [deskew](https://github.com/sbrunner/deskew)
191
+ - [doctr](https://github.com/mindee/doctr)
192
+ - [docling](https://github.com/docling-project/docling)
193
+ - [Hugging Face](https://huggingface.co/models)
@@ -15,26 +15,11 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
15
15
  pip install natural-pdf
16
16
  ```
17
17
 
18
- For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
18
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install one to two million different extras. If you just want the greatest hits:
19
19
 
20
20
  ```bash
21
- # Example: Install with EasyOCR support
22
- pip install natural-pdf[easyocr]
23
- pip install natural-pdf[surya]
24
- pip install natural-pdf[paddle]
25
-
26
- # Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
27
- pip install natural-pdf[llm]
28
- # (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
29
-
30
- # Example: Install with interactive viewer support
31
- pip install natural-pdf[interactive]
32
-
33
- # Example: Install with semantic search support (Haystack)
34
- pip install natural-pdf[haystack]
35
-
36
- # Install everything
37
- pip install natural-pdf[all]
21
+ # deskewing, OCR (surya) + layout analysis (yolo), interactive browsing
22
+ pip install natural-pdf[favorites]
38
23
  ```
39
24
 
40
25
  See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
@@ -48,25 +33,26 @@ from natural_pdf import PDF
48
33
  pdf = PDF('document.pdf')
49
34
  page = pdf.pages[0]
50
35
 
36
+ # Extract all of the text on the page
37
+ page.extract_text()
38
+
51
39
  # Find elements using CSS-like selectors
52
40
  heading = page.find('text:contains("Summary"):bold')
53
41
 
54
42
  # Extract content below the heading
55
43
  content = heading.below().extract_text()
56
- print("Content below Summary:", content[:100] + "...")
57
44
 
58
- # Exclude headers/footers automatically (example)
59
- # You might define these based on common text or position
60
- page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
61
- page.add_exclusion(page.find_all('line')[-1].below())
45
+ # Examine all the bold text on the page
46
+ page.find_all('text:bold').show()
62
47
 
63
- # Extract clean text from the page
64
- clean_text = page.extract_text()
65
- print("\nClean page text:", clean_text[:200] + "...")
48
+ # Exclude parts of the page from selectors/extractors
49
+ header = page.find('text:contains("CONFIDENTIAL")').above()
50
+ footer = page.find_all('line')[-1].below()
51
+ page.add_exclusion(header)
52
+ page.add_exclusion(footer)
66
53
 
67
- # Highlight the heading and view the page
68
- heading.highlight(color='red')
69
- page.to_image()
54
+ # Extract clean text from the page ignoring exclusions
55
+ clean_text = page.extract_text()
70
56
  ```
71
57
 
72
58
  And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
@@ -87,3 +73,17 @@ Natural PDF offers a range of features for working with PDFs:
87
73
  ## Learn More
88
74
 
89
75
  Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
76
+
77
+ ## Best friends
78
+
79
+ Natural PDF sits on top of a *lot* of fantastic tools and models, some of which are:
80
+
81
+ - [pdfplumber](https://github.com/jsvine/pdfplumber)
82
+ - [EasyOCR](https://www.jaided.ai/easyocr/)
83
+ - [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
84
+ - [Surya](https://github.com/VikParuchuri/surya)
85
+ - A specific [YOLO](https://github.com/opendatalab/DocLayout-YOLO)
86
+ - [deskew](https://github.com/sbrunner/deskew)
87
+ - [doctr](https://github.com/mindee/doctr)
88
+ - [docling](https://github.com/docling-project/docling)
89
+ - [Hugging Face](https://huggingface.co/models)
@@ -9,7 +9,7 @@ fi
9
9
 
10
10
  MARKDOWN_FILE=$1
11
11
  NOTEBOOK_FILE="${MARKDOWN_FILE%.md}.ipynb"
12
- KERNEL_NAME="natural-pdf"
12
+ KERNEL_NAME="natural-pdf-project-venv"
13
13
 
14
14
  echo "Converting $MARKDOWN_FILE to notebook..."
15
15
  # Jupytext will now automatically add tags based on markdown metadata
@@ -29,6 +29,6 @@ EOF
29
29
 
30
30
 
31
31
  echo "Executing notebook $NOTEBOOK_FILE..."
32
- jupyter execute "$NOTEBOOK_FILE" --inplace --ExecutePreprocessor.kernel_name=natural-pdf || { echo "Execution failed"; exit 1; }
32
+ jupyter execute "$NOTEBOOK_FILE" --inplace --ExecutePreprocessor.kernel_name=natural-pdf-project-venv || { echo "Execution failed"; exit 1; }
33
33
 
34
34
  echo "Success! Notebook executed and results saved to $NOTEBOOK_FILE"
@@ -7,7 +7,7 @@ Natural PDF allows you to automatically categorize pages or specific regions wit
7
7
  To use the classification features, you need to install the optional dependencies:
8
8
 
9
9
  ```bash
10
- pip install "natural-pdf[classification]"
10
+ pip install "natural-pdf[core-ml]"
11
11
  ```
12
12
 
13
13
  This installs necessary libraries like `torch`, `transformers`, and others.