natural-pdf 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (257) hide show
  1. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/.gitignore +5 -1
  2. natural_pdf-0.1.3/PKG-INFO +137 -0
  3. natural_pdf-0.1.3/README.md +85 -0
  4. natural_pdf-0.1.3/docs/assets/sample-screen.png +0 -0
  5. natural_pdf-0.1.3/docs/index.md +170 -0
  6. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/installation/index.md +1 -2
  7. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/regions/index.ipynb +124 -158
  8. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/regions/index.md +4 -10
  9. natural_pdf-0.1.3/docs/tutorials/01-loading-and-extraction.ipynb +1658 -0
  10. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/02-finding-elements.ipynb +43 -47
  11. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/03-extracting-blocks.ipynb +18 -22
  12. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/04-table-extraction.ipynb +13 -17
  13. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/05-excluding-content.ipynb +66 -39
  14. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/05-excluding-content.md +13 -10
  15. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/06-document-qa.ipynb +29 -33
  16. natural_pdf-0.1.3/docs/tutorials/07-layout-analysis.ipynb +260 -0
  17. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-working-with-regions.ipynb +49 -53
  18. natural_pdf-0.1.3/docs/tutorials/08-spatial-navigation.ipynb +508 -0
  19. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/09-section-extraction.ipynb +98 -102
  20. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/10-form-field-extraction.ipynb +51 -55
  21. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/11-enhanced-table-processing.ipynb +7 -11
  22. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/12-ocr-integration.ipynb +173 -65
  23. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/12-ocr-integration.md +32 -0
  24. natural_pdf-0.1.3/docs/tutorials/13-semantic-search.ipynb +1908 -0
  25. natural_pdf-0.1.3/docs/tutorials/13-semantic-search.md +77 -0
  26. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/mkdocs.yml +2 -0
  27. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/__init__.py +33 -1
  28. natural_pdf-0.1.3/natural_pdf/analyzers/layout/layout_analyzer.py +255 -0
  29. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/layout_manager.py +9 -6
  30. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/layout_options.py +2 -4
  31. natural_pdf-0.1.3/natural_pdf/analyzers/layout/surya.py +259 -0
  32. natural_pdf-0.1.3/natural_pdf/collections/pdf_collection.py +259 -0
  33. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/page.py +97 -69
  34. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/pdf.py +382 -171
  35. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/region.py +55 -26
  36. natural_pdf-0.1.3/natural_pdf/exporters/__init__.py +1 -0
  37. natural_pdf-0.1.3/natural_pdf/exporters/searchable_pdf.py +252 -0
  38. natural_pdf-0.1.3/natural_pdf/search/__init__.py +94 -0
  39. natural_pdf-0.1.3/natural_pdf/search/haystack_search_service.py +520 -0
  40. natural_pdf-0.1.3/natural_pdf/search/haystack_utils.py +386 -0
  41. natural_pdf-0.1.3/natural_pdf/search/search_options.py +72 -0
  42. natural_pdf-0.1.3/natural_pdf/search/search_service_protocol.py +189 -0
  43. natural_pdf-0.1.3/natural_pdf/search/searchable_mixin.py +464 -0
  44. natural_pdf-0.1.3/natural_pdf.egg-info/PKG-INFO +137 -0
  45. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/SOURCES.txt +13 -0
  46. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/requires.txt +10 -0
  47. natural_pdf-0.1.3/notebooks/Examples.ipynb +1293 -0
  48. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pyproject.toml +12 -2
  49. natural_pdf-0.1.3/sample-screen.png +0 -0
  50. natural_pdf-0.1.1/PKG-INFO +0 -295
  51. natural_pdf-0.1.1/README.md +0 -252
  52. natural_pdf-0.1.1/docs/index.md +0 -299
  53. natural_pdf-0.1.1/docs/tutorials/01-loading-and-extraction.ipynb +0 -1137
  54. natural_pdf-0.1.1/docs/tutorials/07-layout-analysis.ipynb +0 -264
  55. natural_pdf-0.1.1/docs/tutorials/08-spatial-navigation.ipynb +0 -512
  56. natural_pdf-0.1.1/natural_pdf/analyzers/layout/layout_analyzer.py +0 -166
  57. natural_pdf-0.1.1/natural_pdf/analyzers/layout/surya.py +0 -151
  58. natural_pdf-0.1.1/natural_pdf.egg-info/PKG-INFO +0 -295
  59. natural_pdf-0.1.1/notebooks/Examples.ipynb +0 -1166
  60. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/.github/workflows/docs.yml +0 -0
  61. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/CLAUDE.md +0 -0
  62. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/LICENSE +0 -0
  63. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/MANIFEST.in +0 -0
  64. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/check_run_md.sh +0 -0
  65. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/api/index.md +0 -0
  66. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/favicon.png +0 -0
  67. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/favicon.svg +0 -0
  68. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/javascripts/custom.js +0 -0
  69. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/logo.svg +0 -0
  70. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/social-preview.png +0 -0
  71. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/social-preview.svg +0 -0
  72. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/assets/stylesheets/custom.css +0 -0
  73. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/document-qa/index.ipynb +0 -0
  74. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/document-qa/index.md +0 -0
  75. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/element-selection/index.ipynb +0 -0
  76. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/element-selection/index.md +0 -0
  77. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/interactive-widget/index.ipynb +0 -0
  78. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/interactive-widget/index.md +0 -0
  79. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/layout-analysis/index.ipynb +0 -0
  80. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/layout-analysis/index.md +0 -0
  81. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/ocr/index.md +0 -0
  82. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/pdf-navigation/index.ipynb +0 -0
  83. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/pdf-navigation/index.md +0 -0
  84. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tables/index.ipynb +0 -0
  85. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tables/index.md +0 -0
  86. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-analysis/index.ipynb +0 -0
  87. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-analysis/index.md +0 -0
  88. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-extraction/index.ipynb +0 -0
  89. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/text-extraction/index.md +0 -0
  90. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/01-loading-and-extraction.md +0 -0
  91. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/02-finding-elements.md +0 -0
  92. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/03-extracting-blocks.md +0 -0
  93. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/04-table-extraction.md +0 -0
  94. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/06-document-qa.md +0 -0
  95. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-layout-analysis.md +0 -0
  96. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/07-working-with-regions.md +0 -0
  97. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/08-spatial-navigation.md +0 -0
  98. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/09-section-extraction.md +0 -0
  99. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/10-form-field-extraction.md +0 -0
  100. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  101. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/index.ipynb +0 -0
  102. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/index.md +0 -0
  103. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/docs/visual-debugging/region.png +0 -0
  104. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/execute_notebooks.py +0 -0
  105. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/__init__.py +0 -0
  106. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/__init__.py +0 -0
  107. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/base.py +0 -0
  108. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/docling.py +0 -0
  109. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/paddle.py +0 -0
  110. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/tatr.py +0 -0
  111. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/layout/yolo.py +0 -0
  112. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/text_options.py +0 -0
  113. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/text_structure.py +0 -0
  114. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/analyzers/utils.py +0 -0
  115. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/__init__.py +0 -0
  116. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/element_manager.py +0 -0
  117. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/core/highlighting_service.py +0 -0
  118. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/__init__.py +0 -0
  119. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/base.py +0 -0
  120. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/collections.py +0 -0
  121. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/line.py +0 -0
  122. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/rect.py +0 -0
  123. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/elements/text.py +0 -0
  124. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/__init__.py +0 -0
  125. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine.py +0 -0
  126. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_easyocr.py +0 -0
  127. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_paddle.py +0 -0
  128. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/engine_surya.py +0 -0
  129. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/ocr_manager.py +0 -0
  130. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/ocr/ocr_options.py +0 -0
  131. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/qa/__init__.py +0 -0
  132. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/qa/document_qa.py +0 -0
  133. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/selectors/__init__.py +0 -0
  134. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/selectors/parser.py +0 -0
  135. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/templates/__init__.py +0 -0
  136. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/templates/ocr_debug.html +0 -0
  137. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/__init__.py +0 -0
  138. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/highlighting.py +0 -0
  139. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/reading_order.py +0 -0
  140. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/utils/visualization.py +0 -0
  141. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/__init__.py +0 -0
  142. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/frontend/viewer.js +0 -0
  143. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf/widgets/viewer.py +0 -0
  144. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/dependency_links.txt +0 -0
  145. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/natural_pdf.egg-info/top_level.txt +0 -0
  146. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/all_detected_regions.png +0 -0
  147. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/all_elements.png +0 -0
  148. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/basic_highlighting.png +0 -0
  149. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/chainable_layout.png +0 -0
  150. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/chained_analysis.png +0 -0
  151. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/color_names.png +0 -0
  152. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/color_names_with_boxes.png +0 -0
  153. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_highlight_all.png +0 -0
  154. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_highlight_layout.png +0 -0
  155. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/conf_display_layout_only.png +0 -0
  156. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/confidence_color_coded.png +0 -0
  157. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/debug_page_image.png +0 -0
  158. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/detected_table.png +0 -0
  159. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/dimension_analysis.txt +0 -0
  160. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/direct_ocr_debug.png +0 -0
  161. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_debug_input.png +0 -0
  162. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_results.png +0 -0
  163. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/easyocr_test_input.png +0 -0
  164. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/exclusion_optimization_regions.png +0 -0
  165. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/explicit_confidence_display.png +0 -0
  166. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/footer_overlap_test.png +0 -0
  167. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all.png +0 -0
  168. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_styles.png +0 -0
  169. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_all_layouts.png +0 -0
  170. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_attrs.png +0 -0
  171. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_all_with_yolo.png +0 -0
  172. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_by_confidence.png +0 -0
  173. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_1.png +0 -0
  174. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_2.png +0 -0
  175. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_3.png +0 -0
  176. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_color_test_4.png +0 -0
  177. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_layout_method.png +0 -0
  178. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_multiple.png +0 -0
  179. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_no_attrs.png +0 -0
  180. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_region.png +0 -0
  181. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_single.png +0 -0
  182. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types.png +0 -0
  183. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types_with_boxes.png +0 -0
  184. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_specific_types_with_tables.png +0 -0
  185. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test.png +0 -0
  186. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_colors.png +0 -0
  187. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual.png +0 -0
  188. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_annotated.png +0 -0
  189. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_structure.png +0 -0
  190. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_structure_yolo.png +0 -0
  191. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_test_individual_with_tables.png +0 -0
  192. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/highlight_with_attrs.png +0 -0
  193. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_conf_default.png +0 -0
  194. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_conf_high.png +0 -0
  195. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_detection.png +0 -0
  196. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test.png +0 -0
  197. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test2.png +0 -0
  198. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test3.png +0 -0
  199. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/layout_fix_test4.png +0 -0
  200. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/model_comparison.png +0 -0
  201. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/multiple_attributes_display.png +0 -0
  202. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_confidence_visualization.png +0 -0
  203. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_debug.png +0 -0
  204. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_debug_page.html +0 -0
  205. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlight_all_test.png +0 -0
  206. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlight_test.png +0 -0
  207. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_highlighted.png +0 -0
  208. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_simplified.png +0 -0
  209. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_threshold_comparison.png +0 -0
  210. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_clean.png +0 -0
  211. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_highlights.png +0 -0
  212. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/ocr_visualization_text.png +0 -0
  213. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_detection.png +0 -0
  214. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_polygons.png +0 -0
  215. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_sources.png +0 -0
  216. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_with_text.png +0 -0
  217. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddle_layout_without_text.png +0 -0
  218. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_highlights.png +0 -0
  219. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_results.png +0 -0
  220. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/paddleocr_test_input.png +0 -0
  221. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/page_1_for_ocr.png +0 -0
  222. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/page_4_for_ocr.png +0 -0
  223. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_exclusion_test.png +0 -0
  224. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_management_test.png +0 -0
  225. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_cropped.png +0 -0
  226. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_debug.png +0 -0
  227. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_full_page.png +0 -0
  228. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/region_ocr_highlighted.png +0 -0
  229. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/spatial_navigation.png +0 -0
  230. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/standard_highlight_all.png +0 -0
  231. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_no_ocr.csv +0 -0
  232. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_structure.png +0 -0
  233. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_structure_detail.png +0 -0
  234. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/table_with_ocr.csv +0 -0
  235. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_cells_test.png +0 -0
  236. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_ocr_table_test.png +0 -0
  237. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_regions.png +0 -0
  238. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/tatr_regions.txt +0 -0
  239. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/text_styles.png +0 -0
  240. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/titles_only.png +0 -0
  241. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_1200px.png +0 -0
  242. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_800px.png +0 -0
  243. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_default.png +0 -0
  244. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/width_with_scale.png +0 -0
  245. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/yolo_regions.png +0 -0
  246. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/output/yolo_regions.txt +0 -0
  247. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/.gitkeep +0 -0
  248. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/01-practice.pdf +0 -0
  249. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/0500000US42001.pdf +0 -0
  250. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/0500000US42007.pdf +0 -0
  251. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/2014 Statistics.pdf +0 -0
  252. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/2019 Statistics.pdf +0 -0
  253. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  254. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/pdfs/needs-ocr.pdf +0 -0
  255. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/publish.sh +0 -0
  256. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/run_all_tutorials.sh +0 -0
  257. {natural_pdf-0.1.1 → natural_pdf-0.1.3}/setup.cfg +0 -0
@@ -5,7 +5,11 @@ docs/tutorials/pdfs
5
5
  install.sh
6
6
  notebooks/Examples.md
7
7
  transcript.md
8
-
8
+ natural_pdf_index
9
+ results
10
+ docs/tutorials/needs-ocr-searchable.pdf
11
+ sample.py
12
+ sample2.py
9
13
 
10
14
  # Created by https://www.toptal.com/developers/gitignore/api/python,macos,visualstudiocode,jupyternotebooks
11
15
  # Edit at https://www.toptal.com/developers/gitignore?templates=python,macos,visualstudiocode,jupyternotebooks
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: natural-pdf
3
+ Version: 0.1.3
4
+ Summary: A more intuitive interface for working with PDFs
5
+ Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/jsoma/natural-pdf
8
+ Project-URL: Repository, https://github.com/jsoma/natural-pdf
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: pdfplumber>=0.7.0
15
+ Requires-Dist: Pillow>=8.0.0
16
+ Requires-Dist: colour>=0.1.5
17
+ Requires-Dist: numpy>=1.20.0
18
+ Requires-Dist: urllib3>=1.26.0
19
+ Requires-Dist: torch>=2.0.0
20
+ Requires-Dist: torchvision>=0.15.0
21
+ Requires-Dist: transformers>=4.30.0
22
+ Requires-Dist: huggingface_hub>=0.19.0
23
+ Requires-Dist: ocrmypdf>=16.0.0
24
+ Requires-Dist: pikepdf>=10.0.0
25
+ Provides-Extra: interactive
26
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
27
+ Provides-Extra: haystack
28
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "haystack"
29
+ Requires-Dist: chroma-haystack; extra == "haystack"
30
+ Requires-Dist: sentence-transformers; extra == "haystack"
31
+ Provides-Extra: easyocr
32
+ Requires-Dist: easyocr; extra == "easyocr"
33
+ Provides-Extra: paddle
34
+ Requires-Dist: paddlepaddle; extra == "paddle"
35
+ Requires-Dist: paddleocr; extra == "paddle"
36
+ Provides-Extra: layout-yolo
37
+ Requires-Dist: doclayout_yolo; extra == "layout-yolo"
38
+ Provides-Extra: surya
39
+ Requires-Dist: surya-ocr; extra == "surya"
40
+ Provides-Extra: qa
41
+ Provides-Extra: all
42
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "all"
43
+ Requires-Dist: easyocr; extra == "all"
44
+ Requires-Dist: paddlepaddle; extra == "all"
45
+ Requires-Dist: paddleocr; extra == "all"
46
+ Requires-Dist: doclayout_yolo; extra == "all"
47
+ Requires-Dist: surya-ocr; extra == "all"
48
+ Requires-Dist: haystack-ai>=2.0.0b5; extra == "all"
49
+ Requires-Dist: chroma-haystack; extra == "all"
50
+ Requires-Dist: sentence-transformers; extra == "all"
51
+ Dynamic: license-file
52
+
53
+ # Natural PDF
54
+
55
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
56
+
57
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
58
+
59
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
60
+ - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
61
+
62
+ <div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
63
+
64
+ ## Installation
65
+
66
+ ```bash
67
+ pip install natural-pdf
68
+ ```
69
+
70
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
71
+
72
+ ```bash
73
+ # Example: Install with OCR engine support (EasyOCR, Surya, or PaddleOCR)
74
+ pip install natural-pdf[easyocr]
75
+ pip install natural-pdf[surya]
76
+ pip install natural-pdf[paddle]
77
+
78
+ # Example: Install with interactive viewer support
79
+ pip install natural-pdf[interactive]
80
+
81
+ # Example: Install with semantic search support (Haystack)
82
+ pip install natural-pdf[haystack]
83
+
84
+ # Install everything
85
+ pip install natural-pdf[all]
86
+ ```
87
+
88
+ See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
89
+
90
+ ## Quick Start
91
+
92
+ ```python
93
+ from natural_pdf import PDF
94
+
95
+ # Open a PDF
96
+ pdf = PDF('document.pdf')
97
+ page = pdf.pages[0]
98
+
99
+ # Find elements using CSS-like selectors
100
+ heading = page.find('text:contains("Summary"):bold')
101
+
102
+ # Extract content below the heading
103
+ content = heading.below().extract_text()
104
+ print("Content below Summary:", content[:100] + "...")
105
+
106
+ # Exclude headers/footers automatically (example)
107
+ # You might define these based on common text or position
108
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
109
+ page.add_exclusion(page.find_all('line')[-1].below())
110
+
111
+ # Extract clean text from the page
112
+ clean_text = page.extract_text()
113
+ print("\nClean page text:", clean_text[:200] + "...")
114
+
115
+ # Highlight the heading and view the page
116
+ heading.highlight(color='red')
117
+ page.to_image()
118
+ ```
119
+
120
+ As a fun bonus, `page.viewer()` gives you an interactive way to explore the PDF.
121
+
122
+ ## Key Features
123
+
124
+ Natural PDF offers a range of features for working with PDFs:
125
+
126
+ * **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
127
+ * **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
128
+ * **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
129
+ * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
130
+ * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
131
+ * **Document QA:** Ask natural language questions about your document's content.
132
+ * **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
133
+ * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
134
+
135
+ ## Learn More
136
+
137
+ Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
@@ -0,0 +1,85 @@
1
+ # Natural PDF
2
+
3
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
4
+
5
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
+
7
+ - [Complete documentation here](https://jsoma.github.io/natural-pdf)
8
+ - [Live demos here](https://colab.research.google.com/github/jsoma/natural-pdf/)
9
+
10
+ <div style="max-width: 400px; margin: auto"><a href="sample-screen.png"><img src="sample-screen.png"></a></div>
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install natural-pdf
16
+ ```
17
+
18
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
19
+
20
+ ```bash
21
+ # Example: Install with OCR engine support (EasyOCR, Surya, or PaddleOCR)
22
+ pip install natural-pdf[easyocr]
23
+ pip install natural-pdf[surya]
24
+ pip install natural-pdf[paddle]
25
+
26
+ # Example: Install with interactive viewer support
27
+ pip install natural-pdf[interactive]
28
+
29
+ # Example: Install with semantic search support (Haystack)
30
+ pip install natural-pdf[haystack]
31
+
32
+ # Install everything
33
+ pip install natural-pdf[all]
34
+ ```
35
+
36
+ See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
37
+
38
+ ## Quick Start
39
+
40
+ ```python
41
+ from natural_pdf import PDF
42
+
43
+ # Open a PDF
44
+ pdf = PDF('document.pdf')
45
+ page = pdf.pages[0]
46
+
47
+ # Find elements using CSS-like selectors
48
+ heading = page.find('text:contains("Summary"):bold')
49
+
50
+ # Extract content below the heading
51
+ content = heading.below().extract_text()
52
+ print("Content below Summary:", content[:100] + "...")
53
+
54
+ # Exclude headers/footers automatically (example)
55
+ # You might define these based on common text or position
56
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
57
+ page.add_exclusion(page.find_all('line')[-1].below())
58
+
59
+ # Extract clean text from the page
60
+ clean_text = page.extract_text()
61
+ print("\nClean page text:", clean_text[:200] + "...")
62
+
63
+ # Highlight the heading and view the page
64
+ heading.highlight(color='red')
65
+ page.to_image()
66
+ ```
67
+
68
+ As a fun bonus, `page.viewer()` gives you an interactive way to explore the PDF.
69
+
70
+ ## Key Features
71
+
72
+ Natural PDF offers a range of features for working with PDFs:
73
+
74
+ * **CSS-like Selectors:** Find elements using intuitive query strings (`page.find('text:bold')`).
75
+ * **Spatial Navigation:** Select content relative to other elements (`heading.below()`, `element.select_until(...)`).
76
+ * **Text & Table Extraction:** Get clean text or structured table data, automatically handling exclusions.
77
+ * **OCR Integration:** Extract text from scanned documents using engines like EasyOCR, PaddleOCR, or Surya.
78
+ * **Layout Analysis:** Detect document structures (titles, paragraphs, tables) using AI models.
79
+ * **Document QA:** Ask natural language questions about your document's content.
80
+ * **Semantic Search:** Index PDFs and find relevant pages or documents based on semantic meaning using Haystack.
81
+ * **Visual Debugging:** Highlight elements and use an interactive viewer or save images to understand your selections.
82
+
83
+ ## Learn More
84
+
85
+ Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
@@ -0,0 +1,170 @@
1
+ # Natural PDF
2
+
3
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
4
+
5
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
+
7
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
8
+
9
+ <div style="max-width: 400px; margin: auto"><a href="assets/sample-screen.png"><img src="assets/sample-screen.png"></a></div>
10
+
11
+ ## Installation
12
+
13
+ ```
14
+ pip install natural_pdf
15
+ # All the extras
16
+ pip install "natural_pdf[all]"
17
+ ```
18
+
19
+ ## Quick Example
20
+
21
+ ```python
22
+ from natural_pdf import PDF
23
+
24
+ pdf = PDF('document.pdf')
25
+ page = pdf.pages[0]
26
+
27
+ # Find the title and get content below it
28
+ title = page.find('text:contains("Summary"):bold')
29
+ content = title.below().extract_text()
30
+
31
+ # Exclude everything above 'CONFIDENTIAL' and below the last line element on the page
32
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
33
+ page.add_exclusion(page.find_all('line')[-1].below())
34
+
35
+ # Get the clean text without header/footer
36
+ clean_text = page.extract_text()
37
+ ```
38
+
39
+ ## Key Features
40
+
41
+ Here are a few highlights of what you can do:
42
+
43
+ ### Find Elements with Selectors
44
+
45
+ Use CSS-like selectors to find text, shapes, and more.
46
+
47
+ ```python
48
+ # Find bold text containing "Revenue"
49
+ page.find('text:contains("Revenue"):bold').extract_text()
50
+
51
+ # Find all large text
52
+ page.find_all('text[size>=12]').extract_text()
53
+ ```
54
+
55
+ [Learn more about selectors →](element-selection/index.ipynb)
56
+
57
+ ### Navigate Spatially
58
+
59
+ Move around the page relative to elements, not just coordinates.
60
+
61
+ ```python
62
+ # Extract text below a specific heading
63
+ intro_text = page.find('text:contains("Introduction")').below().extract_text()
64
+
65
+ # Extract text from one heading to the next
66
+ methods_text = page.find('text:contains("Methods")').below(
67
+ until='text:contains("Results")'
68
+ ).extract_text()
69
+ ```
70
+
71
+ [Explore more navigation methods →](pdf-navigation/index.ipynb)
72
+
73
+ ### Extract Clean Text
74
+
75
+ Easily extract text content while skipping page elements such as headers and footers that you have marked as exclusions.
76
+
77
+ ```python
78
+ # Extract all text from the page (respecting exclusions)
79
+ page_text = page.extract_text()
80
+
81
+ # Extract text from a specific region
82
+ some_region = page.find(...)
83
+ region_text = some_region.extract_text()
84
+ ```
85
+
86
+ [Learn about text extraction →](text-extraction/index.ipynb)
87
+ [Learn about exclusion zones →](regions/index.ipynb#exclusion-zones)
88
+
89
+ ### Apply OCR
90
+
91
+ Extract text from scanned documents using various OCR engines.
92
+
93
+ ```python
94
+ # Apply OCR using the default engine
95
+ ocr_elements = page.apply_ocr()
96
+
97
+ # Extract text (will use OCR results if available)
98
+ text = page.extract_text()
99
+ ```
100
+
101
+ [Explore OCR options →](ocr/index.md)
102
+
103
+ ### Analyze Document Layout
104
+
105
+ Use AI models to detect document structures like titles, paragraphs, and tables.
106
+
107
+ ```python
108
+ # Detect document structure
109
+ page.analyze_layout()
110
+
111
+ # Highlight titles and tables
112
+ page.find_all('region[type=title]').highlight(color="purple")
113
+ page.find_all('region[type=table]').highlight(color="blue")
114
+
115
+ # Extract data from the first table
116
+ table_data = page.find('region[type=table]').extract_table()
117
+ ```
118
+
119
+ [Learn about layout models →](layout-analysis/index.ipynb)
120
+ [Working with tables? →](tables/index.ipynb)
121
+
122
+ ### Document Question Answering
123
+
124
+ Ask your documents questions in natural language.
125
+
126
+ ```python
127
+ # Ask a question
128
+ result = pdf.ask("What was the company's revenue in 2022?")
129
+ if result.get("found", False):
130
+ print(f"Answer: {result['answer']}")
131
+ ```
132
+
133
+ [Learn about Document QA →](document-qa/index.ipynb)
134
+
135
+ ### Visualize Your Work
136
+
137
+ Debug and understand your extractions visually.
138
+
139
+ ```python
140
+ # Highlight headings
141
+ page.find_all('text[size>=14]').highlight(color="red", label="Headings")
142
+
143
+ # Launch the interactive viewer (Jupyter)
144
+ # Requires: pip install "natural-pdf[interactive]"
145
+ page.viewer()
146
+
147
+ # Or save an image
148
+ # page.save_image("highlighted.png")
149
+ ```
150
+
151
+ [See more visualization options →](visual-debugging/index.ipynb)
152
+
153
+ ## Documentation Topics
154
+
155
+ Choose what you want to learn about:
156
+
157
+ ### Task-based Guides
158
+ - [Getting Started](installation/index.md): Install the library and run your first extraction
159
+ - [PDF Navigation](pdf-navigation/index.ipynb): Open PDFs and work with pages
160
+ - [Element Selection](element-selection/index.ipynb): Find text and other elements using selectors
161
+ - [Text Extraction](text-extraction/index.ipynb): Extract clean text from documents
162
+ - [Regions](regions/index.ipynb): Work with specific areas of a page
163
+ - [Visual Debugging](visual-debugging/index.ipynb): See what you're extracting
164
+ - [OCR](ocr/index.md): Extract text from scanned documents
165
+ - [Layout Analysis](layout-analysis/index.ipynb): Detect document structure
166
+ - [Tables](tables/index.ipynb): Extract tabular data
167
+ - [Document QA](document-qa/index.ipynb): Ask questions about your documents
168
+
169
+ ### Reference
170
+ - [API Reference](api/index.md): Complete library reference
@@ -57,8 +57,7 @@ print(text)
57
57
 
58
58
  # Find something specific
59
59
  title = page.find('text:bold')
60
- if title:
61
- print(f"Found title: {title.text}")
60
+ print(f"Found title: {title.text}")
62
61
  ```
63
62
 
64
63
  ## What's Next?