natural-pdf 0.2.16__tar.gz → 0.2.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (379)
  1. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.gitignore +1 -0
  2. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/CLAUDE.md +13 -0
  3. {natural_pdf-0.2.16/natural_pdf.egg-info → natural_pdf-0.2.17}/PKG-INFO +1 -1
  4. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/element-selection/index.md +110 -0
  5. natural_pdf-0.2.17/docs/guide_adjustment_stream.md +90 -0
  6. natural_pdf-0.2.17/docs/guides_boundary_columns.md +156 -0
  7. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/quick-reference/index.md +60 -13
  8. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/regions/index.md +65 -1
  9. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/05-excluding-content.md +104 -8
  10. natural_pdf-0.2.17/docs/tutorials/08-spatial-navigation.md +449 -0
  11. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/__init__.py +45 -0
  12. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/guides.py +359 -0
  13. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/element_manager.py +4 -0
  14. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/page.py +88 -22
  15. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/page_collection.py +75 -0
  16. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/pdf.py +33 -0
  17. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/describe/base.py +48 -7
  18. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/base.py +408 -43
  19. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/element_collection.py +83 -10
  20. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/region.py +217 -178
  21. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/text.py +5 -3
  22. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/flows/element.py +1 -0
  23. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/flows/flow.py +175 -480
  24. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/flows/region.py +76 -0
  25. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/selectors/parser.py +180 -9
  26. natural_pdf-0.2.17/natural_pdf/utils/pdfminer_patches.py +136 -0
  27. natural_pdf-0.2.17/natural_pdf/utils/sections.py +346 -0
  28. natural_pdf-0.2.17/natural_pdf/utils/spatial.py +169 -0
  29. {natural_pdf-0.2.16 → natural_pdf-0.2.17/natural_pdf.egg-info}/PKG-INFO +1 -1
  30. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf.egg-info/SOURCES.txt +22 -1
  31. natural_pdf-0.2.17/tests/test_aggregate_selectors.py +249 -0
  32. natural_pdf-0.2.17/tests/test_auto_multipage_option.py +63 -0
  33. natural_pdf-0.2.17/tests/test_exclude_multi_page.py +101 -0
  34. natural_pdf-0.2.17/tests/test_exclude_real_pdf.py +98 -0
  35. natural_pdf-0.2.17/tests/test_expand_enhanced.py +206 -0
  36. natural_pdf-0.2.17/tests/test_guide_adjustment_stream.py +121 -0
  37. natural_pdf-0.2.17/tests/test_guides_boundaries.py +266 -0
  38. natural_pdf-0.2.17/tests/test_guides_from_headers.py +143 -0
  39. natural_pdf-0.2.17/tests/test_guides_partial.py +110 -0
  40. natural_pdf-0.2.17/tests/test_highlight_color_falsy.py +54 -0
  41. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_detection_comprehensive.py +3 -2
  42. natural_pdf-0.2.17/tests/test_include_boundaries_fix.py +127 -0
  43. natural_pdf-0.2.17/tests/test_merged_flowregion_specs.py +152 -0
  44. natural_pdf-0.2.17/tests/test_mixed_collection_rendering.py +96 -0
  45. natural_pdf-0.2.17/tests/test_multipage_directional.py +159 -0
  46. natural_pdf-0.2.17/tests/test_pdfminer_bug_status.py +32 -0
  47. natural_pdf-0.2.17/tests/test_pdfminer_color_bug.py +65 -0
  48. natural_pdf-0.2.17/tests/test_pdfminer_color_stack_bug.py +70 -0
  49. natural_pdf-0.2.17/tests/test_smart_exclusion.py +122 -0
  50. natural_pdf-0.2.16/docs/tutorials/08-spatial-navigation.md +0 -237
  51. natural_pdf-0.2.16/tests/test_include_boundaries_debug.py +0 -67
  52. natural_pdf-0.2.16/tests/test_include_boundaries_fix.py +0 -126
  53. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/analysis_framework.mdc +0 -0
  54. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/coding-style.mdc +0 -0
  55. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  56. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/minimal-comments.mdc +0 -0
  57. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  58. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  59. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.github/workflows/ci.yml +0 -0
  60. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.github/workflows/docs.yml +0 -0
  61. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.github/workflows/nightly-tutorials.yml +0 -0
  62. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/.pre-commit-config.yaml +0 -0
  63. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/01-execute_notebooks.py +0 -0
  64. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/02-run_all_tutorials.sh +0 -0
  65. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/LICENSE +0 -0
  66. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/MANIFEST.in +0 -0
  67. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/README.md +0 -0
  68. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/audit_packaging.py +0 -0
  69. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/check_run_md.sh +0 -0
  70. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/api/index.md +0 -0
  71. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/favicon.png +0 -0
  72. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/favicon.svg +0 -0
  73. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/javascripts/custom.js +0 -0
  74. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/logo.svg +0 -0
  75. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/sample-screen.png +0 -0
  76. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/social-preview.png +0 -0
  77. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/social-preview.svg +0 -0
  78. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/assets/stylesheets/custom.css +0 -0
  79. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/categorizing-documents/index.md +0 -0
  80. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/data-extraction/index.md +0 -0
  81. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/describe/index.md +0 -0
  82. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/document-qa/index.md +0 -0
  83. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/extracting-clean-text/index.md +0 -0
  84. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/finetuning/index.md +0 -0
  85. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/fix-messy-tables/index.md +0 -0
  86. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_1.csv +0 -0
  87. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_2.csv +0 -0
  88. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/fix-messy-tables/table_3.csv +0 -0
  89. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/index.md +0 -0
  90. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/installation/index.md +0 -0
  91. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/interactive-widget/index.md +0 -0
  92. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/layout-analysis/index.md +0 -0
  93. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/loops-and-groups/index.md +0 -0
  94. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/ocr/index.md +0 -0
  95. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/pdf-navigation/index.md +0 -0
  96. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  97. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/process-forms-and-invoices/index.md +0 -0
  98. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/reflowing-pages/index.md +0 -0
  99. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tables/index.md +0 -0
  100. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/text-analysis/index.md +0 -0
  101. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/01-loading-and-extraction.md +0 -0
  102. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/02-finding-elements.md +0 -0
  103. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/03-extracting-blocks.md +0 -0
  104. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/04-table-extraction.md +0 -0
  105. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/06-document-qa.md +0 -0
  106. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/07-layout-analysis.md +0 -0
  107. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/07-working-with-regions.md +0 -0
  108. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/09-section-extraction.md +0 -0
  109. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/10-form-field-extraction.md +0 -0
  110. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  111. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/12-ocr-integration.md +0 -0
  112. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/13-semantic-search.md +0 -0
  113. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/tutorials/14-categorizing-documents.md +0 -0
  114. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/visual-debugging/index.md +0 -0
  115. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/docs/visual-debugging/region.png +0 -0
  116. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/mkdocs.yml +0 -0
  117. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/__init__.py +0 -0
  118. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/__init__.py +0 -0
  119. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/base.py +0 -0
  120. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/docling.py +0 -0
  121. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/gemini.py +0 -0
  122. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  123. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  124. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  125. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/paddle.py +0 -0
  126. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  127. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/surya.py +0 -0
  128. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  129. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/tatr.py +0 -0
  130. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/layout/yolo.py +0 -0
  131. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  132. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/text_options.py +0 -0
  133. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/text_structure.py +0 -0
  134. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/analyzers/utils.py +0 -0
  135. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/classification/manager.py +0 -0
  136. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/classification/mixin.py +0 -0
  137. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/classification/results.py +0 -0
  138. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/cli.py +0 -0
  139. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/collections/mixins.py +0 -0
  140. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/__init__.py +0 -0
  141. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/highlighting_service.py +0 -0
  142. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/page_groupby.py +0 -0
  143. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/pdf_collection.py +0 -0
  144. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/core/render_spec.py +0 -0
  145. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/describe/__init__.py +0 -0
  146. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/describe/elements.py +0 -0
  147. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/describe/mixin.py +0 -0
  148. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/describe/summary.py +0 -0
  149. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/__init__.py +0 -0
  150. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/image.py +0 -0
  151. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/line.py +0 -0
  152. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/elements/rect.py +0 -0
  153. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/export/mixin.py +0 -0
  154. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/__init__.py +0 -0
  155. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/base.py +0 -0
  156. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/data/__init__.py +0 -0
  157. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/data/pdf.ttf +0 -0
  158. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/data/sRGB.icc +0 -0
  159. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/hocr.py +0 -0
  160. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/hocr_font.py +0 -0
  161. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/original_pdf.py +0 -0
  162. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/paddleocr.py +0 -0
  163. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/exporters/searchable_pdf.py +0 -0
  164. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/extraction/manager.py +0 -0
  165. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/extraction/mixin.py +0 -0
  166. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/extraction/result.py +0 -0
  167. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/flows/__init__.py +0 -0
  168. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/flows/collections.py +0 -0
  169. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/__init__.py +0 -0
  170. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/engine.py +0 -0
  171. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_doctr.py +0 -0
  172. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_easyocr.py +0 -0
  173. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_paddle.py +0 -0
  174. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/engine_surya.py +0 -0
  175. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_factory.py +0 -0
  176. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_manager.py +0 -0
  177. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/ocr_options.py +0 -0
  178. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/ocr/utils.py +0 -0
  179. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/qa/__init__.py +0 -0
  180. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/qa/document_qa.py +0 -0
  181. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/qa/qa_result.py +0 -0
  182. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/__init__.py +0 -0
  183. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/lancedb_search_service.py +0 -0
  184. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/numpy_search_service.py +0 -0
  185. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/search_options.py +0 -0
  186. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/search_service_protocol.py +0 -0
  187. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/search/searchable_mixin.py +0 -0
  188. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/selectors/__init__.py +0 -0
  189. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/tables/__init__.py +0 -0
  190. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/tables/result.py +0 -0
  191. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/__init__.py +0 -0
  192. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  193. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/spa/css/style.css +0 -0
  194. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/spa/index.html +0 -0
  195. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/spa/js/app.js +0 -0
  196. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/templates/spa/words.txt +0 -0
  197. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/text_mixin.py +0 -0
  198. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/__init__.py +0 -0
  199. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/bidi_mirror.py +0 -0
  200. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/color_utils.py +0 -0
  201. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/debug.py +0 -0
  202. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/highlighting.py +0 -0
  203. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/identifiers.py +0 -0
  204. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/layout.py +0 -0
  205. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/locks.py +0 -0
  206. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/packaging.py +0 -0
  207. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/reading_order.py +0 -0
  208. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/text_extraction.py +0 -0
  209. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/utils/visualization.py +0 -0
  210. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/vision/__init__.py +0 -0
  211. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/vision/mixin.py +0 -0
  212. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/vision/results.py +0 -0
  213. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/vision/similarity.py +0 -0
  214. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/vision/template_matching.py +0 -0
  215. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/widgets/__init__.py +0 -0
  216. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf/widgets/viewer.py +0 -0
  217. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf.egg-info/dependency_links.txt +0 -0
  218. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf.egg-info/entry_points.txt +0 -0
  219. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf.egg-info/requires.txt +0 -0
  220. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/natural_pdf.egg-info/top_level.txt +0 -0
  221. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/noxfile.py +0 -0
  222. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/memory_comparison.py +0 -0
  223. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/pdf_analyzer.py +0 -0
  224. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/performance_analysis.py +0 -0
  225. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  226. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  227. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  228. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  229. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/test_cleanup_methods.py +0 -0
  230. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/optimization/test_memory_fix.py +0 -0
  231. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/publish.sh +0 -0
  232. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/pyproject.toml +0 -0
  233. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/sample-screen.png +0 -0
  234. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/setup.cfg +0 -0
  235. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/fix_page_exclusions.py +0 -0
  236. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_draw_guides.py +0 -0
  237. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_draw_guides_interactive.py +0 -0
  238. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_exclusion_with_debug.py +0 -0
  239. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_find_exclusions_fix.py +0 -0
  240. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
  241. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_fix_real_pdf.py +0 -0
  242. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_fix_working.py +0 -0
  243. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_fixed_pdf_exclusions.py +0 -0
  244. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_guide_draw_notebook.py +0 -0
  245. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_horizontal_top_bottom.py +0 -0
  246. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_inline_js.py +0 -0
  247. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_marker_order.py +0 -0
  248. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_original_exclusions_now_work.py +0 -0
  249. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_pdf_exclusions_with_guides.py +0 -0
  250. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_region_exclusions_detailed.py +0 -0
  251. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_stripes_real_pdf.py +0 -0
  252. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_vertical_stripes.py +0 -0
  253. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_widget_functionality.py +0 -0
  254. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/temp/test_widget_simple.py +0 -0
  255. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/conftest.py +0 -0
  256. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/exporters/test_paddleocr_exporter.py +0 -0
  257. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_annotate.py +0 -0
  258. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_arabic_performance.py +0 -0
  259. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_arabic_real_world.py +0 -0
  260. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_color_conversion.py +0 -0
  261. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_color_hex_display.py +0 -0
  262. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_containment_geometry.py +0 -0
  263. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_elements.py +0 -0
  264. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_loading.py +0 -0
  265. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_spatial.py +0 -0
  266. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_text_extraction.py +0 -0
  267. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_core/test_text_layer.py +0 -0
  268. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_crop_enhancements.py +0 -0
  269. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_crop_region_highlights.py +0 -0
  270. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_directional_defaults.py +0 -0
  271. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve.py +0 -0
  272. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve_cross_page_bug.py +0 -0
  273. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve_debug_issue.py +0 -0
  274. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve_real_world_issue.py +0 -0
  275. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve_single_elements.py +0 -0
  276. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  277. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_document_qa.py +0 -0
  278. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_addition.py +0 -0
  279. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_collection_guides.py +0 -0
  280. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_collection_show_cols.py +0 -0
  281. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_collection_slicing.py +0 -0
  282. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_exclusions.py +0 -0
  283. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_element_show_crop_highlights.py +0 -0
  284. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_empty_pseudo_class.py +0 -0
  285. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_exclusions.py +0 -0
  286. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_expand.py +0 -0
  287. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_extraction_error.py +0 -0
  288. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_extraction_mixin_fix.py +0 -0
  289. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_extraction_text_and_vision.py +0 -0
  290. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_extraction_working.py +0 -0
  291. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_find_similar.py +0 -0
  292. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_first_last_selectors.py +0 -0
  293. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_fix_get_sections_zero_height.py +0 -0
  294. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_flow_region_directional.py +0 -0
  295. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_get_sections_fix_comprehensive.py +0 -0
  296. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_get_sections_zero_height.py +0 -0
  297. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_groupby.py +0 -0
  298. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides.py +0 -0
  299. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_apply_exclusions.py +0 -0
  300. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_apply_exclusions_simple.py +0 -0
  301. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_extract_table.py +0 -0
  302. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_extract_table_collections.py +0 -0
  303. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_extract_table_exclusions.py +0 -0
  304. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_extract_table_real.py +0 -0
  305. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_from_stripes.py +0 -0
  306. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_integration.py +0 -0
  307. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_guides_marker_sorting.py +0 -0
  308. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_detection.py +0 -0
  309. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_offset.py +0 -0
  310. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_protocol.py +0 -0
  311. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_protocol_simple.py +0 -0
  312. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_highlight_regions.py +0 -0
  313. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_horizontal_guides_alignment.py +0 -0
  314. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_comprehensive.py +0 -0
  315. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_final.py +0 -0
  316. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_final_verification.py +0 -0
  317. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_mock.py +0 -0
  318. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_simple.py +0 -0
  319. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_types_pdf.py +0 -0
  320. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_verification.py +0 -0
  321. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_include_boundaries_with_real_text.py +0 -0
  322. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_loading_original.py +0 -0
  323. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_match_results_sorting.py +0 -0
  324. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_merge_connected.py +0 -0
  325. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_merge_connected_real_world.py +0 -0
  326. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_merge_method.py +0 -0
  327. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_multi_page_table_discovery.py +0 -0
  328. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_negative_bounds_pdf.py +0 -0
  329. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_optional_deps.py +0 -0
  330. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_page_exclusion_lists.py +0 -0
  331. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  332. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
  333. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_phash_masking.py +0 -0
  334. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_region_find_similar.py +0 -0
  335. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_region_show_crop_highlights.py +0 -0
  336. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_region_viewer.py +0 -0
  337. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_sections_end_only.py +0 -0
  338. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_sections_with_start_and_end.py +0 -0
  339. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_show_column_layout.py +0 -0
  340. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_show_edge_cases.py +0 -0
  341. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_show_exclusions.py +0 -0
  342. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_show_exclusions_feature.py +0 -0
  343. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_show_limit.py +0 -0
  344. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_skip_repeating_headers_multipage.py +0 -0
  345. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_slice_cache_reuse.py +0 -0
  346. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_slice_exclusion_fix.py +0 -0
  347. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_slice_exclusion_issue.py +0 -0
  348. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_slice_exclusion_mock.py +0 -0
  349. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_sliced_collection_exclusions.py +0 -0
  350. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_spatial_offset.py +0 -0
  351. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_strikethrough_detection.py +0 -0
  352. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_table_result_header_mismatch.py +0 -0
  353. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_table_result_keep_blank.py +0 -0
  354. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_template_matching.py +0 -0
  355. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_template_white_masking.py +0 -0
  356. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_tiny_text_tables.py +0 -0
  357. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_tiny_text_tables_table.py +0 -0
  358. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_tutorials.py +0 -0
  359. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_underline_detection.py +0 -0
  360. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tests/test_update_text.py +0 -0
  361. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/todo/bad_pdf_analysis.md +0 -0
  362. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/todo/evaluation.md +0 -0
  363. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  364. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  365. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  366. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/README.md +0 -0
  367. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/__init__.py +0 -0
  368. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/analyser.py +0 -0
  369. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  370. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  371. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/eval_suite.py +0 -0
  372. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  373. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  374. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  375. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  376. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  377. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/reporter.py +0 -0
  378. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/tools/bad_pdf_eval/utils.py +0 -0
  379. {natural_pdf-0.2.16 → natural_pdf-0.2.17}/uv.lock +0 -0
@@ -7,6 +7,7 @@ bad-pdfs/
7
7
  bad-pdfs-analysis/**/*.pdf
8
8
  output
9
9
  output.pdf
10
+ Demos.ipynb
10
11
  Untitled.ipynb
11
12
  conversation.md
12
13
  docs/tutorials/pdfs
@@ -61,6 +61,19 @@ Natural PDF is a Python library for intelligent PDF document processing that com
61
61
  - **Automatic conversion**: Elements from iterables are automatically converted to exclusion regions
62
62
  - **Backward compatibility**: Existing Region and callable exclusions continue to work unchanged
63
63
 
64
+ ### 6b. Multi-page Directional Navigation
65
+ - **multipage parameter**: Directional methods now accept `multipage=True` to span pages
66
+ - `element.below(until="text:contains('End')", multipage=True)` searches across pages
67
+ - Returns `FlowRegion` when spanning multiple pages, `Region` when on a single page
68
+ - Works with all directional methods: `.below()`, `.above()`, `.left()`, `.right()`
69
+ - **Global auto_multipage option**: Set default behavior for all directional navigation
70
+ - `npdf.set_option('layout.auto_multipage', True)` enables multipage by default
71
+ - Individual calls can override with `multipage=False`
72
+ - **Use cases**:
73
+ - Extract content between headers on different pages
74
+ - Find tables that span page boundaries
75
+ - Navigate document structure without manual page handling
76
+
64
77
  ### 7. Page Grouping with groupby()
65
78
  - **Simple grouping by selector text**: `pages.groupby('text[size=16]')` groups by header text
66
79
  - **Callable functions for complex logic**: `pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text())`
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.16
3
+ Version: 0.2.17
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -223,6 +223,116 @@ headings.extract_text()
223
223
 
224
224
  *Note: `.highest()`, `.lowest()`, etc. will complain if your collection spans multiple pages.*
225
225
 
226
+ ## Finding Elements with Statistical Properties
227
+
228
+ Sometimes you need to find elements based on how their attributes compare to everything else on the page - the leftmost text, the largest font, or the most common color. Natural PDF's aggregate selectors make this easy using statistical functions like `min()`, `max()`, `avg()`, and `mode()`.
229
+
230
+ ### Position-Based Selection
231
+
232
+ ```python
233
+ # Find the leftmost text element on the page
234
+ leftmost = page.find('text[x0=min()]')
235
+ leftmost.show()
236
+ ```
237
+
238
+ ```python
239
+ # Find the rightmost text (useful for page numbers)
240
+ rightmost = page.find('text[x1=max()]')
241
+ rightmost.show()
242
+ ```
243
+
244
+ ```python
245
+ # Find text at the top and bottom of the page
246
+ topmost = page.find('text[top=min()]')
247
+ bottommost = page.find('text[bottom=max()]')
248
+ ```
249
+
250
+ ### Size and Dimension Selection
251
+
252
+ ```python
253
+ # Find the largest text (often titles or headings)
254
+ largest_text = page.find('text[size=max()]')
255
+ print(f"Largest text: {largest_text.extract_text()} (size: {largest_text.size})")
256
+ ```
257
+
258
+ ```python
259
+ # Find elements with average or median dimensions
260
+ avg_width_text = page.find_all('text[width=avg()]')
261
+ median_height_text = page.find_all('text[height=median()]')
262
+ ```
263
+
264
+ ### Finding Most Common Values
265
+
266
+ The `mode()` function (or its alias `most_common()`) finds elements with the most frequently occurring value for any attribute:
267
+
268
+ ```python
269
+ # Find text with the most common font size (body text)
270
+ body_text = page.find_all('text[size=mode()]')
271
+ print(f"Most common font size: {body_text.first.size if body_text else 'N/A'}")
272
+ ```
273
+
274
+ ```python
275
+ # Find elements with the most common font name
276
+ common_font = page.find_all('text[fontname=most_common()]')
277
+ ```
278
+
279
+ ### Color Proximity Matching
280
+
281
+ For color attributes, you can find elements with colors closest to a target:
282
+
283
+ ```python
284
+ # Find text closest to red
285
+ red_text = page.find_all('text[color=closest("red")]')
286
+
287
+ # Find rectangles with fill color closest to blue
288
+ blue_rects = page.find_all('rect[fill=closest("#0000FF")]')
289
+
290
+ # Works with any color format
291
+ nearly_black = page.find_all('text[color=closest("rgb(10,10,10)")]')
292
+ ```
293
+
294
+ ### Combining Aggregate Conditions
295
+
296
+ Multiple aggregate conditions create an intersection - elements must satisfy ALL conditions:
297
+
298
+ ```python
299
+ # Find text that is both leftmost AND largest
300
+ special_text = page.find('text[x0=min()][size=max()]')
301
+
302
+ # Find the topmost element among large text
303
+ topmost_large = page.find('text[size>12][top=min()]')
304
+ ```
305
+
306
+ ### Using Aggregates in Complex Selectors
307
+
308
+ Aggregate functions work seamlessly with all Natural PDF features:
309
+
310
+ ```python
311
+ # In OR selectors - find either the leftmost text OR the largest rectangle
312
+ elements = page.find_all('text[x0=min()]|rect[width=max()]')
313
+
314
+ # With spatial navigation
315
+ element = page.find('text')
316
+ # Navigate right until reaching the leftmost element
317
+ right_region = element.right(until='text[x0=min()]')
318
+
319
+ # With filters - leftmost among bold text
320
+ leftmost_bold = page.find('text:bold[x0=min()]')
321
+ ```
322
+
323
+ ### Available Aggregate Functions
324
+
325
+ | Function | Alias | Description | Works On |
326
+ |----------|-------|-------------|----------|
327
+ | `min()` | - | Minimum value | Numeric attributes |
328
+ | `max()` | - | Maximum value | Numeric attributes |
329
+ | `avg()` | `mean()` | Average/mean value | Numeric attributes |
330
+ | `median()` | - | Median value | Numeric attributes |
331
+ | `mode()` | `most_common()` | Most frequent value | Any attribute |
332
+ | `closest(value)` | - | Closest match (colors only) | Color attributes |
333
+
334
+ **Note**: Aggregates are calculated across all elements of the same type. For example, `text[x0=min()]` finds the minimum x0 among ALL text elements, not just those matching other filters.
335
+
226
336
  ## Dealing with Weird Font Names
227
337
 
228
338
  PDFs sometimes have bizarre font names that don't look like normal fonts. Don't worry - they're usually normal fonts with weird internal names.
@@ -0,0 +1,90 @@
1
+ # Guide Adjustment for Stream Extraction
2
+
3
+ ## Overview
4
+
5
+ When using the `stream` extraction method (text-based edge detection) with explicit vertical guides, Natural PDF automatically adjusts guides that fall outside text bounds to ensure proper table extraction.
6
+
7
+ ## The Problem
8
+
9
+ In pdfplumber's stream method, horizontal edges are only created where text exists. If vertical guides are placed outside the horizontal extent of text (e.g., at x=0 when text starts at x=51.6), these guides won't intersect with horizontal edges, causing missing columns in the extracted table.
10
+
11
+ ## The Solution
12
+
13
+ Natural PDF automatically clips vertical guides to text bounds when:
14
+ 1. Using `method="stream"` or `horizontal_strategy="text"`
15
+ 2. Explicit vertical lines are provided
16
+ 3. Text elements exist in the region
17
+
18
+ ## Example
19
+
20
+ ```python
21
+ from natural_pdf import PDF
22
+ from natural_pdf.analyzers.guides import Guides
23
+
24
+ # Load PDF and find headers
25
+ pdf = PDF("document.pdf")
26
+ page = pdf[0]
27
+ headers = page.find_all("text[y<100]") # Find header row
28
+
29
+ # Create guides from headers
30
+ guides = Guides(page)
31
+ guides.vertical.from_headers(headers, margin=0)
32
+
33
+ # Guides might include page boundaries (0, page.width)
34
+ # which could be outside text bounds
35
+
36
+ # Extract table - guides are automatically adjusted
37
+ table = page.extract_table(method="stream", verticals=guides.vertical.data)
38
+
39
+ # All columns including first and last are properly extracted
40
+ ```
41
+
42
+ ## How It Works
43
+
44
+ 1. **Detection**: The adjustment is triggered when the stream method is used with explicit vertical guides
45
+ 2. **Text Bounds**: The system finds all text elements and determines their bounding box
46
+ 3. **Adjustment**:
47
+ - Guides left of text bounds are moved to the left edge of text
48
+ - Guides right of text bounds are moved to the right edge of text
49
+ - Guides within text bounds remain unchanged
50
+ 4. **Extraction**: The adjusted guides are used for table extraction
51
+
52
+ ## When This Applies
53
+
54
+ Guide adjustment happens when ALL of these conditions are met:
55
+ - Extraction method is `pdfplumber` (or its alias `stream`)
56
+ - `horizontal_strategy` is `"text"` (text-based edge detection)
57
+ - `vertical_strategy` is `"explicit"` (using provided guides)
58
+ - `explicit_vertical_lines` are provided in table settings
59
+
60
+ ## Debugging
61
+
62
+ Enable debug logging to see guide adjustments:
63
+
64
+ ```python
65
+ import logging
66
+ logging.basicConfig(level=logging.DEBUG)
67
+
68
+ # Extract table - will show adjustment messages
69
+ table = page.extract_table(method="stream", verticals=guides.vertical.data)
70
+ ```
71
+
72
+ Example debug output:
73
+ ```
74
+ Region (0, 0, 1224, 1584): Adjusted left guide from 0.0 to 51.6
75
+ Region (0, 0, 1224, 1584): Adjusted right guide from 1224.0 to 1155.7
76
+ Region (0, 0, 1224, 1584): Adjusted 26 guides for stream extraction. Text bounds: 51.6-1155.7
77
+ ```
78
+
79
+ ## Other Methods
80
+
81
+ This adjustment only applies to stream/text-based extraction. When using:
82
+ - `method="lattice"` (line-based): No adjustment, guides used as-is
83
+ - `method="tatr"` or `method="text"`: Different extraction methods, guides not used
84
+
85
+ ## Best Practices
86
+
87
+ 1. **Use from_headers()**: This method creates appropriate guides for your content
88
+ 2. **Set margin=0**: For tables that span the full width of text
89
+ 3. **Verify with lattice first**: If your PDF has visible lines, lattice method may work better
90
+ 4. **Check text bounds**: Use `page.find_all("text").merge().bbox` to see text extent
@@ -0,0 +1,156 @@
1
+ # Missing First/Last Columns in guides.extract_table()
2
+
3
+ ## Problem Description
4
+
5
+ When using `guides.extract_table()`, the first and last columns may be missing from the extracted table. This happens because the `Guides.from_lines()` method by default does not include the page boundaries (x=0 and x=page.width) as vertical guides.
6
+
7
+ ### Example of the Issue
8
+
9
+ ```python
10
+ # Default behavior - may miss boundary columns
11
+ guides = Guides.from_lines(page)
12
+ result = guides.extract_table()
13
+ # First column "OFFICER" and last column may be missing
14
+ ```
15
+
16
+ ## Root Cause
17
+
18
+ The `from_lines()` method detects lines in the PDF but doesn't automatically add guides at the page boundaries. If your table's first column starts at x=0 or the last column ends at x=page.width, and there are no explicit vertical lines at these positions, those columns won't have guides and will be excluded from extraction.
19
+
20
+ ## Solutions
21
+
22
+ ### Solution 1: Use the `outer` parameter (Recommended)
23
+
24
+ The simplest fix is to use the `outer=True` parameter when creating guides:
25
+
26
+ ```python
27
+ # Include outer boundaries when detecting lines
28
+ guides = Guides.from_lines(page, outer=True)
29
+ result = guides.extract_table()
30
+ ```
31
+
32
+ ### Solution 2: Use `include_outer_boundaries` in extract_table
33
+
34
+ If you've already created guides, you can include boundaries during extraction:
35
+
36
+ ```python
37
+ # Create guides normally
38
+ guides = Guides.from_lines(page)
39
+
40
+ # Include boundaries during extraction
41
+ result = guides.extract_table(include_outer_boundaries=True)
42
+ ```
43
+
44
+ ### Solution 3: Manually add boundary guides
45
+
46
+ For more control, you can manually add guides at the page boundaries:
47
+
48
+ ```python
49
+ # Create guides
50
+ guides = Guides.from_lines(page)
51
+
52
+ # Add page boundaries
53
+ guides.vertical.add([0, page.width])
54
+
55
+ # Extract table
56
+ result = guides.extract_table()
57
+ ```
58
+
59
+ ### Solution 4: Create guides from specific positions
60
+
61
+ If you know the exact column positions:
62
+
63
+ ```python
64
+ # Create guides with specific positions including boundaries
65
+ guides = Guides(page)
66
+ guides.vertical.add([0, 100, 200, 300, 400, page.width])
67
+ guides.horizontal.from_lines(page) # Get horizontal guides from lines
68
+
69
+ result = guides.extract_table()
70
+ ```
71
+
72
+ ## Best Practices
73
+
74
+ 1. **Always use `outer=True`** when you expect table content at page boundaries:
75
+ ```python
76
+ guides = Guides.from_lines(page, outer=True)
77
+ ```
78
+
79
+ 2. **Check your guides** before extraction:
80
+ ```python
81
+ guides = Guides.from_lines(page)
82
+ print(f"Vertical guides: {guides.vertical.data}")
83
+ print(f"Page width: {page.width}")
84
+
85
+ # Check if boundaries are included
86
+ has_left = 0 in guides.vertical.data
87
+ has_right = page.width in guides.vertical.data
88
+ ```
89
+
90
+ 3. **Visualize guides** to debug issues:
91
+ ```python
92
+ # Show the page with guides overlaid
93
+ guides.show()
94
+ ```
95
+
96
+ ## Complete Example
97
+
98
+ ```python
99
+ from natural_pdf import PDF
100
+ from natural_pdf.analyzers import Guides
101
+
102
+ # Load PDF
103
+ pdf = PDF("document.pdf")
104
+ page = pdf[0]
105
+
106
+ # Method 1: Best practice - use outer=True
107
+ guides = Guides.from_lines(page, outer=True)
108
+ table = guides.extract_table()
109
+ df = table.to_df()
110
+ print(df)
111
+
112
+ # Method 2: Alternative - use include_outer_boundaries
113
+ guides = Guides.from_lines(page)
114
+ table = guides.extract_table(include_outer_boundaries=True)
115
+ df = table.to_df()
116
+ print(df)
117
+
118
+ # Method 3: Manual control
119
+ guides = Guides.from_lines(page)
120
+ if 0 not in guides.vertical.data:
121
+ guides.vertical.add([0])
122
+ if page.width not in guides.vertical.data:
123
+ guides.vertical.add([page.width])
124
+ table = guides.extract_table()
125
+ df = table.to_df()
126
+ print(df)
127
+ ```
128
+
129
+ ## When This Issue Occurs
130
+
131
+ This issue typically occurs when:
132
+ - Tables are designed with no margins (content starts at x=0)
133
+ - Tables span the full page width
134
+ - PDF generators don't include explicit border lines at page edges
135
+ - Content is positioned exactly at page boundaries
136
+
137
+ ## Verification
138
+
139
+ To verify if this is your issue:
140
+
141
+ ```python
142
+ # Check text positions
143
+ texts = page.find_all('text')
144
+ min_x = min(t.x0 for t in texts)
145
+ max_x = max(t.x1 for t in texts)
146
+
147
+ print(f"Text spans from x={min_x} to x={max_x}")
148
+ print(f"Page width: {page.width}")
149
+
150
+ # Check guides
151
+ guides = Guides.from_lines(page)
152
+ print(f"First guide: {guides.vertical.data[0] if guides.vertical.data else 'None'}")
153
+ print(f"Last guide: {guides.vertical.data[-1] if guides.vertical.data else 'None'}")
154
+
155
+ # If min_x < first guide or max_x > last guide, you need boundaries
156
+ ```
@@ -74,6 +74,18 @@ page.find_all('text[source=pdf]') # Original PDF text
74
74
  page.find_all('text[confidence>=0.8]') # High-confidence OCR
75
75
  ```
76
76
 
77
+ ### Statistical Selectors (Aggregates)
78
+ ```py
79
+ page.find('text[x0=min()]') # Leftmost text
80
+ page.find('text[x1=max()]') # Rightmost text
81
+ page.find('text[size=max()]') # Largest text
82
+ page.find('text[width=avg()]') # Average width text
83
+ page.find('text[height=median()]') # Median height text
84
+ page.find('text[fontname=mode()]') # Most common font
85
+ page.find('text[color=closest("red")]') # Closest to red
86
+ page.find('text[x0=min()][size=max()]') # Leftmost AND largest
87
+ ```
88
+
77
89
  ## Essential Methods
78
90
 
79
91
  ### Finding Elements
@@ -86,10 +98,21 @@ element.previous() # Previous element
86
98
 
87
99
  ### Spatial Navigation
88
100
  ```py
89
- element.above(height=100) # Region above element
101
+ # Smart defaults (new in 0.9.0)
102
+ element.left() # Default height='element' (matches element height)
103
+ element.right() # Default height='element' (matches element height)
104
+ element.above() # Default width='full' (full page width)
105
+ element.below() # Default width='full' (full page width)
106
+
107
+ # Custom dimensions
108
+ element.above(height=100) # Fixed height above
90
109
  element.below(until='line:horizontal') # Below until boundary
91
- element.left(width=200) # Region to the left
92
- element.right() # Region to the right
110
+ element.left(width=200) # Fixed width to left
111
+ element.right(height='full') # Full page height to right
112
+
113
+ # Exclusion handling
114
+ element.below(apply_exclusions=True) # Skip exclusion zones
115
+ element.expand('down', 50, apply_exclusions=True) # Expand with exclusions
93
116
  ```
94
117
 
95
118
  ### Text Extraction
@@ -194,9 +217,16 @@ page.viewer() # Launch interactive viewer (Jup
194
217
 
195
218
  ### Page-Level Exclusions
196
219
  ```py
197
- header = page.find('text:contains("CONFIDENTIAL")').above()
198
- page.add_exclusion(header) # Exclude from extraction
199
- page.clear_exclusions() # Remove exclusions
220
+ # Smart exclusion behavior (new in 0.9.0)
221
+ text_element = page.find('text:contains("CONFIDENTIAL")')
222
+ page.add_exclusion(text_element) # Excludes just the text bounding box
223
+
224
+ # Traditional region exclusion
225
+ header_region = page.find('text:contains("CONFIDENTIAL")').above()
226
+ page.add_exclusion(header_region) # Excludes entire region
227
+
228
+ # Manage exclusions
229
+ page.clear_exclusions() # Remove all exclusions
200
230
  text = page.extract_text(use_exclusions=False) # Ignore exclusions
201
231
  ```
202
232
 
@@ -207,10 +237,27 @@ pdf.add_exclusion(
207
237
  lambda p: p.create_region(0, 0, p.width, p.height * 0.1),
208
238
  label="Header"
209
239
  )
240
+
241
+ # Exclude specific text elements (new in 0.9.0)
242
+ pdf.add_exclusion(
243
+ lambda p: p.find_all('text:contains("Header")'), # Returns ElementCollection
244
+ label="Headers"
245
+ )
210
246
  ```
211
247
 
212
248
  ## Configuration Options
213
249
 
250
+ ### Global Layout Settings
251
+ ```py
252
+ import natural_pdf
253
+
254
+ # Configure global directional offset (default: 5)
255
+ natural_pdf.options.layout.directional_offset = 10 # Larger gap for directional methods
256
+
257
+ # Reset to default
258
+ natural_pdf.options.layout.directional_offset = 5
259
+ ```
260
+
214
261
  ### OCR Engines
215
262
  ```py
216
263
  from natural_pdf.ocr import EasyOCROptions, PaddleOCROptions
@@ -231,17 +278,17 @@ page.analyze_layout(engine='yolo', options=yolo_opts)
231
278
 
232
279
  ### Extract Inspection Report Data
233
280
  ```py
234
- # Find violation count
235
- violations = page.find('text:contains("Violation Count"):right(width=100)')
281
+ # Find violation count (uses smart default height='element')
282
+ violations = page.find('text:contains("Violation Count"):right()')
236
283
 
237
284
  # Get inspection number from the header box (regex search)
238
285
  inspection_num = page.find('text:contains("INS-[A-Z0-9]+")', regex=True)
239
286
 
240
- # Extract inspection date
287
+ # Extract inspection date (custom width for wider field)
241
288
  inspection_date = page.find('text:contains("Date:"):right(width=150)')
242
289
 
243
- # Get site name (text to the right of "Site:")
244
- site_name = page.find('text:contains("Site:"):right(width=300)').extract_text()
290
+ # Get site name (uses smart default height='element')
291
+ site_name = page.find('text:contains("Site:"):right()').extract_text()
245
292
  ```
246
293
 
247
294
  ### Process Forms
@@ -250,9 +297,9 @@ site_name = page.find('text:contains("Site:"):right(width=300)').extract_text()
250
297
  page.add_exclusion(page.create_region(0, 0, page.width, 50))
251
298
  page.add_exclusion(page.create_region(0, page.height-50, page.width, page.height))
252
299
 
253
- # Extract form fields
300
+ # Extract form fields (smart defaults + exclusion handling)
254
301
  fields = page.find_all('text:bold')
255
- values = [field.right(width=300).extract_text() for field in fields]
302
+ values = [field.right(apply_exclusions=True).extract_text() for field in fields]
256
303
  ```
257
304
 
258
305
  ### Handle Scanned Documents
@@ -43,7 +43,12 @@ mid_region.show(color="blue")
43
43
 
44
44
  ### Using Element Methods: `above()`, `below()`, `left()`, `right()`
45
45
 
46
- You can create regions relative to existing elements.
46
+ You can create regions relative to existing elements. Natural PDF uses smart defaults for these directional methods:
47
+
48
+ - **`.left()` and `.right()`**: Default to `height='element'` (matches the element's height)
49
+ - **`.above()` and `.below()`**: Default to `width='full'` (full page width)
50
+
51
+ These defaults match common use cases - when looking sideways you usually want the same height as your reference element, while looking up/down typically needs the full page width.
47
52
 
48
53
  ```python
49
54
  # Find a heading-like element
@@ -51,6 +56,7 @@ heading = page.find('text[size>=12]:bold')
51
56
 
52
57
  # Create a region below this heading element
53
58
  if heading:
59
+ # Uses default width='full' - extends across full page width
54
60
  region_below = heading.below()
55
61
 
56
62
  # Highlight the heading and the region below it
@@ -60,6 +66,24 @@ if heading:
60
66
  h.show()
61
67
  ```
62
68
 
69
+ ```python
70
+ # Create regions to the left and right with smart defaults
71
+ if heading:
72
+ # Default height='element' - matches heading height
73
+ region_left = heading.left()
74
+ region_right = heading.right()
75
+
76
+ # Or specify custom dimensions
77
+ region_left_tall = heading.left(height=200) # 200px tall
78
+ region_right_full = heading.right(height='full') # Full page height
79
+
80
+ with page.highlights() as h:
81
+ h.add(heading, color="red")
82
+ h.add(region_left, color="green", label="Left (element height)")
83
+ h.add(region_right, color="blue", label="Right (element height)")
84
+ h.show()
85
+ ```
86
+
63
87
  ```python
64
88
  # Create a region with height limit
65
89
  if heading:
@@ -212,6 +236,46 @@ with page.highlights() as h:
212
236
  h.show()
213
237
  ```
214
238
 
239
+ ### Global Offset Configuration
240
+
241
+ You can configure global offsets that will be applied to all regions created with directional methods. This is useful for consistently adding padding or margins:
242
+
243
+ ```python
244
+ from natural_pdf import PDF
245
+
246
+ # Configure global offsets for all PDFs
247
+ PDF.configure_offsets(
248
+ below_offset=5, # Add 5px gap below elements
249
+ above_offset=5, # Add 5px gap above elements
250
+ left_offset=2, # Add 2px gap to the left
251
+ right_offset=2 # Add 2px gap to the right
252
+ )
253
+
254
+ # Now all directional methods will include these offsets
255
+ heading = page.find('text:bold')
256
+ if heading:
257
+ # This region will start 5px below the heading (not touching)
258
+ content_below = heading.below()
259
+
260
+ # This region will end 5px above the heading
261
+ content_above = heading.above(height=100)
262
+ ```
263
+
264
+ ```python
265
+ # Reset to default offsets (all 0)
266
+ PDF.configure_offsets(
267
+ below_offset=0,
268
+ above_offset=0,
269
+ left_offset=0,
270
+ right_offset=0
271
+ )
272
+ ```
273
+
274
+ These offsets are particularly useful when:
275
+ - Extracting text that might be too close to headers/footers
276
+ - Creating regions that need consistent spacing
277
+ - Working with documents that have tight layouts
278
+
215
279
  ## Using Exclusion Zones with Regions
216
280
 
217
281
  Exclusion zones are regions that you want to ignore during operations like text extraction.