natural-pdf 0.2.6__tar.gz → 0.2.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. {natural_pdf-0.2.6/natural_pdf.egg-info → natural_pdf-0.2.8}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/page.py +114 -18
  3. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/page_collection.py +41 -19
  4. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/pdf.py +14 -14
  5. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/element_collection.py +27 -8
  6. {natural_pdf-0.2.6 → natural_pdf-0.2.8/natural_pdf.egg-info}/PKG-INFO +1 -1
  7. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_color_hex_display.py +4 -3
  8. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_fix_get_sections_zero_height.py +4 -2
  9. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_get_sections_fix_comprehensive.py +7 -4
  10. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table.py +1 -0
  11. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table_collections.py +2 -2
  12. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_mock.py +45 -34
  13. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_merge_method.py +8 -6
  14. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_sections_with_start_and_end.py +13 -4
  15. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_slice_cache_reuse.py +27 -12
  16. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_slice_exclusion_mock.py +12 -12
  17. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/analysis_framework.mdc +0 -0
  18. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/coding-style.mdc +0 -0
  19. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  20. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/minimal-comments.mdc +0 -0
  21. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  22. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  23. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.github/workflows/ci.yml +0 -0
  24. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.github/workflows/docs.yml +0 -0
  25. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.github/workflows/nightly-tutorials.yml +0 -0
  26. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.gitignore +0 -0
  27. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/.pre-commit-config.yaml +0 -0
  28. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/01-execute_notebooks.py +0 -0
  29. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/02-run_all_tutorials.sh +0 -0
  30. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/CLAUDE.md +0 -0
  31. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/LICENSE +0 -0
  32. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/MANIFEST.in +0 -0
  33. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/README.md +0 -0
  34. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/audit_packaging.py +0 -0
  35. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/check_run_md.sh +0 -0
  36. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/api/index.md +0 -0
  37. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/favicon.png +0 -0
  38. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/favicon.svg +0 -0
  39. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/javascripts/custom.js +0 -0
  40. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/logo.svg +0 -0
  41. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/sample-screen.png +0 -0
  42. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/social-preview.png +0 -0
  43. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/social-preview.svg +0 -0
  44. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/assets/stylesheets/custom.css +0 -0
  45. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/categorizing-documents/index.md +0 -0
  46. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/data-extraction/index.md +0 -0
  47. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/describe/index.md +0 -0
  48. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/document-qa/index.md +0 -0
  49. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/element-selection/index.md +0 -0
  50. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/extracting-clean-text/index.md +0 -0
  51. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/finetuning/index.md +0 -0
  52. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/fix-messy-tables/index.md +0 -0
  53. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_1.csv +0 -0
  54. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_2.csv +0 -0
  55. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/fix-messy-tables/table_3.csv +0 -0
  56. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/index.md +0 -0
  57. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/installation/index.md +0 -0
  58. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/interactive-widget/index.md +0 -0
  59. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/layout-analysis/index.md +0 -0
  60. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/loops-and-groups/index.md +0 -0
  61. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/ocr/index.md +0 -0
  62. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/pdf-navigation/index.md +0 -0
  63. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  64. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/process-forms-and-invoices/index.md +0 -0
  65. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/quick-reference/index.md +0 -0
  66. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/reflowing-pages/index.md +0 -0
  67. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/regions/index.md +0 -0
  68. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tables/index.md +0 -0
  69. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/text-analysis/index.md +0 -0
  70. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/01-loading-and-extraction.md +0 -0
  71. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/02-finding-elements.md +0 -0
  72. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/03-extracting-blocks.md +0 -0
  73. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/04-table-extraction.md +0 -0
  74. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/05-excluding-content.md +0 -0
  75. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/06-document-qa.md +0 -0
  76. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/07-layout-analysis.md +0 -0
  77. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/07-working-with-regions.md +0 -0
  78. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/08-spatial-navigation.md +0 -0
  79. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/09-section-extraction.md +0 -0
  80. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/10-form-field-extraction.md +0 -0
  81. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  82. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/12-ocr-integration.md +0 -0
  83. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/13-semantic-search.md +0 -0
  84. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/tutorials/14-categorizing-documents.md +0 -0
  85. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/visual-debugging/index.md +0 -0
  86. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/docs/visual-debugging/region.png +0 -0
  87. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/mkdocs.yml +0 -0
  88. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/__init__.py +0 -0
  89. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/__init__.py +0 -0
  90. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/guides.py +0 -0
  91. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/__init__.py +0 -0
  92. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/base.py +0 -0
  93. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/docling.py +0 -0
  94. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/gemini.py +0 -0
  95. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  96. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  97. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  98. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/paddle.py +0 -0
  99. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  100. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/surya.py +0 -0
  101. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  102. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/tatr.py +0 -0
  103. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/layout/yolo.py +0 -0
  104. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  105. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/text_options.py +0 -0
  106. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/text_structure.py +0 -0
  107. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/analyzers/utils.py +0 -0
  108. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/classification/manager.py +0 -0
  109. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/classification/mixin.py +0 -0
  110. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/classification/results.py +0 -0
  111. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/cli.py +0 -0
  112. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/collections/mixins.py +0 -0
  113. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/__init__.py +0 -0
  114. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/element_manager.py +0 -0
  115. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/highlighting_service.py +0 -0
  116. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/page_groupby.py +0 -0
  117. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/pdf_collection.py +0 -0
  118. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/core/render_spec.py +0 -0
  119. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/describe/__init__.py +0 -0
  120. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/describe/base.py +0 -0
  121. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/describe/elements.py +0 -0
  122. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/describe/mixin.py +0 -0
  123. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/describe/summary.py +0 -0
  124. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/__init__.py +0 -0
  125. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/base.py +0 -0
  126. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/image.py +0 -0
  127. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/line.py +0 -0
  128. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/rect.py +0 -0
  129. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/region.py +0 -0
  130. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/elements/text.py +0 -0
  131. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/export/mixin.py +0 -0
  132. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/__init__.py +0 -0
  133. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/base.py +0 -0
  134. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/data/__init__.py +0 -0
  135. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/data/pdf.ttf +0 -0
  136. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/data/sRGB.icc +0 -0
  137. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/hocr.py +0 -0
  138. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/hocr_font.py +0 -0
  139. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/original_pdf.py +0 -0
  140. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/paddleocr.py +0 -0
  141. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/exporters/searchable_pdf.py +0 -0
  142. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/extraction/manager.py +0 -0
  143. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/extraction/mixin.py +0 -0
  144. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/extraction/result.py +0 -0
  145. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/flows/__init__.py +0 -0
  146. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/flows/collections.py +0 -0
  147. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/flows/element.py +0 -0
  148. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/flows/flow.py +0 -0
  149. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/flows/region.py +0 -0
  150. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/__init__.py +0 -0
  151. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/engine.py +0 -0
  152. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_doctr.py +0 -0
  153. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_easyocr.py +0 -0
  154. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_paddle.py +0 -0
  155. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/engine_surya.py +0 -0
  156. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_factory.py +0 -0
  157. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_manager.py +0 -0
  158. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/ocr_options.py +0 -0
  159. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/ocr/utils.py +0 -0
  160. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/qa/__init__.py +0 -0
  161. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/qa/document_qa.py +0 -0
  162. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/qa/qa_result.py +0 -0
  163. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/__init__.py +0 -0
  164. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/lancedb_search_service.py +0 -0
  165. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/numpy_search_service.py +0 -0
  166. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/search_options.py +0 -0
  167. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/search_service_protocol.py +0 -0
  168. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/search/searchable_mixin.py +0 -0
  169. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/selectors/__init__.py +0 -0
  170. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/selectors/parser.py +0 -0
  171. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/tables/__init__.py +0 -0
  172. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/tables/result.py +0 -0
  173. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/__init__.py +0 -0
  174. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  175. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/spa/css/style.css +0 -0
  176. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/spa/index.html +0 -0
  177. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/spa/js/app.js +0 -0
  178. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/templates/spa/words.txt +0 -0
  179. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/text_mixin.py +0 -0
  180. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/__init__.py +0 -0
  181. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/bidi_mirror.py +0 -0
  182. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/color_utils.py +0 -0
  183. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/debug.py +0 -0
  184. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/highlighting.py +0 -0
  185. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/identifiers.py +0 -0
  186. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/layout.py +0 -0
  187. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/locks.py +0 -0
  188. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/packaging.py +0 -0
  189. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/reading_order.py +0 -0
  190. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/text_extraction.py +0 -0
  191. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/utils/visualization.py +0 -0
  192. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/vision/__init__.py +0 -0
  193. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/vision/mixin.py +0 -0
  194. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/vision/results.py +0 -0
  195. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/vision/similarity.py +0 -0
  196. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/widgets/__init__.py +0 -0
  197. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf/widgets/viewer.py +0 -0
  198. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf.egg-info/SOURCES.txt +0 -0
  199. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf.egg-info/dependency_links.txt +0 -0
  200. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf.egg-info/entry_points.txt +0 -0
  201. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf.egg-info/requires.txt +0 -0
  202. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/natural_pdf.egg-info/top_level.txt +0 -0
  203. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/noxfile.py +0 -0
  204. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/memory_comparison.py +0 -0
  205. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/pdf_analyzer.py +0 -0
  206. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/performance_analysis.py +0 -0
  207. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  208. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  209. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  210. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  211. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/test_cleanup_methods.py +0 -0
  212. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/optimization/test_memory_fix.py +0 -0
  213. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/publish.sh +0 -0
  214. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/pyproject.toml +0 -0
  215. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/sample-screen.png +0 -0
  216. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/setup.cfg +0 -0
  217. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/conftest.py +0 -0
  218. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/exporters/test_paddleocr_exporter.py +0 -0
  219. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_annotate.py +0 -0
  220. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_arabic_performance.py +0 -0
  221. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_arabic_real_world.py +0 -0
  222. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_color_conversion.py +0 -0
  223. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_containment_geometry.py +0 -0
  224. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_elements.py +0 -0
  225. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_loading.py +0 -0
  226. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_spatial.py +0 -0
  227. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_text_extraction.py +0 -0
  228. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_core/test_text_layer.py +0 -0
  229. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_crop_enhancements.py +0 -0
  230. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_crop_region_highlights.py +0 -0
  231. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_directional_defaults.py +0 -0
  232. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve.py +0 -0
  233. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve_cross_page_bug.py +0 -0
  234. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve_debug_issue.py +0 -0
  235. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve_real_world_issue.py +0 -0
  236. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve_single_elements.py +0 -0
  237. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  238. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_document_qa.py +0 -0
  239. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_element_addition.py +0 -0
  240. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_element_collection_show_cols.py +0 -0
  241. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_element_collection_slicing.py +0 -0
  242. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_element_show_crop_highlights.py +0 -0
  243. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_empty_pseudo_class.py +0 -0
  244. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_exclusions.py +0 -0
  245. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_expand.py +0 -0
  246. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_extraction_error.py +0 -0
  247. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_extraction_mixin_fix.py +0 -0
  248. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_extraction_text_and_vision.py +0 -0
  249. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_extraction_working.py +0 -0
  250. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_find_similar.py +0 -0
  251. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_first_last_selectors.py +0 -0
  252. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_flow_region_directional.py +0 -0
  253. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_get_sections_zero_height.py +0 -0
  254. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_groupby.py +0 -0
  255. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides.py +0 -0
  256. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_apply_exclusions.py +0 -0
  257. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_apply_exclusions_simple.py +0 -0
  258. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table_exclusions.py +0 -0
  259. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_extract_table_real.py +0 -0
  260. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_guides_integration.py +0 -0
  261. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_highlight_detection.py +0 -0
  262. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_highlight_detection_comprehensive.py +0 -0
  263. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_highlight_protocol.py +0 -0
  264. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_highlight_protocol_simple.py +0 -0
  265. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_highlight_regions.py +0 -0
  266. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_comprehensive.py +0 -0
  267. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_debug.py +0 -0
  268. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_final.py +0 -0
  269. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_final_verification.py +0 -0
  270. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_fix.py +0 -0
  271. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_simple.py +0 -0
  272. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_types_pdf.py +0 -0
  273. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_verification.py +0 -0
  274. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_include_boundaries_with_real_text.py +0 -0
  275. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_loading_original.py +0 -0
  276. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_merge_connected.py +0 -0
  277. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_merge_connected_real_world.py +0 -0
  278. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_multi_page_table_discovery.py +0 -0
  279. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_optional_deps.py +0 -0
  280. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_page_exclusion_lists.py +0 -0
  281. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  282. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_region_show_crop_highlights.py +0 -0
  283. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_region_viewer.py +0 -0
  284. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_sections_end_only.py +0 -0
  285. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_show_column_layout.py +0 -0
  286. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_show_edge_cases.py +0 -0
  287. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_show_exclusions.py +0 -0
  288. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_show_exclusions_feature.py +0 -0
  289. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_show_limit.py +0 -0
  290. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_skip_repeating_headers_multipage.py +0 -0
  291. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_slice_exclusion_fix.py +0 -0
  292. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_slice_exclusion_issue.py +0 -0
  293. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_sliced_collection_exclusions.py +0 -0
  294. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_strikethrough_detection.py +0 -0
  295. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_table_result_header_mismatch.py +0 -0
  296. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_table_result_keep_blank.py +0 -0
  297. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_tiny_text_tables.py +0 -0
  298. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_tiny_text_tables_table.py +0 -0
  299. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_tutorials.py +0 -0
  300. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_underline_detection.py +0 -0
  301. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tests/test_update_text.py +0 -0
  302. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/todo/bad_pdf_analysis.md +0 -0
  303. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/todo/evaluation.md +0 -0
  304. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  305. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  306. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  307. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/README.md +0 -0
  308. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/__init__.py +0 -0
  309. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/analyser.py +0 -0
  310. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  311. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  312. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/eval_suite.py +0 -0
  313. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  314. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  315. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  316. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  317. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  318. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/reporter.py +0 -0
  319. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/tools/bad_pdf_eval/utils.py +0 -0
  320. {natural_pdf-0.2.6 → natural_pdf-0.2.8}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -717,14 +717,23 @@ class Page(
717
717
 
718
718
  # Add PDF-level exclusions if we have a parent PDF
719
719
  if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
720
+ # Get existing labels to check for duplicates
721
+ existing_labels = set()
722
+ for exc in all_exclusions:
723
+ if len(exc) >= 2 and exc[1]: # Has a label
724
+ existing_labels.add(exc[1])
725
+
720
726
  for pdf_exclusion in self._parent._exclusions:
721
- # Check if this exclusion is already in our list (avoid duplicates)
722
- if pdf_exclusion not in all_exclusions:
723
- # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
724
- if len(pdf_exclusion) == 2:
725
- # Convert to 3-tuple format with default method
726
- pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
727
- all_exclusions.append(pdf_exclusion)
727
+ # Check if this exclusion label is already in our list (avoid duplicates)
728
+ label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
729
+ if label and label in existing_labels:
730
+ continue # Skip this exclusion as it's already been applied
731
+
732
+ # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
733
+ if len(pdf_exclusion) == 2:
734
+ # Convert to 3-tuple format with default method
735
+ pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
736
+ all_exclusions.append(pdf_exclusion)
728
737
 
729
738
  if debug:
730
739
  print(
@@ -829,6 +838,36 @@ class Page(
829
838
  regions.append(exclusion_item) # Label is already on the Region object
830
839
  if debug:
831
840
  print(f" - Added direct region '{label}': {exclusion_item}")
841
+
842
+ # Process string selectors (from PDF-level exclusions)
843
+ elif isinstance(exclusion_item, str):
844
+ selector_str = exclusion_item
845
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
846
+
847
+ if debug:
848
+ print(
849
+ f" - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
850
+ )
851
+
852
+ if method == "region":
853
+ # Convert each matching element to a region
854
+ for el in matching_elements:
855
+ try:
856
+ bbox_coords = (
857
+ float(el.x0),
858
+ float(el.top),
859
+ float(el.x1),
860
+ float(el.bottom),
861
+ )
862
+ region = Region(self, bbox_coords, label=label)
863
+ regions.append(region)
864
+ if debug:
865
+ print(f" ✓ Added region from selector match: {bbox_coords}")
866
+ except Exception as e:
867
+ if debug:
868
+ print(f" ✗ Failed to create region from element: {e}")
869
+ # If method is "element", it will be handled in _filter_elements_by_exclusions
870
+
832
871
  # Element-based exclusions are not converted to regions here
833
872
  # They will be handled separately in _filter_elements_by_exclusions
834
873
 
@@ -852,7 +891,16 @@ class Page(
852
891
  Returns:
853
892
  A new list containing only the elements not excluded.
854
893
  """
855
- if not self._exclusions:
894
+ # Check both page-level and PDF-level exclusions
895
+ has_page_exclusions = bool(self._exclusions)
896
+ has_pdf_exclusions = (
897
+ hasattr(self, "_parent")
898
+ and self._parent
899
+ and hasattr(self._parent, "_exclusions")
900
+ and bool(self._parent._exclusions)
901
+ )
902
+
903
+ if not has_page_exclusions and not has_pdf_exclusions:
856
904
  if debug_exclusions:
857
905
  print(
858
906
  f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
865
913
  )
866
914
 
867
915
  # Collect element-based exclusions
868
- excluded_elements = set() # Use set for O(1) lookup
916
+ # Store element bboxes for comparison instead of object ids
917
+ excluded_element_bboxes = set() # Use set for O(1) lookup
918
+
919
+ # Process both page-level and PDF-level exclusions
920
+ all_exclusions = list(self._exclusions) if has_page_exclusions else []
921
+ if has_pdf_exclusions:
922
+ all_exclusions.extend(self._parent._exclusions)
869
923
 
870
- for exclusion_data in self._exclusions:
924
+ for exclusion_data in all_exclusions:
871
925
  # Handle both old format (2-tuple) and new format (3-tuple)
872
926
  if len(exclusion_data) == 2:
873
927
  exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
883
937
  if isinstance(exclusion_item, Region):
884
938
  continue
885
939
 
940
+ # Handle string selectors for element-based exclusions
941
+ if isinstance(exclusion_item, str) and method == "element":
942
+ selector_str = exclusion_item
943
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
944
+ for el in matching_elements:
945
+ if hasattr(el, "bbox"):
946
+ bbox = tuple(el.bbox)
947
+ excluded_element_bboxes.add(bbox)
948
+ if debug_exclusions:
949
+ print(
950
+ f" - Added element exclusion from selector '{selector_str}': {bbox}"
951
+ )
952
+
886
953
  # Handle element-based exclusions
887
- if method == "element" and hasattr(exclusion_item, "bbox"):
888
- excluded_elements.add(id(exclusion_item))
954
+ elif method == "element" and hasattr(exclusion_item, "bbox"):
955
+ # Store bbox tuple for comparison
956
+ bbox = tuple(exclusion_item.bbox)
957
+ excluded_element_bboxes.add(bbox)
889
958
  if debug_exclusions:
890
- print(f" - Added element exclusion: {exclusion_item}")
959
+ print(f" - Added element exclusion with bbox {bbox}: {exclusion_item}")
891
960
 
892
961
  if debug_exclusions:
893
962
  print(
894
963
  f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
895
- f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
964
+ f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
896
965
  )
897
966
 
898
967
  filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
903
972
  exclude = False
904
973
 
905
974
  # Check element-based exclusions first (faster)
906
- if id(element) in excluded_elements:
975
+ if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
907
976
  exclude = True
908
977
  element_excluded_count += 1
909
978
  if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
2487
2556
  return self
2488
2557
 
2489
2558
  def get_section_between(
2490
- self, start_element=None, end_element=None, include_boundaries="both"
2559
+ self,
2560
+ start_element=None,
2561
+ end_element=None,
2562
+ include_boundaries="both",
2563
+ orientation="vertical",
2491
2564
  ) -> Optional["Region"]: # Return Optional
2492
2565
  """
2493
2566
  Get a section between two elements on this page.
2567
+
2568
+ Args:
2569
+ start_element: Element marking the start of the section
2570
+ end_element: Element marking the end of the section
2571
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2572
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
2573
+
2574
+ Returns:
2575
+ Region representing the section
2494
2576
  """
2495
2577
  # Create a full-page region to operate within
2496
2578
  page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
2501
2583
  start_element=start_element,
2502
2584
  end_element=end_element,
2503
2585
  include_boundaries=include_boundaries,
2586
+ orientation=orientation,
2504
2587
  )
2505
2588
  except Exception as e:
2506
2589
  logger.error(
@@ -2575,10 +2658,23 @@ class Page(
2575
2658
  if include_boundaries not in valid_inclusions:
2576
2659
  raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
2577
2660
 
2578
- if not start_elements:
2579
- # Return an empty ElementCollection if no start elements
2661
+ if not start_elements and not end_elements:
2662
+ # Return an empty ElementCollection if no boundary elements at all
2580
2663
  return ElementCollection([])
2581
2664
 
2665
+ # If we only have end elements, create implicit start elements
2666
+ if not start_elements and end_elements:
2667
+ # Delegate to PageCollection implementation for consistency
2668
+ from natural_pdf.core.page_collection import PageCollection
2669
+
2670
+ pages = PageCollection([self])
2671
+ return pages.get_sections(
2672
+ start_elements=start_elements,
2673
+ end_elements=end_elements,
2674
+ include_boundaries=include_boundaries,
2675
+ orientation=orientation,
2676
+ )
2677
+
2582
2678
  # Combine start and end elements with their type
2583
2679
  all_boundaries = []
2584
2680
  for el in start_elements:
@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
537
537
  first_page = self.pages[0]
538
538
  first_start = Region(first_page, (0, 0, first_page.width, 1))
539
539
  first_start.is_implicit_start = True
540
+ # Don't mark this as created from any end element, so it can pair with any end
540
541
  start_elements.append(first_start)
541
542
 
542
543
  # For each end element (except the last), add an implicit start after it
543
- sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
544
+ # Sort by page, then top, then bottom (for elements with same top), then x0
545
+ sorted_end_elements = sorted(
546
+ end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
547
+ )
544
548
  for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
545
549
  # Create implicit start element right after this end element
546
550
  implicit_start = Region(
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
838
842
  # Create a section from current_start to just before this boundary
839
843
  start_element = current_start["element"]
840
844
 
841
- # Find the last element before this boundary on the same page
845
+ # Create section from current start to just before this new start
842
846
  if start_element.page == boundary["element"].page:
843
- # Find elements on this page
844
- page_elements = [e for e in all_elements if e.page == start_element.page]
845
- # Sort by position based on orientation
847
+ from natural_pdf.elements.region import Region
848
+
849
+ next_start = boundary["element"]
850
+
851
+ # Create section based on orientation
846
852
  if orientation == "vertical":
847
- page_elements.sort(key=lambda e: (e.top, e.x0))
853
+ # Determine vertical bounds
854
+ if include_boundaries in ["start", "both"]:
855
+ top = start_element.top
856
+ else:
857
+ top = start_element.bottom
858
+
859
+ # The section ends just before the next start
860
+ bottom = next_start.top
861
+
862
+ # Create the section with full page width
863
+ if top < bottom:
864
+ section = Region(
865
+ start_element.page, (0, top, start_element.page.width, bottom)
866
+ )
867
+ section.start_element = start_element
868
+ sections.append(section)
848
869
  else: # horizontal
849
- page_elements.sort(key=lambda e: (e.x0, e.top))
870
+ # Determine horizontal bounds
871
+ if include_boundaries in ["start", "both"]:
872
+ left = start_element.x0
873
+ else:
874
+ left = start_element.x1
850
875
 
851
- # Find the last element before the boundary
852
- end_idx = (
853
- page_elements.index(boundary["element"]) - 1
854
- if boundary["element"] in page_elements
855
- else -1
856
- )
857
- end_element = page_elements[end_idx] if end_idx >= 0 else None
876
+ # The section ends just before the next start
877
+ right = next_start.x0
858
878
 
859
- # Create the section
860
- section = start_element.page.get_section_between(
861
- start_element, end_element, include_boundaries, orientation
862
- )
863
- sections.append(section)
879
+ # Create the section with full page height
880
+ if left < right:
881
+ section = Region(
882
+ start_element.page, (left, 0, right, start_element.page.height)
883
+ )
884
+ section.start_element = start_element
885
+ sections.append(section)
864
886
  else:
865
887
  # Cross-page section - create from current_start to the end of its page
866
888
  from natural_pdf.elements.region import Region
@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
252
252
  logger.warning(f"Failed to apply region to page {cached.number}: {e}")
253
253
 
254
254
  self._cache[index] = cached
255
+
256
+ # Also cache in the parent PDF's main page list if this is a slice
257
+ if (
258
+ hasattr(self._parent_pdf, "_pages")
259
+ and hasattr(self._parent_pdf._pages, "_cache")
260
+ and actual_page_index < len(self._parent_pdf._pages._cache)
261
+ and self._parent_pdf._pages._cache[actual_page_index] is None
262
+ ):
263
+ self._parent_pdf._pages._cache[actual_page_index] = cached
264
+
255
265
  return cached
256
266
 
257
267
  # Sequence protocol ---------------------------------------------------
@@ -720,26 +730,16 @@ class PDF(
720
730
  # Store for bookkeeping and lazy application
721
731
  self._exclusions.append((exclusion_func, label))
722
732
 
723
- # Apply only to already-created (cached) pages to avoid forcing page creation
724
- for i in range(len(self._pages)):
725
- if self._pages._cache[i] is not None: # Only apply to existing pages
726
- try:
727
- self._pages._cache[i].add_exclusion(exclusion_func, label=label)
728
- except Exception as e:
729
- logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
733
+ # Don't modify already-cached pages - they will get PDF-level exclusions
734
+ # dynamically through _get_exclusion_regions()
730
735
  return self
731
736
 
732
737
  # Fallback to original callable / Region behaviour ------------------
733
738
  exclusion_data = (exclusion_func, label)
734
739
  self._exclusions.append(exclusion_data)
735
740
 
736
- # Apply only to already-created (cached) pages to avoid forcing page creation
737
- for i in range(len(self._pages)):
738
- if self._pages._cache[i] is not None: # Only apply to existing pages
739
- try:
740
- self._pages._cache[i].add_exclusion(exclusion_func, label=label)
741
- except Exception as e:
742
- logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
741
+ # Don't modify already-cached pages - they will get PDF-level exclusions
742
+ # dynamically through _get_exclusion_regions()
743
743
 
744
744
  return self
745
745
 
@@ -621,6 +621,7 @@ class ElementCollection(
621
621
 
622
622
  def extract_text(
623
623
  self,
624
+ separator: str = " ",
624
625
  preserve_whitespace: bool = True,
625
626
  use_exclusions: bool = True,
626
627
  strip: Optional[bool] = None,
@@ -632,6 +633,9 @@ class ElementCollection(
632
633
  pdfplumber's layout engine if layout=True is specified.
633
634
 
634
635
  Args:
636
+ separator: String to insert between text from different elements when
637
+ using simple joining (layout=False). Default is a single space.
638
+ Ignored when layout=True as the layout engine handles spacing.
635
639
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
636
640
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
637
641
  the collection or by filtering the collection itself.
@@ -668,7 +672,7 @@ class ElementCollection(
668
672
  logger.warning(
669
673
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
670
674
  )
671
- return " ".join(
675
+ return separator.join(
672
676
  getattr(el, "text", "") for el in text_elements
673
677
  ) # Fallback to simple join of word text
674
678
 
@@ -733,18 +737,33 @@ class ElementCollection(
733
737
  all_char_dicts.sort(
734
738
  key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
735
739
  )
736
- result = " ".join(c.get("text", "") for c in all_char_dicts)
740
+ result = separator.join(c.get("text", "") for c in all_char_dicts)
737
741
 
738
742
  else:
739
743
  # Default: Simple join without layout
740
744
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
741
- # Sort chars by document order (page, top, x0)
742
- all_char_dicts.sort(
743
- key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
745
+
746
+ # Instead of joining all characters individually, we need to:
747
+ # 1. Extract text from each element
748
+ # 2. Join the element texts with the separator
749
+
750
+ # Sort elements by document order (page, top, x0)
751
+ sorted_elements = sorted(
752
+ text_elements,
753
+ key=lambda el: (
754
+ el.page.index if hasattr(el, "page") else 0,
755
+ el.top if hasattr(el, "top") else 0,
756
+ el.x0 if hasattr(el, "x0") else 0,
757
+ ),
744
758
  )
745
- # Simple join of character text
746
- result = "".join(c.get("text", "") for c in all_char_dicts)
747
- # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
759
+
760
+ # Extract text from each element
761
+ element_texts = []
762
+ for el in sorted_elements:
763
+ if hasattr(el, "text") and el.text:
764
+ element_texts.append(el.text)
765
+
766
+ result = separator.join(element_texts)
748
767
 
749
768
  # Determine final strip flag – same rule as global helper unless caller overrides
750
769
  strip_text = strip if strip is not None else (not use_layout)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.6
3
+ Version: 0.2.8
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -114,8 +114,9 @@ class TestGroupByColorDisplay:
114
114
  colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
115
115
  for i, color in enumerate(colors):
116
116
  page = MagicMock()
117
- # Create a closure to capture the correct color
118
- page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
117
+ # PageGroupBy groups by the text content of the element found
118
+ # So we need to return the color tuple as the extracted text
119
+ page.find.return_value = MagicMock(extract_text=lambda c=color: c)
119
120
  mock_pages.append(page)
120
121
 
121
122
  collection = PageCollection(mock_pages)
@@ -141,7 +142,7 @@ class TestGroupByColorDisplay:
141
142
  colors = [(255, 0, 0), (0, 255, 0)]
142
143
  for color in colors:
143
144
  page = MagicMock()
144
- page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
145
+ page.find.return_value = MagicMock(extract_text=lambda c=color: c)
145
146
  mock_pages.append(page)
146
147
 
147
148
  collection = PageCollection(mock_pages)
@@ -68,7 +68,8 @@ def test_edge_case_single_end_element():
68
68
  print(f"\nSingle end element: bottom={end_elem.bottom}")
69
69
 
70
70
  # Create sections with single end element
71
- sections = page.get_sections(end_elements=[end_elem])
71
+ # When using only end elements, we typically want to include the end boundary
72
+ sections = page.get_sections(end_elements=[end_elem], include_boundaries="end")
72
73
 
73
74
  print(f"Sections created: {len(sections)}")
74
75
 
@@ -80,7 +81,8 @@ def test_edge_case_single_end_element():
80
81
  print(f"Expected height: {end_elem.bottom}")
81
82
 
82
83
  # Height should be approximately end_elem.bottom (from top of page)
83
- assert abs(section.height - end_elem.bottom) < 1.0
84
+ # Allow for small rounding differences
85
+ assert abs(section.height - end_elem.bottom) <= 1.0
84
86
 
85
87
 
86
88
  def test_mixed_start_end_elements():
@@ -115,13 +115,16 @@ def test_implicit_start_not_paired_with_source_end():
115
115
 
116
116
  print(f"\nSections created: {len(sections)}")
117
117
 
118
- # The first section should go from top of page to first end
119
- # The second section should go from first end to second end
118
+ # With default include_boundaries="start", sections exclude the end boundary
119
+ # So the first section should go from top of page to TOP of first end element
120
120
  # There should NOT be a zero-height section at first end
121
121
 
122
+ # Sort end elements like the implementation does
123
+ sorted_ends = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0))
124
+
122
125
  expected_sections = [
123
- (0, end_elements[0].bottom), # Top to first end
124
- (end_elements[0].bottom, end_elements[1].bottom), # First end to second end
126
+ (0, sorted_ends[0].top), # Top to TOP of first sorted end (exclude end boundary)
127
+ # Second section continues from there - we don't check its end
125
128
  ]
126
129
 
127
130
  for i, section in enumerate(sections):
@@ -96,6 +96,7 @@ def test_guides_extract_table_with_parameters():
96
96
  cell_extraction_func=None,
97
97
  show_progress=False,
98
98
  content_filter=None,
99
+ apply_exclusions=True,
99
100
  )
100
101
 
101
102
 
@@ -77,13 +77,13 @@ def test_extract_table_collection_header_options():
77
77
 
78
78
  # Test header=None
79
79
  result2 = guide.extract_table(pages, header=None)
80
- df2 = result2.to_df()
80
+ df2 = result2.to_df(header=None) # Need to pass header=None to to_df as well
81
81
  assert isinstance(df2.columns[0], int) # Should use numeric indices
82
82
 
83
83
  # Test custom headers
84
84
  custom_headers = ["A", "B", "C", "D", "E", "F", "G", "H"]
85
85
  result3 = guide.extract_table(pages, header=custom_headers)
86
- df3 = result3.to_df()
86
+ df3 = result3.to_df(header=custom_headers) # Pass custom headers to to_df
87
87
  assert list(df3.columns) == custom_headers
88
88
 
89
89
 
@@ -40,18 +40,19 @@ def test_get_sections_include_boundaries():
40
40
  page.pdf = pdf
41
41
 
42
42
  # Create mock elements on the page
43
- # Header at top of page
44
- header_element = create_mock_element(page, "Section 1", top=700, bottom=720)
43
+ # Coordinates are measured from the top of the page, so elements higher on the page have smaller top/bottom values
44
+ # Header at top of page (small top value)
45
+ header_element = create_mock_element(page, "Section 1", top=100, bottom=120)
45
46
 
46
47
  # Content in middle
47
48
  content_elements = [
48
- create_mock_element(page, "Content line 1", top=650, bottom=670),
49
- create_mock_element(page, "Content line 2", top=620, bottom=640),
50
- create_mock_element(page, "Content line 3", top=590, bottom=610),
49
+ create_mock_element(page, "Content line 1", top=150, bottom=170),
50
+ create_mock_element(page, "Content line 2", top=200, bottom=220),
51
+ create_mock_element(page, "Content line 3", top=250, bottom=270),
51
52
  ]
52
53
 
53
- # Next header
54
- next_header = create_mock_element(page, "Section 2", top=550, bottom=570)
54
+ # Next header (lower on page, higher Y value)
55
+ next_header = create_mock_element(page, "Section 2", top=300, bottom=320)
55
56
 
56
57
  # Set up the page's element finding
57
58
  all_elements = [header_element] + content_elements + [next_header]
@@ -63,24 +64,38 @@ def test_get_sections_include_boundaries():
63
64
 
64
65
  page.find_all = mock_find_all
65
66
 
67
+ # Mock get_elements to return all elements
68
+ page.get_elements = Mock(return_value=all_elements)
69
+
66
70
  # Mock get_section_between to return regions with correct boundaries
67
- def mock_get_section_between(start, end, include_boundaries="both"):
71
+ def mock_get_section_between(start, end, include_boundaries="both", orientation="vertical"):
72
+ # Ensure start and end are in the right order
73
+ # In this test setup, start should come before end (lower top value)
74
+ if not end:
75
+ end_top = page.height
76
+ end_bottom = page.height
77
+ else:
78
+ end_top = end.top
79
+ end_bottom = end.bottom
80
+
68
81
  if include_boundaries == "both":
69
82
  top = start.top
70
- bottom = end.bottom if end else page.height
83
+ bottom = end_bottom
71
84
  elif include_boundaries == "start":
72
85
  top = start.top
73
- bottom = end.top if end else page.height
86
+ bottom = end_top
74
87
  elif include_boundaries == "end":
75
88
  top = start.bottom
76
- bottom = end.bottom if end else page.height
89
+ bottom = end_bottom
77
90
  else: # none
78
91
  top = start.bottom
79
- bottom = end.top if end else page.height
92
+ bottom = end_top
93
+
94
+ # Ensure top < bottom for valid region
95
+ if top > bottom:
96
+ top, bottom = bottom, top
80
97
 
81
98
  region = Region(page, (0, top, page.width, bottom))
82
- # Store which elements would be in this region
83
- region._included_elements = [e for e in all_elements if e.top >= bottom and e.bottom <= top]
84
99
  return region
85
100
 
86
101
  page.get_section_between = mock_get_section_between
@@ -106,37 +121,33 @@ def test_get_sections_include_boundaries():
106
121
  for boundaries in ["both", "start", "end", "none"]:
107
122
  sections = collection.get_sections("text:contains(Section)", include_boundaries=boundaries)
108
123
 
124
+ print(f"\ninclude_boundaries='{boundaries}':")
125
+ print(f" Number of sections: {len(sections)}")
126
+
109
127
  if len(sections) > 0:
110
128
  section = sections[0]
111
- print(f"\ninclude_boundaries='{boundaries}':")
112
129
  print(f" Section bbox: {section.bbox}")
113
130
  print(f" Top: {section.bbox[1]}, Bottom: {section.bbox[3]}")
114
131
 
115
- # Verify boundaries are correct
116
- if boundaries == "both":
132
+ # When we have only start elements, sections go from start to next start
133
+ # The section always ends at the TOP of the next start element
134
+ # include_boundaries only affects whether we include the START element
135
+ if boundaries == "both" or boundaries == "start":
136
+ # Should include the start element
117
137
  assert (
118
138
  section.bbox[1] == header_element.top
119
- ), f"'both' should include start element top"
139
+ ), f"'{boundaries}' should start at first element top"
120
140
  assert (
121
- section.bbox[3] == next_header.bottom
122
- ), f"'both' should include end element bottom"
123
- elif boundaries == "start":
124
- assert (
125
- section.bbox[1] == header_element.top
126
- ), f"'start' should include start element top"
127
- assert section.bbox[3] == next_header.top, f"'start' should exclude end element"
128
- elif boundaries == "end":
141
+ section.bbox[3] == next_header.top
142
+ ), f"Section should always end at next element top"
143
+ else: # "end" or "none"
144
+ # Should exclude the start element
129
145
  assert (
130
146
  section.bbox[1] == header_element.bottom
131
- ), f"'end' should exclude start element"
147
+ ), f"'{boundaries}' should start after first element"
132
148
  assert (
133
- section.bbox[3] == next_header.bottom
134
- ), f"'end' should include end element bottom"
135
- else: # none
136
- assert (
137
- section.bbox[1] == header_element.bottom
138
- ), f"'none' should exclude start element"
139
- assert section.bbox[3] == next_header.top, f"'none' should exclude end element"
149
+ section.bbox[3] == next_header.top
150
+ ), f"Section should always end at next element top"
140
151
 
141
152
  print("\n✅ All mock tests passed! include_boundaries parameter is working correctly.")
142
153