natural-pdf 0.2.6__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. {natural_pdf-0.2.6/natural_pdf.egg-info → natural_pdf-0.2.9}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page.py +114 -18
  3. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page_collection.py +41 -19
  4. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/pdf.py +14 -14
  5. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/element_collection.py +62 -15
  6. {natural_pdf-0.2.6 → natural_pdf-0.2.9/natural_pdf.egg-info}/PKG-INFO +1 -1
  7. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_color_hex_display.py +4 -3
  8. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_fix_get_sections_zero_height.py +4 -2
  9. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_get_sections_fix_comprehensive.py +7 -4
  10. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table.py +1 -0
  11. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_collections.py +2 -2
  12. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_mock.py +45 -34
  13. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_method.py +8 -6
  14. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sections_with_start_and_end.py +13 -4
  15. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_cache_reuse.py +27 -12
  16. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_mock.py +12 -12
  17. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/analysis_framework.mdc +0 -0
  18. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/coding-style.mdc +0 -0
  19. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  20. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/minimal-comments.mdc +0 -0
  21. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  22. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  23. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/ci.yml +0 -0
  24. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/docs.yml +0 -0
  25. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.github/workflows/nightly-tutorials.yml +0 -0
  26. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.gitignore +0 -0
  27. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/.pre-commit-config.yaml +0 -0
  28. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/01-execute_notebooks.py +0 -0
  29. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/02-run_all_tutorials.sh +0 -0
  30. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/CLAUDE.md +0 -0
  31. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/LICENSE +0 -0
  32. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/MANIFEST.in +0 -0
  33. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/README.md +0 -0
  34. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/audit_packaging.py +0 -0
  35. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/check_run_md.sh +0 -0
  36. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/api/index.md +0 -0
  37. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/favicon.png +0 -0
  38. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/favicon.svg +0 -0
  39. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/javascripts/custom.js +0 -0
  40. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/logo.svg +0 -0
  41. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/sample-screen.png +0 -0
  42. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/social-preview.png +0 -0
  43. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/social-preview.svg +0 -0
  44. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/assets/stylesheets/custom.css +0 -0
  45. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/categorizing-documents/index.md +0 -0
  46. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/data-extraction/index.md +0 -0
  47. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/describe/index.md +0 -0
  48. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/document-qa/index.md +0 -0
  49. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/element-selection/index.md +0 -0
  50. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/extracting-clean-text/index.md +0 -0
  51. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/finetuning/index.md +0 -0
  52. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/index.md +0 -0
  53. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_1.csv +0 -0
  54. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_2.csv +0 -0
  55. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_3.csv +0 -0
  56. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/index.md +0 -0
  57. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/installation/index.md +0 -0
  58. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/interactive-widget/index.md +0 -0
  59. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/layout-analysis/index.md +0 -0
  60. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/loops-and-groups/index.md +0 -0
  61. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/ocr/index.md +0 -0
  62. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/pdf-navigation/index.md +0 -0
  63. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  64. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/index.md +0 -0
  65. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/quick-reference/index.md +0 -0
  66. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/reflowing-pages/index.md +0 -0
  67. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/regions/index.md +0 -0
  68. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tables/index.md +0 -0
  69. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/text-analysis/index.md +0 -0
  70. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
  71. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/02-finding-elements.md +0 -0
  72. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/03-extracting-blocks.md +0 -0
  73. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/04-table-extraction.md +0 -0
  74. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/05-excluding-content.md +0 -0
  75. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/06-document-qa.md +0 -0
  76. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/07-layout-analysis.md +0 -0
  77. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/07-working-with-regions.md +0 -0
  78. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/08-spatial-navigation.md +0 -0
  79. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/09-section-extraction.md +0 -0
  80. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/10-form-field-extraction.md +0 -0
  81. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  82. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/12-ocr-integration.md +0 -0
  83. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/13-semantic-search.md +0 -0
  84. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/tutorials/14-categorizing-documents.md +0 -0
  85. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/visual-debugging/index.md +0 -0
  86. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/docs/visual-debugging/region.png +0 -0
  87. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/mkdocs.yml +0 -0
  88. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/__init__.py +0 -0
  89. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/__init__.py +0 -0
  90. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/guides.py +0 -0
  91. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
  92. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/base.py +0 -0
  93. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/docling.py +0 -0
  94. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/gemini.py +0 -0
  95. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  96. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  97. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  98. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
  99. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  100. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/surya.py +0 -0
  101. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  102. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/tatr.py +0 -0
  103. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
  104. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  105. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_options.py +0 -0
  106. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_structure.py +0 -0
  107. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/analyzers/utils.py +0 -0
  108. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/manager.py +0 -0
  109. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/mixin.py +0 -0
  110. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/classification/results.py +0 -0
  111. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/cli.py +0 -0
  112. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/collections/mixins.py +0 -0
  113. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/__init__.py +0 -0
  114. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/element_manager.py +0 -0
  115. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/highlighting_service.py +0 -0
  116. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/page_groupby.py +0 -0
  117. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/pdf_collection.py +0 -0
  118. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/core/render_spec.py +0 -0
  119. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/__init__.py +0 -0
  120. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/base.py +0 -0
  121. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/elements.py +0 -0
  122. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/mixin.py +0 -0
  123. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/describe/summary.py +0 -0
  124. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/__init__.py +0 -0
  125. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/base.py +0 -0
  126. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/image.py +0 -0
  127. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/line.py +0 -0
  128. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/rect.py +0 -0
  129. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/region.py +0 -0
  130. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/elements/text.py +0 -0
  131. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/export/mixin.py +0 -0
  132. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/__init__.py +0 -0
  133. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/base.py +0 -0
  134. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/__init__.py +0 -0
  135. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/pdf.ttf +0 -0
  136. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/data/sRGB.icc +0 -0
  137. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr.py +0 -0
  138. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr_font.py +0 -0
  139. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/original_pdf.py +0 -0
  140. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/paddleocr.py +0 -0
  141. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
  142. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/manager.py +0 -0
  143. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/mixin.py +0 -0
  144. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/extraction/result.py +0 -0
  145. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/__init__.py +0 -0
  146. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/collections.py +0 -0
  147. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/element.py +0 -0
  148. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/flow.py +0 -0
  149. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/flows/region.py +0 -0
  150. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/__init__.py +0 -0
  151. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine.py +0 -0
  152. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_doctr.py +0 -0
  153. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_easyocr.py +0 -0
  154. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_paddle.py +0 -0
  155. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_surya.py +0 -0
  156. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_factory.py +0 -0
  157. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_manager.py +0 -0
  158. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_options.py +0 -0
  159. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/ocr/utils.py +0 -0
  160. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/__init__.py +0 -0
  161. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/document_qa.py +0 -0
  162. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/qa/qa_result.py +0 -0
  163. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/__init__.py +0 -0
  164. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/lancedb_search_service.py +0 -0
  165. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/numpy_search_service.py +0 -0
  166. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/search_options.py +0 -0
  167. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/search_service_protocol.py +0 -0
  168. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/search/searchable_mixin.py +0 -0
  169. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/selectors/__init__.py +0 -0
  170. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/selectors/parser.py +0 -0
  171. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/tables/__init__.py +0 -0
  172. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/tables/result.py +0 -0
  173. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/__init__.py +0 -0
  174. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  175. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/css/style.css +0 -0
  176. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/index.html +0 -0
  177. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/js/app.js +0 -0
  178. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/templates/spa/words.txt +0 -0
  179. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/text_mixin.py +0 -0
  180. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/__init__.py +0 -0
  181. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/bidi_mirror.py +0 -0
  182. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/color_utils.py +0 -0
  183. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/debug.py +0 -0
  184. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/highlighting.py +0 -0
  185. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/identifiers.py +0 -0
  186. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/layout.py +0 -0
  187. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/locks.py +0 -0
  188. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/packaging.py +0 -0
  189. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/reading_order.py +0 -0
  190. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/text_extraction.py +0 -0
  191. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/utils/visualization.py +0 -0
  192. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/__init__.py +0 -0
  193. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/mixin.py +0 -0
  194. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/results.py +0 -0
  195. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/vision/similarity.py +0 -0
  196. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/widgets/__init__.py +0 -0
  197. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf/widgets/viewer.py +0 -0
  198. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/SOURCES.txt +0 -0
  199. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
  200. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/entry_points.txt +0 -0
  201. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/requires.txt +0 -0
  202. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/natural_pdf.egg-info/top_level.txt +0 -0
  203. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/noxfile.py +0 -0
  204. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/memory_comparison.py +0 -0
  205. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/pdf_analyzer.py +0 -0
  206. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_analysis.py +0 -0
  207. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  208. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  209. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  210. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  211. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/test_cleanup_methods.py +0 -0
  212. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/optimization/test_memory_fix.py +0 -0
  213. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/publish.sh +0 -0
  214. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/pyproject.toml +0 -0
  215. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/sample-screen.png +0 -0
  216. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/setup.cfg +0 -0
  217. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/conftest.py +0 -0
  218. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/exporters/test_paddleocr_exporter.py +0 -0
  219. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_annotate.py +0 -0
  220. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_arabic_performance.py +0 -0
  221. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_arabic_real_world.py +0 -0
  222. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_color_conversion.py +0 -0
  223. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_containment_geometry.py +0 -0
  224. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_elements.py +0 -0
  225. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_loading.py +0 -0
  226. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_spatial.py +0 -0
  227. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_text_extraction.py +0 -0
  228. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_core/test_text_layer.py +0 -0
  229. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_crop_enhancements.py +0 -0
  230. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_crop_region_highlights.py +0 -0
  231. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_directional_defaults.py +0 -0
  232. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve.py +0 -0
  233. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_cross_page_bug.py +0 -0
  234. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_debug_issue.py +0 -0
  235. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_real_world_issue.py +0 -0
  236. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_single_elements.py +0 -0
  237. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  238. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_document_qa.py +0 -0
  239. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_addition.py +0 -0
  240. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_collection_show_cols.py +0 -0
  241. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_collection_slicing.py +0 -0
  242. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_element_show_crop_highlights.py +0 -0
  243. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_empty_pseudo_class.py +0 -0
  244. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_exclusions.py +0 -0
  245. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_expand.py +0 -0
  246. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_error.py +0 -0
  247. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_mixin_fix.py +0 -0
  248. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_text_and_vision.py +0 -0
  249. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_extraction_working.py +0 -0
  250. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_find_similar.py +0 -0
  251. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_first_last_selectors.py +0 -0
  252. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_flow_region_directional.py +0 -0
  253. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_get_sections_zero_height.py +0 -0
  254. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_groupby.py +0 -0
  255. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides.py +0 -0
  256. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions.py +0 -0
  257. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions_simple.py +0 -0
  258. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_exclusions.py +0 -0
  259. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_extract_table_real.py +0 -0
  260. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_guides_integration.py +0 -0
  261. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_detection.py +0 -0
  262. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_detection_comprehensive.py +0 -0
  263. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_protocol.py +0 -0
  264. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_protocol_simple.py +0 -0
  265. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_highlight_regions.py +0 -0
  266. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_comprehensive.py +0 -0
  267. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_debug.py +0 -0
  268. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_final.py +0 -0
  269. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_final_verification.py +0 -0
  270. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_fix.py +0 -0
  271. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_simple.py +0 -0
  272. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_types_pdf.py +0 -0
  273. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_verification.py +0 -0
  274. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_include_boundaries_with_real_text.py +0 -0
  275. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_loading_original.py +0 -0
  276. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_connected.py +0 -0
  277. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_merge_connected_real_world.py +0 -0
  278. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_multi_page_table_discovery.py +0 -0
  279. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_optional_deps.py +0 -0
  280. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_page_exclusion_lists.py +0 -0
  281. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  282. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_region_show_crop_highlights.py +0 -0
  283. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_region_viewer.py +0 -0
  284. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sections_end_only.py +0 -0
  285. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_column_layout.py +0 -0
  286. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_edge_cases.py +0 -0
  287. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_exclusions.py +0 -0
  288. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_exclusions_feature.py +0 -0
  289. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_show_limit.py +0 -0
  290. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_skip_repeating_headers_multipage.py +0 -0
  291. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_fix.py +0 -0
  292. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_slice_exclusion_issue.py +0 -0
  293. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_sliced_collection_exclusions.py +0 -0
  294. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_strikethrough_detection.py +0 -0
  295. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_table_result_header_mismatch.py +0 -0
  296. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_table_result_keep_blank.py +0 -0
  297. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tiny_text_tables.py +0 -0
  298. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tiny_text_tables_table.py +0 -0
  299. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_tutorials.py +0 -0
  300. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_underline_detection.py +0 -0
  301. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tests/test_update_text.py +0 -0
  302. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/todo/bad_pdf_analysis.md +0 -0
  303. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/todo/evaluation.md +0 -0
  304. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  305. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  306. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  307. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/README.md +0 -0
  308. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/__init__.py +0 -0
  309. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/analyser.py +0 -0
  310. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  311. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  312. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/eval_suite.py +0 -0
  313. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  314. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  315. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  316. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  317. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  318. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/reporter.py +0 -0
  319. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/tools/bad_pdf_eval/utils.py +0 -0
  320. {natural_pdf-0.2.6 → natural_pdf-0.2.9}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.6
3
+ Version: 0.2.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -717,14 +717,23 @@ class Page(
717
717
 
718
718
  # Add PDF-level exclusions if we have a parent PDF
719
719
  if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
720
+ # Get existing labels to check for duplicates
721
+ existing_labels = set()
722
+ for exc in all_exclusions:
723
+ if len(exc) >= 2 and exc[1]: # Has a label
724
+ existing_labels.add(exc[1])
725
+
720
726
  for pdf_exclusion in self._parent._exclusions:
721
- # Check if this exclusion is already in our list (avoid duplicates)
722
- if pdf_exclusion not in all_exclusions:
723
- # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
724
- if len(pdf_exclusion) == 2:
725
- # Convert to 3-tuple format with default method
726
- pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
727
- all_exclusions.append(pdf_exclusion)
727
+ # Check if this exclusion label is already in our list (avoid duplicates)
728
+ label = pdf_exclusion[1] if len(pdf_exclusion) >= 2 else None
729
+ if label and label in existing_labels:
730
+ continue # Skip this exclusion as it's already been applied
731
+
732
+ # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
733
+ if len(pdf_exclusion) == 2:
734
+ # Convert to 3-tuple format with default method
735
+ pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
736
+ all_exclusions.append(pdf_exclusion)
728
737
 
729
738
  if debug:
730
739
  print(
@@ -829,6 +838,36 @@ class Page(
829
838
  regions.append(exclusion_item) # Label is already on the Region object
830
839
  if debug:
831
840
  print(f" - Added direct region '{label}': {exclusion_item}")
841
+
842
+ # Process string selectors (from PDF-level exclusions)
843
+ elif isinstance(exclusion_item, str):
844
+ selector_str = exclusion_item
845
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
846
+
847
+ if debug:
848
+ print(
849
+ f" - Evaluating selector '{exclusion_label}': found {len(matching_elements)} elements"
850
+ )
851
+
852
+ if method == "region":
853
+ # Convert each matching element to a region
854
+ for el in matching_elements:
855
+ try:
856
+ bbox_coords = (
857
+ float(el.x0),
858
+ float(el.top),
859
+ float(el.x1),
860
+ float(el.bottom),
861
+ )
862
+ region = Region(self, bbox_coords, label=label)
863
+ regions.append(region)
864
+ if debug:
865
+ print(f" ✓ Added region from selector match: {bbox_coords}")
866
+ except Exception as e:
867
+ if debug:
868
+ print(f" ✗ Failed to create region from element: {e}")
869
+ # If method is "element", it will be handled in _filter_elements_by_exclusions
870
+
832
871
  # Element-based exclusions are not converted to regions here
833
872
  # They will be handled separately in _filter_elements_by_exclusions
834
873
 
@@ -852,7 +891,16 @@ class Page(
852
891
  Returns:
853
892
  A new list containing only the elements not excluded.
854
893
  """
855
- if not self._exclusions:
894
+ # Check both page-level and PDF-level exclusions
895
+ has_page_exclusions = bool(self._exclusions)
896
+ has_pdf_exclusions = (
897
+ hasattr(self, "_parent")
898
+ and self._parent
899
+ and hasattr(self._parent, "_exclusions")
900
+ and bool(self._parent._exclusions)
901
+ )
902
+
903
+ if not has_page_exclusions and not has_pdf_exclusions:
856
904
  if debug_exclusions:
857
905
  print(
858
906
  f"Page {self.index}: No exclusions defined, returning all {len(elements)} elements."
@@ -865,9 +913,15 @@ class Page(
865
913
  )
866
914
 
867
915
  # Collect element-based exclusions
868
- excluded_elements = set() # Use set for O(1) lookup
916
+ # Store element bboxes for comparison instead of object ids
917
+ excluded_element_bboxes = set() # Use set for O(1) lookup
918
+
919
+ # Process both page-level and PDF-level exclusions
920
+ all_exclusions = list(self._exclusions) if has_page_exclusions else []
921
+ if has_pdf_exclusions:
922
+ all_exclusions.extend(self._parent._exclusions)
869
923
 
870
- for exclusion_data in self._exclusions:
924
+ for exclusion_data in all_exclusions:
871
925
  # Handle both old format (2-tuple) and new format (3-tuple)
872
926
  if len(exclusion_data) == 2:
873
927
  exclusion_item, label = exclusion_data
@@ -883,16 +937,31 @@ class Page(
883
937
  if isinstance(exclusion_item, Region):
884
938
  continue
885
939
 
940
+ # Handle string selectors for element-based exclusions
941
+ if isinstance(exclusion_item, str) and method == "element":
942
+ selector_str = exclusion_item
943
+ matching_elements = self.find_all(selector_str, apply_exclusions=False)
944
+ for el in matching_elements:
945
+ if hasattr(el, "bbox"):
946
+ bbox = tuple(el.bbox)
947
+ excluded_element_bboxes.add(bbox)
948
+ if debug_exclusions:
949
+ print(
950
+ f" - Added element exclusion from selector '{selector_str}': {bbox}"
951
+ )
952
+
886
953
  # Handle element-based exclusions
887
- if method == "element" and hasattr(exclusion_item, "bbox"):
888
- excluded_elements.add(id(exclusion_item))
954
+ elif method == "element" and hasattr(exclusion_item, "bbox"):
955
+ # Store bbox tuple for comparison
956
+ bbox = tuple(exclusion_item.bbox)
957
+ excluded_element_bboxes.add(bbox)
889
958
  if debug_exclusions:
890
- print(f" - Added element exclusion: {exclusion_item}")
959
+ print(f" - Added element exclusion with bbox {bbox}: {exclusion_item}")
891
960
 
892
961
  if debug_exclusions:
893
962
  print(
894
963
  f"Page {self.index}: Applying {len(exclusion_regions)} region exclusions "
895
- f"and {len(excluded_elements)} element exclusions to {len(elements)} elements."
964
+ f"and {len(excluded_element_bboxes)} element exclusions to {len(elements)} elements."
896
965
  )
897
966
 
898
967
  filtered_elements = []
@@ -903,7 +972,7 @@ class Page(
903
972
  exclude = False
904
973
 
905
974
  # Check element-based exclusions first (faster)
906
- if id(element) in excluded_elements:
975
+ if hasattr(element, "bbox") and tuple(element.bbox) in excluded_element_bboxes:
907
976
  exclude = True
908
977
  element_excluded_count += 1
909
978
  if debug_exclusions:
@@ -2487,10 +2556,23 @@ class Page(
2487
2556
  return self
2488
2557
 
2489
2558
  def get_section_between(
2490
- self, start_element=None, end_element=None, include_boundaries="both"
2559
+ self,
2560
+ start_element=None,
2561
+ end_element=None,
2562
+ include_boundaries="both",
2563
+ orientation="vertical",
2491
2564
  ) -> Optional["Region"]: # Return Optional
2492
2565
  """
2493
2566
  Get a section between two elements on this page.
2567
+
2568
+ Args:
2569
+ start_element: Element marking the start of the section
2570
+ end_element: Element marking the end of the section
2571
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2572
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
2573
+
2574
+ Returns:
2575
+ Region representing the section
2494
2576
  """
2495
2577
  # Create a full-page region to operate within
2496
2578
  page_region = self.create_region(0, 0, self.width, self.height)
@@ -2501,6 +2583,7 @@ class Page(
2501
2583
  start_element=start_element,
2502
2584
  end_element=end_element,
2503
2585
  include_boundaries=include_boundaries,
2586
+ orientation=orientation,
2504
2587
  )
2505
2588
  except Exception as e:
2506
2589
  logger.error(
@@ -2575,10 +2658,23 @@ class Page(
2575
2658
  if include_boundaries not in valid_inclusions:
2576
2659
  raise ValueError(f"include_boundaries must be one of {valid_inclusions}")
2577
2660
 
2578
- if not start_elements:
2579
- # Return an empty ElementCollection if no start elements
2661
+ if not start_elements and not end_elements:
2662
+ # Return an empty ElementCollection if no boundary elements at all
2580
2663
  return ElementCollection([])
2581
2664
 
2665
+ # If we only have end elements, create implicit start elements
2666
+ if not start_elements and end_elements:
2667
+ # Delegate to PageCollection implementation for consistency
2668
+ from natural_pdf.core.page_collection import PageCollection
2669
+
2670
+ pages = PageCollection([self])
2671
+ return pages.get_sections(
2672
+ start_elements=start_elements,
2673
+ end_elements=end_elements,
2674
+ include_boundaries=include_boundaries,
2675
+ orientation=orientation,
2676
+ )
2677
+
2582
2678
  # Combine start and end elements with their type
2583
2679
  all_boundaries = []
2584
2680
  for el in start_elements:
@@ -537,10 +537,14 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
537
537
  first_page = self.pages[0]
538
538
  first_start = Region(first_page, (0, 0, first_page.width, 1))
539
539
  first_start.is_implicit_start = True
540
+ # Don't mark this as created from any end element, so it can pair with any end
540
541
  start_elements.append(first_start)
541
542
 
542
543
  # For each end element (except the last), add an implicit start after it
543
- sorted_end_elements = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.x0))
544
+ # Sort by page, then top, then bottom (for elements with same top), then x0
545
+ sorted_end_elements = sorted(
546
+ end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0)
547
+ )
544
548
  for i, end_elem in enumerate(sorted_end_elements[:-1]): # Exclude last end element
545
549
  # Create implicit start element right after this end element
546
550
  implicit_start = Region(
@@ -838,29 +842,47 @@ class PageCollection(TextMixin, Generic[P], ApplyMixin, ShapeDetectionMixin, Vis
838
842
  # Create a section from current_start to just before this boundary
839
843
  start_element = current_start["element"]
840
844
 
841
- # Find the last element before this boundary on the same page
845
+ # Create section from current start to just before this new start
842
846
  if start_element.page == boundary["element"].page:
843
- # Find elements on this page
844
- page_elements = [e for e in all_elements if e.page == start_element.page]
845
- # Sort by position based on orientation
847
+ from natural_pdf.elements.region import Region
848
+
849
+ next_start = boundary["element"]
850
+
851
+ # Create section based on orientation
846
852
  if orientation == "vertical":
847
- page_elements.sort(key=lambda e: (e.top, e.x0))
853
+ # Determine vertical bounds
854
+ if include_boundaries in ["start", "both"]:
855
+ top = start_element.top
856
+ else:
857
+ top = start_element.bottom
858
+
859
+ # The section ends just before the next start
860
+ bottom = next_start.top
861
+
862
+ # Create the section with full page width
863
+ if top < bottom:
864
+ section = Region(
865
+ start_element.page, (0, top, start_element.page.width, bottom)
866
+ )
867
+ section.start_element = start_element
868
+ sections.append(section)
848
869
  else: # horizontal
849
- page_elements.sort(key=lambda e: (e.x0, e.top))
870
+ # Determine horizontal bounds
871
+ if include_boundaries in ["start", "both"]:
872
+ left = start_element.x0
873
+ else:
874
+ left = start_element.x1
850
875
 
851
- # Find the last element before the boundary
852
- end_idx = (
853
- page_elements.index(boundary["element"]) - 1
854
- if boundary["element"] in page_elements
855
- else -1
856
- )
857
- end_element = page_elements[end_idx] if end_idx >= 0 else None
876
+ # The section ends just before the next start
877
+ right = next_start.x0
858
878
 
859
- # Create the section
860
- section = start_element.page.get_section_between(
861
- start_element, end_element, include_boundaries, orientation
862
- )
863
- sections.append(section)
879
+ # Create the section with full page height
880
+ if left < right:
881
+ section = Region(
882
+ start_element.page, (left, 0, right, start_element.page.height)
883
+ )
884
+ section.start_element = start_element
885
+ sections.append(section)
864
886
  else:
865
887
  # Cross-page section - create from current_start to the end of its page
866
888
  from natural_pdf.elements.region import Region
@@ -252,6 +252,16 @@ class _LazyPageList(Sequence):
252
252
  logger.warning(f"Failed to apply region to page {cached.number}: {e}")
253
253
 
254
254
  self._cache[index] = cached
255
+
256
+ # Also cache in the parent PDF's main page list if this is a slice
257
+ if (
258
+ hasattr(self._parent_pdf, "_pages")
259
+ and hasattr(self._parent_pdf._pages, "_cache")
260
+ and actual_page_index < len(self._parent_pdf._pages._cache)
261
+ and self._parent_pdf._pages._cache[actual_page_index] is None
262
+ ):
263
+ self._parent_pdf._pages._cache[actual_page_index] = cached
264
+
255
265
  return cached
256
266
 
257
267
  # Sequence protocol ---------------------------------------------------
@@ -720,26 +730,16 @@ class PDF(
720
730
  # Store for bookkeeping and lazy application
721
731
  self._exclusions.append((exclusion_func, label))
722
732
 
723
- # Apply only to already-created (cached) pages to avoid forcing page creation
724
- for i in range(len(self._pages)):
725
- if self._pages._cache[i] is not None: # Only apply to existing pages
726
- try:
727
- self._pages._cache[i].add_exclusion(exclusion_func, label=label)
728
- except Exception as e:
729
- logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
733
+ # Don't modify already-cached pages - they will get PDF-level exclusions
734
+ # dynamically through _get_exclusion_regions()
730
735
  return self
731
736
 
732
737
  # Fallback to original callable / Region behaviour ------------------
733
738
  exclusion_data = (exclusion_func, label)
734
739
  self._exclusions.append(exclusion_data)
735
740
 
736
- # Apply only to already-created (cached) pages to avoid forcing page creation
737
- for i in range(len(self._pages)):
738
- if self._pages._cache[i] is not None: # Only apply to existing pages
739
- try:
740
- self._pages._cache[i].add_exclusion(exclusion_func, label=label)
741
- except Exception as e:
742
- logger.warning(f"Failed to apply exclusion to existing page {i}: {e}")
741
+ # Don't modify already-cached pages - they will get PDF-level exclusions
742
+ # dynamically through _get_exclusion_regions()
743
743
 
744
744
  return self
745
745
 
@@ -621,6 +621,7 @@ class ElementCollection(
621
621
 
622
622
  def extract_text(
623
623
  self,
624
+ separator: str = " ",
624
625
  preserve_whitespace: bool = True,
625
626
  use_exclusions: bool = True,
626
627
  strip: Optional[bool] = None,
@@ -632,6 +633,7 @@ class ElementCollection(
632
633
  pdfplumber's layout engine if layout=True is specified.
633
634
 
634
635
  Args:
636
+ separator: String to join text from elements. Default is a single space.
635
637
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
636
638
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
637
639
  the collection or by filtering the collection itself.
@@ -648,15 +650,49 @@ class ElementCollection(
648
650
  Returns:
649
651
  Combined text from elements, potentially with layout-based spacing.
650
652
  """
651
- # Filter to just TextElements that likely have _char_dicts
652
- text_elements = [
653
+ # Check if we have any elements at all
654
+ if not self._elements:
655
+ return ""
656
+
657
+ # Check if all elements are TextElements with character data
658
+ text_elements_with_chars = [
653
659
  el
654
660
  for el in self._elements
655
- if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
661
+ if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
656
662
  ]
657
663
 
658
- if not text_elements:
659
- return ""
664
+ # If we have a mixed collection (Regions, TextElements without chars, etc),
665
+ # use a simpler approach: call extract_text on each element
666
+ if len(text_elements_with_chars) < len(self._elements):
667
+ # Mixed collection - extract text from each element
668
+ element_texts = []
669
+
670
+ # Sort elements by position first
671
+ sorted_elements = sorted(
672
+ self._elements,
673
+ key=lambda el: (
674
+ el.page.index if hasattr(el, "page") else 0,
675
+ el.top if hasattr(el, "top") else 0,
676
+ el.x0 if hasattr(el, "x0") else 0,
677
+ ),
678
+ )
679
+
680
+ for el in sorted_elements:
681
+ if hasattr(el, "extract_text"):
682
+ # Call extract_text on the element (works for TextElement, Region, etc)
683
+ text = el.extract_text(**kwargs)
684
+ if text:
685
+ element_texts.append(text)
686
+ elif hasattr(el, "text"):
687
+ # Fallback to text property if available
688
+ text = getattr(el, "text", "")
689
+ if text:
690
+ element_texts.append(text)
691
+
692
+ return separator.join(element_texts)
693
+
694
+ # All elements are TextElements with char data - use the original approach
695
+ text_elements = text_elements_with_chars
660
696
 
661
697
  # Collect all character dictionaries
662
698
  all_char_dicts = []
@@ -665,11 +701,20 @@ class ElementCollection(
665
701
 
666
702
  if not all_char_dicts:
667
703
  # Handle case where elements exist but have no char dicts
668
- logger.warning(
704
+ logger.debug(
669
705
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
670
706
  )
671
- return " ".join(
672
- getattr(el, "text", "") for el in text_elements
707
+ # Sort elements by position before joining
708
+ sorted_text_elements = sorted(
709
+ text_elements,
710
+ key=lambda el: (
711
+ el.page.index if hasattr(el, "page") else 0,
712
+ el.top if hasattr(el, "top") else 0,
713
+ el.x0 if hasattr(el, "x0") else 0,
714
+ ),
715
+ )
716
+ return separator.join(
717
+ getattr(el, "text", "") for el in sorted_text_elements
673
718
  ) # Fallback to simple join of word text
674
719
 
675
720
  # Apply content filtering if provided
@@ -736,15 +781,17 @@ class ElementCollection(
736
781
  result = " ".join(c.get("text", "") for c in all_char_dicts)
737
782
 
738
783
  else:
784
+ print("JOIN WITHOUT LAYOUT")
739
785
  # Default: Simple join without layout
740
786
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
741
- # Sort chars by document order (page, top, x0)
742
- all_char_dicts.sort(
743
- key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
744
- )
745
- # Simple join of character text
746
- result = "".join(c.get("text", "") for c in all_char_dicts)
747
- # Replace multiple spaces created by joining possibly overlapping chars? Maybe not necessary.
787
+ result = separator.join(el.extract_text() for el in text_elements)
788
+
789
+ # # Sort chars by document order (page, top, x0)
790
+ # all_char_dicts.sort(
791
+ # key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
792
+ # )
793
+ # # Simple join of character text
794
+ # result = "".join(c.get("text", "") for c in all_char_dicts)
748
795
 
749
796
  # Determine final strip flag – same rule as global helper unless caller overrides
750
797
  strip_text = strip if strip is not None else (not use_layout)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.6
3
+ Version: 0.2.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -114,8 +114,9 @@ class TestGroupByColorDisplay:
114
114
  colors = [(1, 0, 0), (0, 1, 0), (0, 0, 1)]
115
115
  for i, color in enumerate(colors):
116
116
  page = MagicMock()
117
- # Create a closure to capture the correct color
118
- page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
117
+ # PageGroupBy groups by the text content of the element found
118
+ # So we need to return the color tuple as the extracted text
119
+ page.find.return_value = MagicMock(extract_text=lambda c=color: c)
119
120
  mock_pages.append(page)
120
121
 
121
122
  collection = PageCollection(mock_pages)
@@ -141,7 +142,7 @@ class TestGroupByColorDisplay:
141
142
  colors = [(255, 0, 0), (0, 255, 0)]
142
143
  for color in colors:
143
144
  page = MagicMock()
144
- page.find.return_value = MagicMock(extract_text=lambda c=color: str(c))
145
+ page.find.return_value = MagicMock(extract_text=lambda c=color: c)
145
146
  mock_pages.append(page)
146
147
 
147
148
  collection = PageCollection(mock_pages)
@@ -68,7 +68,8 @@ def test_edge_case_single_end_element():
68
68
  print(f"\nSingle end element: bottom={end_elem.bottom}")
69
69
 
70
70
  # Create sections with single end element
71
- sections = page.get_sections(end_elements=[end_elem])
71
+ # When using only end elements, we typically want to include the end boundary
72
+ sections = page.get_sections(end_elements=[end_elem], include_boundaries="end")
72
73
 
73
74
  print(f"Sections created: {len(sections)}")
74
75
 
@@ -80,7 +81,8 @@ def test_edge_case_single_end_element():
80
81
  print(f"Expected height: {end_elem.bottom}")
81
82
 
82
83
  # Height should be approximately end_elem.bottom (from top of page)
83
- assert abs(section.height - end_elem.bottom) < 1.0
84
+ # Allow for small rounding differences
85
+ assert abs(section.height - end_elem.bottom) <= 1.0
84
86
 
85
87
 
86
88
  def test_mixed_start_end_elements():
@@ -115,13 +115,16 @@ def test_implicit_start_not_paired_with_source_end():
115
115
 
116
116
  print(f"\nSections created: {len(sections)}")
117
117
 
118
- # The first section should go from top of page to first end
119
- # The second section should go from first end to second end
118
+ # With default include_boundaries="start", sections exclude the end boundary
119
+ # So the first section should go from top of page to TOP of first end element
120
120
  # There should NOT be a zero-height section at first end
121
121
 
122
+ # Sort end elements like the implementation does
123
+ sorted_ends = sorted(end_elements, key=lambda e: (e.page.index, e.top, e.bottom, e.x0))
124
+
122
125
  expected_sections = [
123
- (0, end_elements[0].bottom), # Top to first end
124
- (end_elements[0].bottom, end_elements[1].bottom), # First end to second end
126
+ (0, sorted_ends[0].top), # Top to TOP of first sorted end (exclude end boundary)
127
+ # Second section continues from there - we don't check its end
125
128
  ]
126
129
 
127
130
  for i, section in enumerate(sections):
@@ -96,6 +96,7 @@ def test_guides_extract_table_with_parameters():
96
96
  cell_extraction_func=None,
97
97
  show_progress=False,
98
98
  content_filter=None,
99
+ apply_exclusions=True,
99
100
  )
100
101
 
101
102
 
@@ -77,13 +77,13 @@ def test_extract_table_collection_header_options():
77
77
 
78
78
  # Test header=None
79
79
  result2 = guide.extract_table(pages, header=None)
80
- df2 = result2.to_df()
80
+ df2 = result2.to_df(header=None) # Need to pass header=None to to_df as well
81
81
  assert isinstance(df2.columns[0], int) # Should use numeric indices
82
82
 
83
83
  # Test custom headers
84
84
  custom_headers = ["A", "B", "C", "D", "E", "F", "G", "H"]
85
85
  result3 = guide.extract_table(pages, header=custom_headers)
86
- df3 = result3.to_df()
86
+ df3 = result3.to_df(header=custom_headers) # Pass custom headers to to_df
87
87
  assert list(df3.columns) == custom_headers
88
88
 
89
89