natural-pdf 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (321)
  1. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.gitignore +1 -0
  2. {natural_pdf-0.2.5/natural_pdf.egg-info → natural_pdf-0.2.6}/PKG-INFO +1 -1
  3. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/guides.py +94 -42
  4. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page.py +110 -44
  5. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page_collection.py +223 -34
  6. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/page_groupby.py +20 -2
  7. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/pdf.py +3 -0
  8. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/render_spec.py +20 -5
  9. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/base.py +1 -1
  10. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/elements.py +1 -1
  11. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/base.py +84 -8
  12. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/element_collection.py +730 -12
  13. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/region.py +181 -48
  14. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/flow.py +3 -0
  15. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/selectors/parser.py +2 -2
  16. natural_pdf-0.2.6/natural_pdf/utils/color_utils.py +100 -0
  17. {natural_pdf-0.2.5 → natural_pdf-0.2.6/natural_pdf.egg-info}/PKG-INFO +1 -1
  18. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/SOURCES.txt +32 -1
  19. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/top_level.txt +1 -0
  20. natural_pdf-0.2.6/tests/test_color_hex_display.py +194 -0
  21. natural_pdf-0.2.6/tests/test_crop_enhancements.py +149 -0
  22. natural_pdf-0.2.6/tests/test_crop_region_highlights.py +119 -0
  23. natural_pdf-0.2.6/tests/test_dissolve.py +471 -0
  24. natural_pdf-0.2.6/tests/test_dissolve_cross_page_bug.py +155 -0
  25. natural_pdf-0.2.6/tests/test_dissolve_debug_issue.py +195 -0
  26. natural_pdf-0.2.6/tests/test_dissolve_real_world_issue.py +201 -0
  27. natural_pdf-0.2.6/tests/test_dissolve_single_elements.py +159 -0
  28. natural_pdf-0.2.6/tests/test_dissolve_vertical_offset_issue.py +139 -0
  29. natural_pdf-0.2.6/tests/test_element_addition.py +176 -0
  30. natural_pdf-0.2.6/tests/test_element_collection_show_cols.py +132 -0
  31. natural_pdf-0.2.6/tests/test_empty_pseudo_class.py +215 -0
  32. natural_pdf-0.2.6/tests/test_fix_get_sections_zero_height.py +120 -0
  33. natural_pdf-0.2.6/tests/test_get_sections_fix_comprehensive.py +183 -0
  34. natural_pdf-0.2.6/tests/test_get_sections_zero_height.py +179 -0
  35. natural_pdf-0.2.5/tests/test_guides_extract_table_from_pages.py → natural_pdf-0.2.6/tests/test_guides_extract_table_collections.py +78 -55
  36. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table_exclusions.py +41 -40
  37. natural_pdf-0.2.6/tests/test_highlight_detection.py +40 -0
  38. natural_pdf-0.2.6/tests/test_highlight_detection_comprehensive.py +94 -0
  39. natural_pdf-0.2.6/tests/test_include_boundaries_comprehensive.py +124 -0
  40. natural_pdf-0.2.6/tests/test_include_boundaries_debug.py +67 -0
  41. natural_pdf-0.2.6/tests/test_include_boundaries_final.py +159 -0
  42. natural_pdf-0.2.6/tests/test_include_boundaries_final_verification.py +126 -0
  43. natural_pdf-0.2.6/tests/test_include_boundaries_fix.py +126 -0
  44. natural_pdf-0.2.6/tests/test_include_boundaries_mock.py +188 -0
  45. natural_pdf-0.2.6/tests/test_include_boundaries_simple.py +119 -0
  46. natural_pdf-0.2.6/tests/test_include_boundaries_types_pdf.py +113 -0
  47. natural_pdf-0.2.6/tests/test_include_boundaries_verification.py +134 -0
  48. natural_pdf-0.2.6/tests/test_include_boundaries_with_real_text.py +104 -0
  49. natural_pdf-0.2.6/tests/test_merge_connected.py +302 -0
  50. natural_pdf-0.2.6/tests/test_merge_connected_real_world.py +240 -0
  51. natural_pdf-0.2.6/tests/test_merge_method.py +185 -0
  52. natural_pdf-0.2.6/tests/test_sections_with_start_and_end.py +98 -0
  53. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_cache_reuse.py +43 -40
  54. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_fix.py +37 -34
  55. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_issue.py +22 -16
  56. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_slice_exclusion_mock.py +49 -49
  57. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_sliced_collection_exclusions.py +50 -42
  58. natural_pdf-0.2.5/tests/test_highlight_detection.py +0 -11
  59. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/analysis_framework.mdc +0 -0
  60. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/coding-style.mdc +0 -0
  61. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  62. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/minimal-comments.mdc +0 -0
  63. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  64. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  65. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/ci.yml +0 -0
  66. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/docs.yml +0 -0
  67. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.github/workflows/nightly-tutorials.yml +0 -0
  68. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/.pre-commit-config.yaml +0 -0
  69. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/01-execute_notebooks.py +0 -0
  70. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/02-run_all_tutorials.sh +0 -0
  71. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/CLAUDE.md +0 -0
  72. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/LICENSE +0 -0
  73. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/MANIFEST.in +0 -0
  74. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/README.md +0 -0
  75. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/audit_packaging.py +0 -0
  76. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/check_run_md.sh +0 -0
  77. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/api/index.md +0 -0
  78. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/favicon.png +0 -0
  79. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/favicon.svg +0 -0
  80. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/javascripts/custom.js +0 -0
  81. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/logo.svg +0 -0
  82. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/sample-screen.png +0 -0
  83. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/social-preview.png +0 -0
  84. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/social-preview.svg +0 -0
  85. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/assets/stylesheets/custom.css +0 -0
  86. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/categorizing-documents/index.md +0 -0
  87. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/data-extraction/index.md +0 -0
  88. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/describe/index.md +0 -0
  89. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/document-qa/index.md +0 -0
  90. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/element-selection/index.md +0 -0
  91. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/extracting-clean-text/index.md +0 -0
  92. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/finetuning/index.md +0 -0
  93. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/index.md +0 -0
  94. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_1.csv +0 -0
  95. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_2.csv +0 -0
  96. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_3.csv +0 -0
  97. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/index.md +0 -0
  98. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/installation/index.md +0 -0
  99. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/interactive-widget/index.md +0 -0
  100. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/layout-analysis/index.md +0 -0
  101. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/loops-and-groups/index.md +0 -0
  102. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/ocr/index.md +0 -0
  103. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/pdf-navigation/index.md +0 -0
  104. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  105. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/index.md +0 -0
  106. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/quick-reference/index.md +0 -0
  107. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/reflowing-pages/index.md +0 -0
  108. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/regions/index.md +0 -0
  109. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tables/index.md +0 -0
  110. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/text-analysis/index.md +0 -0
  111. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/01-loading-and-extraction.md +0 -0
  112. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/02-finding-elements.md +0 -0
  113. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/03-extracting-blocks.md +0 -0
  114. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/04-table-extraction.md +0 -0
  115. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/05-excluding-content.md +0 -0
  116. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/06-document-qa.md +0 -0
  117. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/07-layout-analysis.md +0 -0
  118. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/07-working-with-regions.md +0 -0
  119. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/08-spatial-navigation.md +0 -0
  120. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/09-section-extraction.md +0 -0
  121. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/10-form-field-extraction.md +0 -0
  122. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  123. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/12-ocr-integration.md +0 -0
  124. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/13-semantic-search.md +0 -0
  125. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/tutorials/14-categorizing-documents.md +0 -0
  126. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/visual-debugging/index.md +0 -0
  127. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/docs/visual-debugging/region.png +0 -0
  128. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/mkdocs.yml +0 -0
  129. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/__init__.py +0 -0
  130. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/__init__.py +0 -0
  131. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/__init__.py +0 -0
  132. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/base.py +0 -0
  133. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/docling.py +0 -0
  134. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/gemini.py +0 -0
  135. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  136. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  137. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  138. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/paddle.py +0 -0
  139. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  140. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/surya.py +0 -0
  141. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  142. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/tatr.py +0 -0
  143. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/yolo.py +0 -0
  144. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  145. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_options.py +0 -0
  146. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_structure.py +0 -0
  147. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/analyzers/utils.py +0 -0
  148. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/manager.py +0 -0
  149. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/mixin.py +0 -0
  150. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/classification/results.py +0 -0
  151. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/cli.py +0 -0
  152. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/collections/mixins.py +0 -0
  153. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/__init__.py +0 -0
  154. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/element_manager.py +0 -0
  155. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/highlighting_service.py +0 -0
  156. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/core/pdf_collection.py +0 -0
  157. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/__init__.py +0 -0
  158. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/mixin.py +0 -0
  159. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/describe/summary.py +0 -0
  160. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/__init__.py +0 -0
  161. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/image.py +0 -0
  162. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/line.py +0 -0
  163. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/rect.py +0 -0
  164. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/elements/text.py +0 -0
  165. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/export/mixin.py +0 -0
  166. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/__init__.py +0 -0
  167. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/base.py +0 -0
  168. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/__init__.py +0 -0
  169. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/pdf.ttf +0 -0
  170. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/data/sRGB.icc +0 -0
  171. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr.py +0 -0
  172. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr_font.py +0 -0
  173. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/original_pdf.py +0 -0
  174. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/paddleocr.py +0 -0
  175. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/exporters/searchable_pdf.py +0 -0
  176. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/manager.py +0 -0
  177. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/mixin.py +0 -0
  178. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/extraction/result.py +0 -0
  179. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/__init__.py +0 -0
  180. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/collections.py +0 -0
  181. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/element.py +0 -0
  182. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/flows/region.py +0 -0
  183. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/__init__.py +0 -0
  184. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine.py +0 -0
  185. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_doctr.py +0 -0
  186. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_easyocr.py +0 -0
  187. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_paddle.py +0 -0
  188. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_surya.py +0 -0
  189. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_factory.py +0 -0
  190. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_manager.py +0 -0
  191. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_options.py +0 -0
  192. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/ocr/utils.py +0 -0
  193. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/__init__.py +0 -0
  194. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/document_qa.py +0 -0
  195. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/qa/qa_result.py +0 -0
  196. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/__init__.py +0 -0
  197. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/lancedb_search_service.py +0 -0
  198. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/numpy_search_service.py +0 -0
  199. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/search_options.py +0 -0
  200. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/search_service_protocol.py +0 -0
  201. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/search/searchable_mixin.py +0 -0
  202. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/selectors/__init__.py +0 -0
  203. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/tables/__init__.py +0 -0
  204. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/tables/result.py +0 -0
  205. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/__init__.py +0 -0
  206. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  207. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/css/style.css +0 -0
  208. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/index.html +0 -0
  209. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/js/app.js +0 -0
  210. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/templates/spa/words.txt +0 -0
  211. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/text_mixin.py +0 -0
  212. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/__init__.py +0 -0
  213. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/bidi_mirror.py +0 -0
  214. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/debug.py +0 -0
  215. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/highlighting.py +0 -0
  216. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/identifiers.py +0 -0
  217. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/layout.py +0 -0
  218. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/locks.py +0 -0
  219. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/packaging.py +0 -0
  220. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/reading_order.py +0 -0
  221. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/text_extraction.py +0 -0
  222. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/utils/visualization.py +0 -0
  223. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/__init__.py +0 -0
  224. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/mixin.py +0 -0
  225. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/results.py +0 -0
  226. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/vision/similarity.py +0 -0
  227. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/widgets/__init__.py +0 -0
  228. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf/widgets/viewer.py +0 -0
  229. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/dependency_links.txt +0 -0
  230. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/entry_points.txt +0 -0
  231. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/natural_pdf.egg-info/requires.txt +0 -0
  232. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/noxfile.py +0 -0
  233. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/memory_comparison.py +0 -0
  234. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/pdf_analyzer.py +0 -0
  235. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_analysis.py +0 -0
  236. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  237. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  238. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  239. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  240. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/test_cleanup_methods.py +0 -0
  241. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/optimization/test_memory_fix.py +0 -0
  242. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/publish.sh +0 -0
  243. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/pyproject.toml +0 -0
  244. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/sample-screen.png +0 -0
  245. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/setup.cfg +0 -0
  246. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/conftest.py +0 -0
  247. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/exporters/test_paddleocr_exporter.py +0 -0
  248. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_annotate.py +0 -0
  249. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_arabic_performance.py +0 -0
  250. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_arabic_real_world.py +0 -0
  251. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_color_conversion.py +0 -0
  252. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_containment_geometry.py +0 -0
  253. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_elements.py +0 -0
  254. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_loading.py +0 -0
  255. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_spatial.py +0 -0
  256. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_text_extraction.py +0 -0
  257. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_core/test_text_layer.py +0 -0
  258. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_directional_defaults.py +0 -0
  259. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_document_qa.py +0 -0
  260. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_element_collection_slicing.py +0 -0
  261. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_element_show_crop_highlights.py +0 -0
  262. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_exclusions.py +0 -0
  263. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_expand.py +0 -0
  264. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_error.py +0 -0
  265. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_mixin_fix.py +0 -0
  266. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_text_and_vision.py +0 -0
  267. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_extraction_working.py +0 -0
  268. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_find_similar.py +0 -0
  269. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_first_last_selectors.py +0 -0
  270. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_flow_region_directional.py +0 -0
  271. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_groupby.py +0 -0
  272. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides.py +0 -0
  273. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions.py +0 -0
  274. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions_simple.py +0 -0
  275. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table.py +0 -0
  276. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_extract_table_real.py +0 -0
  277. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_guides_integration.py +0 -0
  278. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_protocol.py +0 -0
  279. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_protocol_simple.py +0 -0
  280. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_highlight_regions.py +0 -0
  281. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_loading_original.py +0 -0
  282. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_multi_page_table_discovery.py +0 -0
  283. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_optional_deps.py +0 -0
  284. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_page_exclusion_lists.py +0 -0
  285. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  286. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_region_show_crop_highlights.py +0 -0
  287. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_region_viewer.py +0 -0
  288. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_sections_end_only.py +0 -0
  289. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_column_layout.py +0 -0
  290. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_edge_cases.py +0 -0
  291. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_exclusions.py +0 -0
  292. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_exclusions_feature.py +0 -0
  293. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_show_limit.py +0 -0
  294. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_skip_repeating_headers_multipage.py +0 -0
  295. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_strikethrough_detection.py +0 -0
  296. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_table_result_header_mismatch.py +0 -0
  297. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_table_result_keep_blank.py +0 -0
  298. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tiny_text_tables.py +0 -0
  299. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tiny_text_tables_table.py +0 -0
  300. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_tutorials.py +0 -0
  301. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_underline_detection.py +0 -0
  302. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tests/test_update_text.py +0 -0
  303. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/todo/bad_pdf_analysis.md +0 -0
  304. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/todo/evaluation.md +0 -0
  305. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  306. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  307. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  308. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/README.md +0 -0
  309. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/__init__.py +0 -0
  310. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/analyser.py +0 -0
  311. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  312. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  313. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/eval_suite.py +0 -0
  314. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  315. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  316. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  317. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  318. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  319. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/reporter.py +0 -0
  320. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/tools/bad_pdf_eval/utils.py +0 -0
  321. {natural_pdf-0.2.5 → natural_pdf-0.2.6}/uv.lock +0 -0
@@ -1,3 +1,4 @@
1
+ temp
1
2
  Untitled*.ipynb
2
3
  importtime_output.txt
3
4
  .notebook_cache.json
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -3462,7 +3462,15 @@ class Guides:
3462
3462
 
3463
3463
  def extract_table(
3464
3464
  self,
3465
- target: Optional[Union["Page", "Region"]] = None,
3465
+ target: Optional[
3466
+ Union[
3467
+ "Page",
3468
+ "Region",
3469
+ "PageCollection",
3470
+ "ElementCollection",
3471
+ List[Union["Page", "Region"]],
3472
+ ]
3473
+ ] = None,
3466
3474
  source: str = "guides_temp",
3467
3475
  cell_padding: float = 0.5,
3468
3476
  include_outer_boundaries: bool = False,
@@ -3477,6 +3485,8 @@ class Guides:
3477
3485
  apply_exclusions: bool = True,
3478
3486
  *,
3479
3487
  multi_page: Literal["auto", True, False] = "auto",
3488
+ header: Union[str, List[str], None] = "first",
3489
+ skip_repeating_headers: Optional[bool] = None,
3480
3490
  ) -> "TableResult":
3481
3491
  """
3482
3492
  Extract table data directly from guides without leaving temporary regions.
@@ -3487,8 +3497,11 @@ class Guides:
3487
3497
  3. Cleans up all temporary regions
3488
3498
  4. Returns the TableResult
3489
3499
 
3500
+ When passed a collection (PageCollection, ElementCollection, or list), this method
3501
+ will extract tables from each element and combine them into a single result.
3502
+
3490
3503
  Args:
3491
- target: Page or Region to create regions on (uses self.context if None)
3504
+ target: Page, Region, or collection of Pages/Regions to extract from (uses self.context if None)
3492
3505
  source: Source label for temporary regions (will be cleaned up)
3493
3506
  cell_padding: Internal padding for cell regions in points
3494
3507
  include_outer_boundaries: Whether to add boundaries at edges if missing
@@ -3502,6 +3515,13 @@ class Guides:
3502
3515
  content_filter: Content filtering function or patterns
3503
3516
  apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
3504
3517
  multi_page: Controls multi-region table creation for FlowRegions
3518
+ header: How to handle headers when extracting from collections:
3519
+ - "first": Use first row of first element as headers (default)
3520
+ - "all": Expect headers on each element, use from first element
3521
+ - None: No headers, use numeric indices
3522
+ - List[str]: Custom column names
3523
+ skip_repeating_headers: Whether to remove duplicate header rows when extracting from collections.
3524
+ Defaults to True when header is "first" or "all", False otherwise.
3505
3525
 
3506
3526
  Returns:
3507
3527
  TableResult: Extracted table data
@@ -3513,20 +3533,49 @@ class Guides:
3513
3533
  ```python
3514
3534
  from natural_pdf.analyzers import Guides
3515
3535
 
3516
- # Create guides from detected lines
3536
+ # Single page extraction
3517
3537
  guides = Guides.from_lines(page, source_label="detected")
3518
-
3519
- # Extract table directly - no temporary regions left behind
3520
3538
  table_data = guides.extract_table()
3521
-
3522
- # Convert to pandas DataFrame
3523
3539
  df = table_data.to_df()
3540
+
3541
+ # Multiple page extraction
3542
+ guides = Guides(pages[0])
3543
+ guides.vertical.from_content(['Column 1', 'Column 2'])
3544
+ table_result = guides.extract_table(pages, header=['Col1', 'Col2'])
3545
+ df = table_result.to_df()
3546
+
3547
+ # Region collection extraction
3548
+ regions = pdf.find_all('region[type=table]')
3549
+ guides = Guides(regions[0])
3550
+ guides.vertical.from_lines(n=3)
3551
+ table_result = guides.extract_table(regions)
3524
3552
  ```
3525
3553
  """
3526
- target_obj = target or self.context
3527
- if not target_obj:
3554
+ from natural_pdf.core.page_collection import PageCollection
3555
+ from natural_pdf.elements.element_collection import ElementCollection
3556
+
3557
+ target_obj = target if target is not None else self.context
3558
+ if target_obj is None:
3528
3559
  raise ValueError("No target object available. Provide target parameter or context.")
3529
3560
 
3561
+ # Check if target is a collection - if so, delegate to _extract_table_from_collection
3562
+ if isinstance(target_obj, (PageCollection, ElementCollection, list)):
3563
+ # For collections, pass through most parameters as-is
3564
+ return self._extract_table_from_collection(
3565
+ elements=target_obj,
3566
+ header=header,
3567
+ skip_repeating_headers=skip_repeating_headers,
3568
+ method=method,
3569
+ table_settings=table_settings,
3570
+ use_ocr=use_ocr,
3571
+ ocr_config=ocr_config,
3572
+ text_options=text_options,
3573
+ cell_extraction_func=cell_extraction_func,
3574
+ show_progress=show_progress,
3575
+ content_filter=content_filter,
3576
+ apply_exclusions=apply_exclusions,
3577
+ )
3578
+
3530
3579
  # Get the page for cleanup later
3531
3580
  if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
3532
3581
  page = target_obj._page
@@ -3597,9 +3646,9 @@ class Guides:
3597
3646
  except Exception as cleanup_err:
3598
3647
  logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3599
3648
 
3600
- def extract_table_from_pages(
3649
+ def _extract_table_from_collection(
3601
3650
  self,
3602
- pages: Union["PageCollection", List["Page"]],
3651
+ elements: Union["PageCollection", "ElementCollection", List[Union["Page", "Region"]]],
3603
3652
  header: Union[str, List[str], None] = "first",
3604
3653
  skip_repeating_headers: Optional[bool] = None,
3605
3654
  method: Optional[str] = None,
@@ -3613,17 +3662,17 @@ class Guides:
3613
3662
  apply_exclusions: bool = True,
3614
3663
  ) -> "TableResult":
3615
3664
  """
3616
- Extract tables from multiple pages using this guide pattern.
3665
+ Extract tables from multiple pages or regions using this guide pattern.
3617
3666
 
3618
- This method applies the guide to each page, extracts tables, and combines
3667
+ This method applies the guide to each element, extracts tables, and combines
3619
3668
  them into a single TableResult. Dynamic guides (using lambdas) are evaluated
3620
- for each page.
3669
+ for each element.
3621
3670
 
3622
3671
  Args:
3623
- pages: PageCollection or list of Pages to extract from
3672
+ elements: PageCollection, ElementCollection, or list of Pages/Regions to extract from
3624
3673
  header: How to handle headers:
3625
- - "first": Use first row of first page as headers (default)
3626
- - "all": Expect headers on each page, use from first page
3674
+ - "first": Use first row of first element as headers (default)
3675
+ - "all": Expect headers on each element, use from first element
3627
3676
  - None: No headers, use numeric indices
3628
3677
  - List[str]: Custom column names
3629
3678
  skip_repeating_headers: Whether to remove duplicate header rows.
@@ -3634,35 +3683,36 @@ class Guides:
3634
3683
  ocr_config: OCR configuration parameters
3635
3684
  text_options: Dictionary of options for the 'text' method
3636
3685
  cell_extraction_func: Optional callable for custom cell text extraction
3637
- show_progress: Show progress bar for multi-page extraction (default: True)
3686
+ show_progress: Show progress bar for multi-element extraction (default: True)
3638
3687
  content_filter: Content filtering function or patterns
3639
3688
  apply_exclusions: Whether to apply exclusion regions during extraction
3640
3689
 
3641
3690
  Returns:
3642
- TableResult: Combined table data from all pages
3691
+ TableResult: Combined table data from all elements
3643
3692
 
3644
3693
  Example:
3645
3694
  ```python
3646
3695
  # Create guide with static vertical, dynamic horizontal
3647
- guide = Guides(pages[0])
3696
+ guide = Guides(regions[0])
3648
3697
  guide.vertical.from_content(columns, outer="last")
3649
- guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))
3698
+ guide.horizontal.from_content(lambda r: r.find_all('text:starts-with(NF-)'))
3650
3699
 
3651
- # Extract from all pages
3652
- table_result = guide.extract_table_from_pages(pages, header=columns)
3700
+ # Extract from all regions
3701
+ table_result = guide.extract_table(regions, header=columns)
3653
3702
  df = table_result.to_df()
3654
3703
  ```
3655
3704
  """
3656
3705
  from natural_pdf.core.page_collection import PageCollection
3706
+ from natural_pdf.elements.element_collection import ElementCollection
3657
3707
  from natural_pdf.tables.result import TableResult
3658
3708
 
3659
- # Convert to list if it's a PageCollection
3660
- if isinstance(pages, PageCollection):
3661
- page_list = list(pages)
3709
+ # Convert to list if it's a collection
3710
+ if isinstance(elements, (PageCollection, ElementCollection)):
3711
+ element_list = list(elements)
3662
3712
  else:
3663
- page_list = pages
3713
+ element_list = elements
3664
3714
 
3665
- if not page_list:
3715
+ if not element_list:
3666
3716
  return TableResult([])
3667
3717
 
3668
3718
  # Determine header handling
@@ -3673,37 +3723,39 @@ class Guides:
3673
3723
  header_row = None
3674
3724
 
3675
3725
  # Configure progress bar
3676
- iterator = page_list
3677
- if show_progress and len(page_list) > 1:
3726
+ iterator = element_list
3727
+ if show_progress and len(element_list) > 1:
3678
3728
  try:
3679
3729
  from tqdm.auto import tqdm
3680
3730
 
3681
- iterator = tqdm(page_list, desc="Extracting tables from pages", unit="page")
3731
+ iterator = tqdm(
3732
+ element_list, desc="Extracting tables from elements", unit="element"
3733
+ )
3682
3734
  except ImportError:
3683
3735
  pass
3684
3736
 
3685
- for i, page in enumerate(iterator):
3686
- # Create a new Guides object for this page
3687
- page_guide = Guides(page)
3737
+ for i, element in enumerate(iterator):
3738
+ # Create a new Guides object for this element
3739
+ element_guide = Guides(element)
3688
3740
 
3689
3741
  # Copy vertical guides (usually static)
3690
3742
  if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
3691
3743
  # If vertical is dynamic (lambda), evaluate it
3692
- page_guide.vertical.from_content(self.vertical._callable(page))
3744
+ element_guide.vertical.from_content(self.vertical._callable(element))
3693
3745
  else:
3694
3746
  # Copy static vertical positions
3695
- page_guide.vertical.data = self.vertical.data.copy()
3747
+ element_guide.vertical.data = self.vertical.data.copy()
3696
3748
 
3697
3749
  # Handle horizontal guides
3698
3750
  if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
3699
3751
  # If horizontal is dynamic (lambda), evaluate it
3700
- page_guide.horizontal.from_content(self.horizontal._callable(page))
3752
+ element_guide.horizontal.from_content(self.horizontal._callable(element))
3701
3753
  else:
3702
3754
  # Copy static horizontal positions
3703
- page_guide.horizontal.data = self.horizontal.data.copy()
3755
+ element_guide.horizontal.data = self.horizontal.data.copy()
3704
3756
 
3705
- # Extract table from this page
3706
- table_result = page_guide.extract_table(
3757
+ # Extract table from this element
3758
+ table_result = element_guide.extract_table(
3707
3759
  method=method,
3708
3760
  table_settings=table_settings,
3709
3761
  use_ocr=use_ocr,
@@ -3719,7 +3771,7 @@ class Guides:
3719
3771
  rows = list(table_result)
3720
3772
 
3721
3773
  # Handle headers based on strategy
3722
- if i == 0: # First page
3774
+ if i == 0: # First element
3723
3775
  if header == "first" or header == "all":
3724
3776
  # Use first row as header
3725
3777
  if rows:
@@ -3728,7 +3780,7 @@ class Guides:
3728
3780
  elif isinstance(header, list):
3729
3781
  # Custom headers provided
3730
3782
  header_row = header
3731
- else: # Subsequent pages
3783
+ else: # Subsequent elements
3732
3784
  if header == "all" and skip_repeating_headers and rows:
3733
3785
  # Expect and remove header row
3734
3786
  if rows and header_row and rows[0] == header_row:
@@ -2525,11 +2525,20 @@ class Page(
2525
2525
  include_boundaries="start",
2526
2526
  y_threshold=5.0,
2527
2527
  bounding_box=None,
2528
+ orientation="vertical",
2528
2529
  ) -> "ElementCollection[Region]":
2529
2530
  """
2530
2531
  Get sections of a page defined by start/end elements.
2531
2532
  Uses the page-level implementation.
2532
2533
 
2534
+ Args:
2535
+ start_elements: Elements or selector string that mark the start of sections
2536
+ end_elements: Elements or selector string that mark the end of sections
2537
+ include_boundaries: How to include boundary elements: 'start', 'end', 'both', or 'none'
2538
+ y_threshold: Threshold for vertical alignment (only used for vertical orientation)
2539
+ bounding_box: Optional bounding box to constrain sections
2540
+ orientation: 'vertical' (default) or 'horizontal' - determines section direction
2541
+
2533
2542
  Returns:
2534
2543
  An ElementCollection containing the found Region objects.
2535
2544
  """
@@ -2577,11 +2586,14 @@ class Page(
2577
2586
  for el in end_elements:
2578
2587
  all_boundaries.append((el, "end"))
2579
2588
 
2580
- # Sort all boundary elements primarily by top, then x0
2589
+ # Sort all boundary elements based on orientation
2581
2590
  try:
2582
- all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
2591
+ if orientation == "vertical":
2592
+ all_boundaries.sort(key=lambda x: (x[0].top, x[0].x0))
2593
+ else: # horizontal
2594
+ all_boundaries.sort(key=lambda x: (x[0].x0, x[0].top))
2583
2595
  except AttributeError as e:
2584
- logger.error(f"Error sorting boundaries: Element missing top/x0 attribute? {e}")
2596
+ logger.error(f"Error sorting boundaries: Element missing position attribute? {e}")
2585
2597
  return ElementCollection([]) # Cannot proceed if elements lack position
2586
2598
 
2587
2599
  # Process sorted boundaries to find sections
@@ -2593,72 +2605,126 @@ class Page(
2593
2605
  # If we have an active section, this start implicitly ends it
2594
2606
  if active_section_started:
2595
2607
  end_boundary_el = element # Use this start as the end boundary
2596
- # Determine region boundaries
2608
+ # Determine region boundaries based on orientation
2609
+ if orientation == "vertical":
2610
+ sec_top = (
2611
+ current_start_element.top
2612
+ if include_boundaries in ["start", "both"]
2613
+ else current_start_element.bottom
2614
+ )
2615
+ sec_bottom = (
2616
+ end_boundary_el.top
2617
+ if include_boundaries not in ["end", "both"]
2618
+ else end_boundary_el.bottom
2619
+ )
2620
+
2621
+ if sec_top < sec_bottom: # Ensure valid region
2622
+ x0, _, x1, _ = get_bounds()
2623
+ region = self.create_region(x0, sec_top, x1, sec_bottom)
2624
+ region.start_element = current_start_element
2625
+ region.end_element = end_boundary_el # Mark the element that ended it
2626
+ region.is_end_next_start = True # Mark how it ended
2627
+ regions.append(region)
2628
+ else: # horizontal
2629
+ sec_left = (
2630
+ current_start_element.x0
2631
+ if include_boundaries in ["start", "both"]
2632
+ else current_start_element.x1
2633
+ )
2634
+ sec_right = (
2635
+ end_boundary_el.x0
2636
+ if include_boundaries not in ["end", "both"]
2637
+ else end_boundary_el.x1
2638
+ )
2639
+
2640
+ if sec_left < sec_right: # Ensure valid region
2641
+ _, y0, _, y1 = get_bounds()
2642
+ region = self.create_region(sec_left, y0, sec_right, y1)
2643
+ region.start_element = current_start_element
2644
+ region.end_element = end_boundary_el # Mark the element that ended it
2645
+ region.is_end_next_start = True # Mark how it ended
2646
+ regions.append(region)
2647
+ active_section_started = False # Reset for the new start
2648
+
2649
+ # Set this as the potential start of the next section
2650
+ current_start_element = element
2651
+ active_section_started = True
2652
+
2653
+ elif element_type == "end" and active_section_started:
2654
+ # We found an explicit end for the current section
2655
+ end_boundary_el = element
2656
+ if orientation == "vertical":
2597
2657
  sec_top = (
2598
2658
  current_start_element.top
2599
2659
  if include_boundaries in ["start", "both"]
2600
2660
  else current_start_element.bottom
2601
2661
  )
2602
2662
  sec_bottom = (
2603
- end_boundary_el.top
2604
- if include_boundaries not in ["end", "both"]
2605
- else end_boundary_el.bottom
2663
+ end_boundary_el.bottom
2664
+ if include_boundaries in ["end", "both"]
2665
+ else end_boundary_el.top
2606
2666
  )
2607
2667
 
2608
2668
  if sec_top < sec_bottom: # Ensure valid region
2609
2669
  x0, _, x1, _ = get_bounds()
2610
2670
  region = self.create_region(x0, sec_top, x1, sec_bottom)
2611
2671
  region.start_element = current_start_element
2612
- region.end_element = end_boundary_el # Mark the element that ended it
2613
- region.is_end_next_start = True # Mark how it ended
2672
+ region.end_element = end_boundary_el
2673
+ region.is_end_next_start = False
2614
2674
  regions.append(region)
2615
- active_section_started = False # Reset for the new start
2675
+ else: # horizontal
2676
+ sec_left = (
2677
+ current_start_element.x0
2678
+ if include_boundaries in ["start", "both"]
2679
+ else current_start_element.x1
2680
+ )
2681
+ sec_right = (
2682
+ end_boundary_el.x1
2683
+ if include_boundaries in ["end", "both"]
2684
+ else end_boundary_el.x0
2685
+ )
2616
2686
 
2617
- # Set this as the potential start of the next section
2618
- current_start_element = element
2619
- active_section_started = True
2687
+ if sec_left < sec_right: # Ensure valid region
2688
+ _, y0, _, y1 = get_bounds()
2689
+ region = self.create_region(sec_left, y0, sec_right, y1)
2690
+ region.start_element = current_start_element
2691
+ region.end_element = end_boundary_el
2692
+ region.is_end_next_start = False
2693
+ regions.append(region)
2620
2694
 
2621
- elif element_type == "end" and active_section_started:
2622
- # We found an explicit end for the current section
2623
- end_boundary_el = element
2695
+ # Reset: section ended explicitly
2696
+ current_start_element = None
2697
+ active_section_started = False
2698
+
2699
+ # Handle the last section if it was started but never explicitly ended
2700
+ if active_section_started:
2701
+ if orientation == "vertical":
2624
2702
  sec_top = (
2625
2703
  current_start_element.top
2626
2704
  if include_boundaries in ["start", "both"]
2627
2705
  else current_start_element.bottom
2628
2706
  )
2629
- sec_bottom = (
2630
- end_boundary_el.bottom
2631
- if include_boundaries in ["end", "both"]
2632
- else end_boundary_el.top
2707
+ x0, _, x1, page_bottom = get_bounds()
2708
+ if sec_top < page_bottom:
2709
+ region = self.create_region(x0, sec_top, x1, page_bottom)
2710
+ region.start_element = current_start_element
2711
+ region.end_element = None # Ended by page end
2712
+ region.is_end_next_start = False
2713
+ regions.append(region)
2714
+ else: # horizontal
2715
+ sec_left = (
2716
+ current_start_element.x0
2717
+ if include_boundaries in ["start", "both"]
2718
+ else current_start_element.x1
2633
2719
  )
2634
-
2635
- if sec_top < sec_bottom: # Ensure valid region
2636
- x0, _, x1, _ = get_bounds()
2637
- region = self.create_region(x0, sec_top, x1, sec_bottom)
2720
+ page_left, y0, page_right, y1 = get_bounds()
2721
+ if sec_left < page_right:
2722
+ region = self.create_region(sec_left, y0, page_right, y1)
2638
2723
  region.start_element = current_start_element
2639
- region.end_element = end_boundary_el
2724
+ region.end_element = None # Ended by page end
2640
2725
  region.is_end_next_start = False
2641
2726
  regions.append(region)
2642
2727
 
2643
- # Reset: section ended explicitly
2644
- current_start_element = None
2645
- active_section_started = False
2646
-
2647
- # Handle the last section if it was started but never explicitly ended
2648
- if active_section_started:
2649
- sec_top = (
2650
- current_start_element.top
2651
- if include_boundaries in ["start", "both"]
2652
- else current_start_element.bottom
2653
- )
2654
- x0, _, x1, page_bottom = get_bounds()
2655
- if sec_top < page_bottom:
2656
- region = self.create_region(x0, sec_top, x1, page_bottom)
2657
- region.start_element = current_start_element
2658
- region.end_element = None # Ended by page end
2659
- region.is_end_next_start = False
2660
- regions.append(region)
2661
-
2662
2728
  return ElementCollection(regions)
2663
2729
 
2664
2730
  def __repr__(self) -> str: