natural-pdf 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (322)
  1. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.gitignore +1 -0
  2. {natural_pdf-0.2.4/natural_pdf.egg-info → natural_pdf-0.2.6}/PKG-INFO +1 -1
  3. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/guides.py +246 -18
  4. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/element_manager.py +5 -0
  5. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page.py +150 -48
  6. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page_collection.py +223 -34
  7. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/page_groupby.py +20 -2
  8. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/pdf.py +44 -2
  9. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/render_spec.py +20 -5
  10. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/base.py +1 -1
  11. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/elements.py +1 -1
  12. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/base.py +84 -8
  13. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/element_collection.py +730 -12
  14. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/region.py +213 -61
  15. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/flow.py +3 -0
  16. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/selectors/parser.py +2 -2
  17. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/tables/result.py +39 -6
  18. natural_pdf-0.2.6/natural_pdf/utils/color_utils.py +100 -0
  19. {natural_pdf-0.2.4 → natural_pdf-0.2.6/natural_pdf.egg-info}/PKG-INFO +1 -1
  20. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/SOURCES.txt +38 -1
  21. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/top_level.txt +1 -0
  22. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_arabic_real_world.py +0 -3
  23. natural_pdf-0.2.6/tests/test_color_hex_display.py +194 -0
  24. natural_pdf-0.2.6/tests/test_crop_enhancements.py +149 -0
  25. natural_pdf-0.2.6/tests/test_crop_region_highlights.py +119 -0
  26. natural_pdf-0.2.6/tests/test_dissolve.py +471 -0
  27. natural_pdf-0.2.6/tests/test_dissolve_cross_page_bug.py +155 -0
  28. natural_pdf-0.2.6/tests/test_dissolve_debug_issue.py +195 -0
  29. natural_pdf-0.2.6/tests/test_dissolve_real_world_issue.py +201 -0
  30. natural_pdf-0.2.6/tests/test_dissolve_single_elements.py +159 -0
  31. natural_pdf-0.2.6/tests/test_dissolve_vertical_offset_issue.py +139 -0
  32. natural_pdf-0.2.6/tests/test_element_addition.py +176 -0
  33. natural_pdf-0.2.6/tests/test_element_collection_show_cols.py +132 -0
  34. natural_pdf-0.2.6/tests/test_empty_pseudo_class.py +215 -0
  35. natural_pdf-0.2.6/tests/test_fix_get_sections_zero_height.py +120 -0
  36. natural_pdf-0.2.6/tests/test_get_sections_fix_comprehensive.py +183 -0
  37. natural_pdf-0.2.6/tests/test_get_sections_zero_height.py +179 -0
  38. natural_pdf-0.2.6/tests/test_guides_extract_table_collections.py +165 -0
  39. natural_pdf-0.2.6/tests/test_guides_extract_table_exclusions.py +181 -0
  40. natural_pdf-0.2.6/tests/test_highlight_detection.py +40 -0
  41. natural_pdf-0.2.6/tests/test_highlight_detection_comprehensive.py +94 -0
  42. natural_pdf-0.2.6/tests/test_include_boundaries_comprehensive.py +124 -0
  43. natural_pdf-0.2.6/tests/test_include_boundaries_debug.py +67 -0
  44. natural_pdf-0.2.6/tests/test_include_boundaries_final.py +159 -0
  45. natural_pdf-0.2.6/tests/test_include_boundaries_final_verification.py +126 -0
  46. natural_pdf-0.2.6/tests/test_include_boundaries_fix.py +126 -0
  47. natural_pdf-0.2.6/tests/test_include_boundaries_mock.py +188 -0
  48. natural_pdf-0.2.6/tests/test_include_boundaries_simple.py +119 -0
  49. natural_pdf-0.2.6/tests/test_include_boundaries_types_pdf.py +113 -0
  50. natural_pdf-0.2.6/tests/test_include_boundaries_verification.py +134 -0
  51. natural_pdf-0.2.6/tests/test_include_boundaries_with_real_text.py +104 -0
  52. natural_pdf-0.2.6/tests/test_merge_connected.py +302 -0
  53. natural_pdf-0.2.6/tests/test_merge_connected_real_world.py +240 -0
  54. natural_pdf-0.2.6/tests/test_merge_method.py +185 -0
  55. natural_pdf-0.2.6/tests/test_sections_with_start_and_end.py +98 -0
  56. natural_pdf-0.2.6/tests/test_slice_cache_reuse.py +202 -0
  57. natural_pdf-0.2.6/tests/test_slice_exclusion_fix.py +148 -0
  58. natural_pdf-0.2.6/tests/test_slice_exclusion_issue.py +78 -0
  59. natural_pdf-0.2.6/tests/test_slice_exclusion_mock.py +158 -0
  60. natural_pdf-0.2.6/tests/test_sliced_collection_exclusions.py +166 -0
  61. natural_pdf-0.2.4/test_install.sh +0 -46
  62. natural_pdf-0.2.4/tests/test_highlight_detection.py +0 -11
  63. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/analysis_framework.mdc +0 -0
  64. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/coding-style.mdc +0 -0
  65. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  66. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/minimal-comments.mdc +0 -0
  67. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  68. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  69. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/ci.yml +0 -0
  70. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/docs.yml +0 -0
  71. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.github/workflows/nightly-tutorials.yml +0 -0
  72. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/.pre-commit-config.yaml +0 -0
  73. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/01-execute_notebooks.py +0 -0
  74. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/02-run_all_tutorials.sh +0 -0
  75. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/CLAUDE.md +0 -0
  76. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/LICENSE +0 -0
  77. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/MANIFEST.in +0 -0
  78. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/README.md +0 -0
  79. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/audit_packaging.py +0 -0
  80. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/check_run_md.sh +0 -0
  81. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/api/index.md +0 -0
  82. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/favicon.png +0 -0
  83. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/favicon.svg +0 -0
  84. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/javascripts/custom.js +0 -0
  85. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/logo.svg +0 -0
  86. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/sample-screen.png +0 -0
  87. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/social-preview.png +0 -0
  88. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/social-preview.svg +0 -0
  89. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/assets/stylesheets/custom.css +0 -0
  90. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/categorizing-documents/index.md +0 -0
  91. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/data-extraction/index.md +0 -0
  92. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/describe/index.md +0 -0
  93. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/document-qa/index.md +0 -0
  94. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/element-selection/index.md +0 -0
  95. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/extracting-clean-text/index.md +0 -0
  96. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/finetuning/index.md +0 -0
  97. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/index.md +0 -0
  98. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_1.csv +0 -0
  99. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_2.csv +0 -0
  100. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/fix-messy-tables/table_3.csv +0 -0
  101. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/index.md +0 -0
  102. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/installation/index.md +0 -0
  103. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/interactive-widget/index.md +0 -0
  104. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/layout-analysis/index.md +0 -0
  105. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/loops-and-groups/index.md +0 -0
  106. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/ocr/index.md +0 -0
  107. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/pdf-navigation/index.md +0 -0
  108. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  109. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/process-forms-and-invoices/index.md +0 -0
  110. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/quick-reference/index.md +0 -0
  111. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/reflowing-pages/index.md +0 -0
  112. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/regions/index.md +0 -0
  113. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tables/index.md +0 -0
  114. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/text-analysis/index.md +0 -0
  115. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/01-loading-and-extraction.md +0 -0
  116. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/02-finding-elements.md +0 -0
  117. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/03-extracting-blocks.md +0 -0
  118. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/04-table-extraction.md +0 -0
  119. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/05-excluding-content.md +0 -0
  120. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/06-document-qa.md +0 -0
  121. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/07-layout-analysis.md +0 -0
  122. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/07-working-with-regions.md +0 -0
  123. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/08-spatial-navigation.md +0 -0
  124. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/09-section-extraction.md +0 -0
  125. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/10-form-field-extraction.md +0 -0
  126. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  127. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/12-ocr-integration.md +0 -0
  128. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/13-semantic-search.md +0 -0
  129. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/tutorials/14-categorizing-documents.md +0 -0
  130. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/visual-debugging/index.md +0 -0
  131. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/docs/visual-debugging/region.png +0 -0
  132. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/mkdocs.yml +0 -0
  133. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/__init__.py +0 -0
  134. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/__init__.py +0 -0
  135. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/__init__.py +0 -0
  136. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/base.py +0 -0
  137. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/docling.py +0 -0
  138. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/gemini.py +0 -0
  139. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  140. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  141. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  142. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/paddle.py +0 -0
  143. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  144. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/surya.py +0 -0
  145. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  146. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/tatr.py +0 -0
  147. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/layout/yolo.py +0 -0
  148. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  149. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_options.py +0 -0
  150. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/text_structure.py +0 -0
  151. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/analyzers/utils.py +0 -0
  152. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/manager.py +0 -0
  153. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/mixin.py +0 -0
  154. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/classification/results.py +0 -0
  155. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/cli.py +0 -0
  156. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/collections/mixins.py +0 -0
  157. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/__init__.py +0 -0
  158. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/highlighting_service.py +0 -0
  159. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/core/pdf_collection.py +0 -0
  160. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/__init__.py +0 -0
  161. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/mixin.py +0 -0
  162. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/describe/summary.py +0 -0
  163. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/__init__.py +0 -0
  164. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/image.py +0 -0
  165. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/line.py +0 -0
  166. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/rect.py +0 -0
  167. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/elements/text.py +0 -0
  168. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/export/mixin.py +0 -0
  169. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/__init__.py +0 -0
  170. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/base.py +0 -0
  171. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/__init__.py +0 -0
  172. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/pdf.ttf +0 -0
  173. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/data/sRGB.icc +0 -0
  174. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr.py +0 -0
  175. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/hocr_font.py +0 -0
  176. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/original_pdf.py +0 -0
  177. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/paddleocr.py +0 -0
  178. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/exporters/searchable_pdf.py +0 -0
  179. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/manager.py +0 -0
  180. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/mixin.py +0 -0
  181. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/extraction/result.py +0 -0
  182. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/__init__.py +0 -0
  183. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/collections.py +0 -0
  184. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/element.py +0 -0
  185. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/flows/region.py +0 -0
  186. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/__init__.py +0 -0
  187. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine.py +0 -0
  188. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_doctr.py +0 -0
  189. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_easyocr.py +0 -0
  190. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_paddle.py +0 -0
  191. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/engine_surya.py +0 -0
  192. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_factory.py +0 -0
  193. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_manager.py +0 -0
  194. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/ocr_options.py +0 -0
  195. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/ocr/utils.py +0 -0
  196. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/__init__.py +0 -0
  197. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/document_qa.py +0 -0
  198. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/qa/qa_result.py +0 -0
  199. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/__init__.py +0 -0
  200. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/lancedb_search_service.py +0 -0
  201. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/numpy_search_service.py +0 -0
  202. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/search_options.py +0 -0
  203. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/search_service_protocol.py +0 -0
  204. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/search/searchable_mixin.py +0 -0
  205. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/selectors/__init__.py +0 -0
  206. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/tables/__init__.py +0 -0
  207. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/__init__.py +0 -0
  208. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  209. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/css/style.css +0 -0
  210. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/index.html +0 -0
  211. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/js/app.js +0 -0
  212. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/templates/spa/words.txt +0 -0
  213. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/text_mixin.py +0 -0
  214. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/__init__.py +0 -0
  215. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/bidi_mirror.py +0 -0
  216. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/debug.py +0 -0
  217. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/highlighting.py +0 -0
  218. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/identifiers.py +0 -0
  219. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/layout.py +0 -0
  220. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/locks.py +0 -0
  221. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/packaging.py +0 -0
  222. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/reading_order.py +0 -0
  223. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/text_extraction.py +0 -0
  224. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/utils/visualization.py +0 -0
  225. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/__init__.py +0 -0
  226. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/mixin.py +0 -0
  227. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/results.py +0 -0
  228. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/vision/similarity.py +0 -0
  229. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/widgets/__init__.py +0 -0
  230. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf/widgets/viewer.py +0 -0
  231. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/dependency_links.txt +0 -0
  232. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/entry_points.txt +0 -0
  233. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/natural_pdf.egg-info/requires.txt +0 -0
  234. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/noxfile.py +0 -0
  235. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/memory_comparison.py +0 -0
  236. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/pdf_analyzer.py +0 -0
  237. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_analysis.py +0 -0
  238. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  239. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  240. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  241. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  242. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/test_cleanup_methods.py +0 -0
  243. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/optimization/test_memory_fix.py +0 -0
  244. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/publish.sh +0 -0
  245. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/pyproject.toml +0 -0
  246. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/sample-screen.png +0 -0
  247. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/setup.cfg +0 -0
  248. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/conftest.py +0 -0
  249. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/exporters/test_paddleocr_exporter.py +0 -0
  250. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_annotate.py +0 -0
  251. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_arabic_performance.py +0 -0
  252. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_color_conversion.py +0 -0
  253. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_containment_geometry.py +0 -0
  254. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_elements.py +0 -0
  255. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_loading.py +0 -0
  256. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_spatial.py +0 -0
  257. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_text_extraction.py +0 -0
  258. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_core/test_text_layer.py +0 -0
  259. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_directional_defaults.py +0 -0
  260. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_document_qa.py +0 -0
  261. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_element_collection_slicing.py +0 -0
  262. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_element_show_crop_highlights.py +0 -0
  263. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_exclusions.py +0 -0
  264. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_expand.py +0 -0
  265. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_error.py +0 -0
  266. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_mixin_fix.py +0 -0
  267. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_text_and_vision.py +0 -0
  268. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_extraction_working.py +0 -0
  269. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_find_similar.py +0 -0
  270. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_first_last_selectors.py +0 -0
  271. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_flow_region_directional.py +0 -0
  272. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_groupby.py +0 -0
  273. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides.py +0 -0
  274. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions.py +0 -0
  275. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_apply_exclusions_simple.py +0 -0
  276. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_extract_table.py +0 -0
  277. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_extract_table_real.py +0 -0
  278. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_guides_integration.py +0 -0
  279. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_protocol.py +0 -0
  280. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_protocol_simple.py +0 -0
  281. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_highlight_regions.py +0 -0
  282. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_loading_original.py +0 -0
  283. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_multi_page_table_discovery.py +0 -0
  284. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_optional_deps.py +0 -0
  285. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_page_exclusion_lists.py +0 -0
  286. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  287. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_region_show_crop_highlights.py +0 -0
  288. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_region_viewer.py +0 -0
  289. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_sections_end_only.py +0 -0
  290. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_column_layout.py +0 -0
  291. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_edge_cases.py +0 -0
  292. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_exclusions.py +0 -0
  293. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_exclusions_feature.py +0 -0
  294. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_show_limit.py +0 -0
  295. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_skip_repeating_headers_multipage.py +0 -0
  296. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_strikethrough_detection.py +0 -0
  297. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_table_result_header_mismatch.py +0 -0
  298. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_table_result_keep_blank.py +0 -0
  299. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tiny_text_tables.py +0 -0
  300. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tiny_text_tables_table.py +0 -0
  301. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_tutorials.py +0 -0
  302. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_underline_detection.py +0 -0
  303. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tests/test_update_text.py +0 -0
  304. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/todo/bad_pdf_analysis.md +0 -0
  305. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/todo/evaluation.md +0 -0
  306. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  307. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  308. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  309. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/README.md +0 -0
  310. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/__init__.py +0 -0
  311. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/analyser.py +0 -0
  312. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  313. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  314. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/eval_suite.py +0 -0
  315. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  316. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  317. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  318. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  319. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  320. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/reporter.py +0 -0
  321. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/tools/bad_pdf_eval/utils.py +0 -0
  322. {natural_pdf-0.2.4 → natural_pdf-0.2.6}/uv.lock +0 -0
@@ -1,3 +1,4 @@
1
+ temp
1
2
  Untitled*.ipynb
2
3
  importtime_output.txt
3
4
  .notebook_cache.json
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -143,7 +143,7 @@ class GuidesList(UserList):
143
143
 
144
144
  def from_content(
145
145
  self,
146
- markers: Union[str, List[str], "ElementCollection", None],
146
+ markers: Union[str, List[str], "ElementCollection", Callable, None],
147
147
  obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
148
148
  align: Literal["left", "right", "center", "between"] = "left",
149
149
  outer: bool = True,
@@ -160,6 +160,7 @@ class GuidesList(UserList):
160
160
  - str: single selector (e.g., 'text:contains("Name")') or literal text
161
161
  - List[str]: list of selectors or literal text strings
162
162
  - ElementCollection: collection of elements to extract text from
163
+ - Callable: function that takes a page and returns markers
163
164
  - None: no markers
164
165
  obj: Page/Region/FlowRegion to search (uses parent's context if None)
165
166
  align: How to align guides relative to found elements
@@ -174,13 +175,22 @@ class GuidesList(UserList):
174
175
  if target_obj is None:
175
176
  raise ValueError("No object provided and no context available")
176
177
 
178
+ # Store callable markers for later evaluation
179
+ if callable(markers):
180
+ self._callable = markers
181
+ # For now, evaluate with the current target object to get initial guides
182
+ actual_markers = markers(target_obj)
183
+ else:
184
+ self._callable = None
185
+ actual_markers = markers
186
+
177
187
  # Check if parent is in flow mode
178
188
  if self._parent.is_flow_region:
179
189
  # Create guides across all constituent regions
180
190
  all_guides = []
181
191
  for region in self._parent.context.constituent_regions:
182
192
  # Normalize markers for this region
183
- marker_texts = _normalize_markers(markers, region)
193
+ marker_texts = _normalize_markers(actual_markers, region)
184
194
 
185
195
  # Create guides for this region
186
196
  region_guides = Guides.from_content(
@@ -263,7 +273,7 @@ class GuidesList(UserList):
263
273
 
264
274
  # Original single-region logic
265
275
  # Normalize markers to list of text strings
266
- marker_texts = _normalize_markers(markers, target_obj)
276
+ marker_texts = _normalize_markers(actual_markers, target_obj)
267
277
 
268
278
  # Create guides for this axis
269
279
  new_guides = Guides.from_content(
@@ -1541,11 +1551,15 @@ class Guides:
1541
1551
  # Add outer guides if requested
1542
1552
  if outer and bounds:
1543
1553
  if axis == "vertical":
1544
- guides_coords.insert(0, bounds[0]) # x0
1545
- guides_coords.append(bounds[2]) # x1
1554
+ if outer == True or outer == "first":
1555
+ guides_coords.insert(0, bounds[0]) # x0
1556
+ if outer == True or outer == "last":
1557
+ guides_coords.append(bounds[2]) # x1
1546
1558
  else:
1547
- guides_coords.insert(0, bounds[1]) # y0
1548
- guides_coords.append(bounds[3]) # y1
1559
+ if outer == True or outer == "first":
1560
+ guides_coords.insert(0, bounds[1]) # y0
1561
+ if outer == True or outer == "last":
1562
+ guides_coords.append(bounds[3]) # y1
1549
1563
 
1550
1564
  # Remove duplicates and sort
1551
1565
  guides_coords = sorted(list(set(guides_coords)))
@@ -3302,7 +3316,7 @@ class Guides:
3302
3316
  markers: Union[str, List[str], "ElementCollection", None] = None,
3303
3317
  obj: Optional[Union["Page", "Region"]] = None,
3304
3318
  align: Literal["left", "right", "center", "between"] = "left",
3305
- outer: bool = True,
3319
+ outer: Union[str, bool] = True,
3306
3320
  tolerance: float = 5,
3307
3321
  apply_exclusions: bool = True,
3308
3322
  ) -> "Guides":
@@ -3319,7 +3333,10 @@ class Guides:
3319
3333
  - None: no markers
3320
3334
  obj: Page or Region to search (uses self.context if None)
3321
3335
  align: How to align guides relative to found elements
3322
- outer: Whether to add outer boundary guides
3336
+ outer: Whether to add outer boundary guides. Can be:
3337
+ - bool: True/False to add/not add both
3338
+ - "first": To add boundary before the first element
3339
+ - "last": To add boundary after the last element
3323
3340
  tolerance: Tolerance for snapping to element edges
3324
3341
  apply_exclusions: Whether to apply exclusion zones when searching for text
3325
3342
 
@@ -3445,7 +3462,15 @@ class Guides:
3445
3462
 
3446
3463
  def extract_table(
3447
3464
  self,
3448
- target: Optional[Union["Page", "Region"]] = None,
3465
+ target: Optional[
3466
+ Union[
3467
+ "Page",
3468
+ "Region",
3469
+ "PageCollection",
3470
+ "ElementCollection",
3471
+ List[Union["Page", "Region"]],
3472
+ ]
3473
+ ] = None,
3449
3474
  source: str = "guides_temp",
3450
3475
  cell_padding: float = 0.5,
3451
3476
  include_outer_boundaries: bool = False,
@@ -3457,8 +3482,11 @@ class Guides:
3457
3482
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
3458
3483
  show_progress: bool = False,
3459
3484
  content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
3485
+ apply_exclusions: bool = True,
3460
3486
  *,
3461
3487
  multi_page: Literal["auto", True, False] = "auto",
3488
+ header: Union[str, List[str], None] = "first",
3489
+ skip_repeating_headers: Optional[bool] = None,
3462
3490
  ) -> "TableResult":
3463
3491
  """
3464
3492
  Extract table data directly from guides without leaving temporary regions.
@@ -3469,8 +3497,11 @@ class Guides:
3469
3497
  3. Cleans up all temporary regions
3470
3498
  4. Returns the TableResult
3471
3499
 
3500
+ When passed a collection (PageCollection, ElementCollection, or list), this method
3501
+ will extract tables from each element and combine them into a single result.
3502
+
3472
3503
  Args:
3473
- target: Page or Region to create regions on (uses self.context if None)
3504
+ target: Page, Region, or collection of Pages/Regions to extract from (uses self.context if None)
3474
3505
  source: Source label for temporary regions (will be cleaned up)
3475
3506
  cell_padding: Internal padding for cell regions in points
3476
3507
  include_outer_boundaries: Whether to add boundaries at edges if missing
@@ -3482,7 +3513,15 @@ class Guides:
3482
3513
  cell_extraction_func: Optional callable for custom cell text extraction
3483
3514
  show_progress: Controls progress bar for text method
3484
3515
  content_filter: Content filtering function or patterns
3516
+ apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
3485
3517
  multi_page: Controls multi-region table creation for FlowRegions
3518
+ header: How to handle headers when extracting from collections:
3519
+ - "first": Use first row of first element as headers (default)
3520
+ - "all": Expect headers on each element, use from first element
3521
+ - None: No headers, use numeric indices
3522
+ - List[str]: Custom column names
3523
+ skip_repeating_headers: Whether to remove duplicate header rows when extracting from collections.
3524
+ Defaults to True when header is "first", "all", or a list of column names; False otherwise.
3486
3525
 
3487
3526
  Returns:
3488
3527
  TableResult: Extracted table data
@@ -3494,20 +3533,49 @@ class Guides:
3494
3533
  ```python
3495
3534
  from natural_pdf.analyzers import Guides
3496
3535
 
3497
- # Create guides from detected lines
3536
+ # Single page extraction
3498
3537
  guides = Guides.from_lines(page, source_label="detected")
3499
-
3500
- # Extract table directly - no temporary regions left behind
3501
3538
  table_data = guides.extract_table()
3502
-
3503
- # Convert to pandas DataFrame
3504
3539
  df = table_data.to_df()
3540
+
3541
+ # Multiple page extraction
3542
+ guides = Guides(pages[0])
3543
+ guides.vertical.from_content(['Column 1', 'Column 2'])
3544
+ table_result = guides.extract_table(pages, header=['Col1', 'Col2'])
3545
+ df = table_result.to_df()
3546
+
3547
+ # Region collection extraction
3548
+ regions = pdf.find_all('region[type=table]')
3549
+ guides = Guides(regions[0])
3550
+ guides.vertical.from_lines(n=3)
3551
+ table_result = guides.extract_table(regions)
3505
3552
  ```
3506
3553
  """
3507
- target_obj = target or self.context
3508
- if not target_obj:
3554
+ from natural_pdf.core.page_collection import PageCollection
3555
+ from natural_pdf.elements.element_collection import ElementCollection
3556
+
3557
+ target_obj = target if target is not None else self.context
3558
+ if target_obj is None:
3509
3559
  raise ValueError("No target object available. Provide target parameter or context.")
3510
3560
 
3561
+ # Check if target is a collection - if so, delegate to _extract_table_from_collection
3562
+ if isinstance(target_obj, (PageCollection, ElementCollection, list)):
3563
+ # For collections, pass through most parameters as-is
3564
+ return self._extract_table_from_collection(
3565
+ elements=target_obj,
3566
+ header=header,
3567
+ skip_repeating_headers=skip_repeating_headers,
3568
+ method=method,
3569
+ table_settings=table_settings,
3570
+ use_ocr=use_ocr,
3571
+ ocr_config=ocr_config,
3572
+ text_options=text_options,
3573
+ cell_extraction_func=cell_extraction_func,
3574
+ show_progress=show_progress,
3575
+ content_filter=content_filter,
3576
+ apply_exclusions=apply_exclusions,
3577
+ )
3578
+
3511
3579
  # Get the page for cleanup later
3512
3580
  if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
3513
3581
  page = target_obj._page
@@ -3552,6 +3620,7 @@ class Guides:
3552
3620
  cell_extraction_func=cell_extraction_func,
3553
3621
  show_progress=show_progress,
3554
3622
  content_filter=content_filter,
3623
+ apply_exclusions=apply_exclusions,
3555
3624
  )
3556
3625
 
3557
3626
  return table_result
@@ -3577,6 +3646,165 @@ class Guides:
3577
3646
  except Exception as cleanup_err:
3578
3647
  logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3579
3648
 
3649
+ def _extract_table_from_collection(
3650
+ self,
3651
+ elements: Union["PageCollection", "ElementCollection", List[Union["Page", "Region"]]],
3652
+ header: Union[str, List[str], None] = "first",
3653
+ skip_repeating_headers: Optional[bool] = None,
3654
+ method: Optional[str] = None,
3655
+ table_settings: Optional[dict] = None,
3656
+ use_ocr: bool = False,
3657
+ ocr_config: Optional[dict] = None,
3658
+ text_options: Optional[Dict] = None,
3659
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
3660
+ show_progress: bool = True,
3661
+ content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
3662
+ apply_exclusions: bool = True,
3663
+ ) -> "TableResult":
3664
+ """
3665
+ Extract tables from multiple pages or regions using this guide pattern.
3666
+
3667
+ This method applies the guide to each element, extracts tables, and combines
3668
+ them into a single TableResult. Dynamic guides (using lambdas) are evaluated
3669
+ for each element.
3670
+
3671
+ Args:
3672
+ elements: PageCollection, ElementCollection, or list of Pages/Regions to extract from
3673
+ header: How to handle headers:
3674
+ - "first": Use first row of first element as headers (default)
3675
+ - "all": Expect headers on each element, use from first element
3676
+ - None: No headers, use numeric indices
3677
+ - List[str]: Custom column names
3678
+ skip_repeating_headers: Whether to remove duplicate header rows.
3679
+ Defaults to True when header is "first", "all", or a list of column names; False otherwise.
3680
+ method: Table extraction method (passed to extract_table)
3681
+ table_settings: Settings for pdfplumber table extraction
3682
+ use_ocr: Whether to use OCR for text extraction
3683
+ ocr_config: OCR configuration parameters
3684
+ text_options: Dictionary of options for the 'text' method
3685
+ cell_extraction_func: Optional callable for custom cell text extraction
3686
+ show_progress: Show progress bar for multi-element extraction (default: True)
3687
+ content_filter: Content filtering function or patterns
3688
+ apply_exclusions: Whether to apply exclusion regions during extraction
3689
+
3690
+ Returns:
3691
+ TableResult: Combined table data from all elements
3692
+
3693
+ Example:
3694
+ ```python
3695
+ # Create guide with static vertical, dynamic horizontal
3696
+ guide = Guides(regions[0])
3697
+ guide.vertical.from_content(columns, outer="last")
3698
+ guide.horizontal.from_content(lambda r: r.find_all('text:starts-with(NF-)'))
3699
+
3700
+ # Extract from all regions
3701
+ table_result = guide._extract_table_from_collection(regions, header=columns)
3702
+ df = table_result.to_df()
3703
+ ```
3704
+ """
3705
+ from natural_pdf.core.page_collection import PageCollection
3706
+ from natural_pdf.elements.element_collection import ElementCollection
3707
+ from natural_pdf.tables.result import TableResult
3708
+
3709
+ # Convert to list if it's a collection
3710
+ if isinstance(elements, (PageCollection, ElementCollection)):
3711
+ element_list = list(elements)
3712
+ else:
3713
+ element_list = elements
3714
+
3715
+ if not element_list:
3716
+ return TableResult([])
3717
+
3718
+ # Determine header handling
3719
+ if skip_repeating_headers is None:
3720
+ skip_repeating_headers = header in ["first", "all"] or isinstance(header, list)
3721
+
3722
+ all_rows = []
3723
+ header_row = None
3724
+
3725
+ # Configure progress bar
3726
+ iterator = element_list
3727
+ if show_progress and len(element_list) > 1:
3728
+ try:
3729
+ from tqdm.auto import tqdm
3730
+
3731
+ iterator = tqdm(
3732
+ element_list, desc="Extracting tables from elements", unit="element"
3733
+ )
3734
+ except ImportError:
3735
+ pass
3736
+
3737
+ for i, element in enumerate(iterator):
3738
+ # Create a new Guides object for this element
3739
+ element_guide = Guides(element)
3740
+
3741
+ # Copy vertical guides (usually static)
3742
+ if hasattr(self.vertical, "_callable") and self.vertical._callable is not None:
3743
+ # If vertical is dynamic (lambda), evaluate it
3744
+ element_guide.vertical.from_content(self.vertical._callable(element))
3745
+ else:
3746
+ # Copy static vertical positions
3747
+ element_guide.vertical.data = self.vertical.data.copy()
3748
+
3749
+ # Handle horizontal guides
3750
+ if hasattr(self.horizontal, "_callable") and self.horizontal._callable is not None:
3751
+ # If horizontal is dynamic (lambda), evaluate it
3752
+ element_guide.horizontal.from_content(self.horizontal._callable(element))
3753
+ else:
3754
+ # Copy static horizontal positions
3755
+ element_guide.horizontal.data = self.horizontal.data.copy()
3756
+
3757
+ # Extract table from this element
3758
+ table_result = element_guide.extract_table(
3759
+ method=method,
3760
+ table_settings=table_settings,
3761
+ use_ocr=use_ocr,
3762
+ ocr_config=ocr_config,
3763
+ text_options=text_options,
3764
+ cell_extraction_func=cell_extraction_func,
3765
+ show_progress=False, # Don't show nested progress
3766
+ content_filter=content_filter,
3767
+ apply_exclusions=apply_exclusions,
3768
+ )
3769
+
3770
+ # Convert to list of rows
3771
+ rows = list(table_result)
3772
+
3773
+ # Handle headers based on strategy
3774
+ if i == 0: # First element
3775
+ if header == "first" or header == "all":
3776
+ # Use first row as header
3777
+ if rows:
3778
+ header_row = rows[0]
3779
+ rows = rows[1:] # Remove header from data
3780
+ elif isinstance(header, list):
3781
+ # Custom headers provided
3782
+ header_row = header
3783
+ else: # Subsequent elements
3784
+ if header == "all" and skip_repeating_headers and rows:
3785
+ # Expect and remove header row
3786
+ if rows and header_row and rows[0] == header_row:
3787
+ rows = rows[1:]
3788
+ elif rows:
3789
+ # No exact match with the header: the first row is still dropped unconditionally
3790
+ rows = rows[1:]
3791
+
3792
+ # Add rows to combined result
3793
+ all_rows.extend(rows)
3794
+
3795
+ # Create final TableResult
3796
+ if isinstance(header, list):
3797
+ # Custom headers - data rows are returned as-is; the supplied names are not prepended here
3798
+ final_result = TableResult(all_rows)
3799
+ elif header_row is not None:
3800
+ # Prepend discovered header
3801
+ final_result = TableResult([header_row] + all_rows)
3802
+ else:
3803
+ # No headers
3804
+ final_result = TableResult(all_rows)
3805
+
3806
+ return final_result
3807
+
3580
3808
  def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
3581
3809
  """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
3582
3810
  if not self.is_flow_region or len(self.context.constituent_regions) < 2:
@@ -939,6 +939,11 @@ class ElementManager:
939
939
  self.load_elements()
940
940
  return self._elements.get("chars", [])
941
941
 
942
+ def invalidate_cache(self):
943
+ """Invalidate the cached elements, forcing a reload on next access."""
944
+ self._elements = None
945
+ logger.debug(f"Page {self._page.number}: ElementManager cache invalidated")
946
+
942
947
  @property
943
948
  def words(self):
944
949
  """Get all word elements."""