natural-pdf 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (290)
  1. {natural_pdf-0.2.3/natural_pdf.egg-info → natural_pdf-0.2.5}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/guides.py +185 -9
  3. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/element_manager.py +5 -0
  4. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/page.py +42 -4
  5. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/pdf.py +45 -3
  6. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/pdf_collection.py +131 -4
  7. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/render_spec.py +2 -2
  8. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/base.py +18 -14
  9. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/region.py +42 -21
  10. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/tables/result.py +39 -6
  11. natural_pdf-0.2.5/natural_pdf/vision/__init__.py +7 -0
  12. natural_pdf-0.2.5/natural_pdf/vision/mixin.py +209 -0
  13. natural_pdf-0.2.5/natural_pdf/vision/results.py +146 -0
  14. natural_pdf-0.2.5/natural_pdf/vision/similarity.py +321 -0
  15. {natural_pdf-0.2.3 → natural_pdf-0.2.5/natural_pdf.egg-info}/PKG-INFO +1 -1
  16. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf.egg-info/SOURCES.txt +14 -1
  17. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf.egg-info/top_level.txt +0 -1
  18. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_arabic_real_world.py +0 -3
  19. natural_pdf-0.2.5/tests/test_element_show_crop_highlights.py +168 -0
  20. natural_pdf-0.2.5/tests/test_find_similar.py +147 -0
  21. natural_pdf-0.2.5/tests/test_guides_extract_table_exclusions.py +180 -0
  22. natural_pdf-0.2.5/tests/test_guides_extract_table_from_pages.py +142 -0
  23. natural_pdf-0.2.5/tests/test_region_show_crop_highlights.py +219 -0
  24. natural_pdf-0.2.5/tests/test_slice_cache_reuse.py +199 -0
  25. natural_pdf-0.2.5/tests/test_slice_exclusion_fix.py +145 -0
  26. natural_pdf-0.2.5/tests/test_slice_exclusion_issue.py +72 -0
  27. natural_pdf-0.2.5/tests/test_slice_exclusion_mock.py +158 -0
  28. natural_pdf-0.2.5/tests/test_sliced_collection_exclusions.py +158 -0
  29. natural_pdf-0.2.3/test_install.sh +0 -46
  30. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/analysis_framework.mdc +0 -0
  31. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/coding-style.mdc +0 -0
  32. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  33. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/minimal-comments.mdc +0 -0
  34. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  35. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  36. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.github/workflows/ci.yml +0 -0
  37. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.github/workflows/docs.yml +0 -0
  38. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.github/workflows/nightly-tutorials.yml +0 -0
  39. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.gitignore +0 -0
  40. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/.pre-commit-config.yaml +0 -0
  41. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/01-execute_notebooks.py +0 -0
  42. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/02-run_all_tutorials.sh +0 -0
  43. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/CLAUDE.md +0 -0
  44. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/LICENSE +0 -0
  45. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/MANIFEST.in +0 -0
  46. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/README.md +0 -0
  47. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/audit_packaging.py +0 -0
  48. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/check_run_md.sh +0 -0
  49. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/api/index.md +0 -0
  50. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/favicon.png +0 -0
  51. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/favicon.svg +0 -0
  52. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/javascripts/custom.js +0 -0
  53. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/logo.svg +0 -0
  54. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/sample-screen.png +0 -0
  55. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/social-preview.png +0 -0
  56. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/social-preview.svg +0 -0
  57. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/assets/stylesheets/custom.css +0 -0
  58. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/categorizing-documents/index.md +0 -0
  59. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/data-extraction/index.md +0 -0
  60. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/describe/index.md +0 -0
  61. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/document-qa/index.md +0 -0
  62. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/element-selection/index.md +0 -0
  63. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/extracting-clean-text/index.md +0 -0
  64. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/finetuning/index.md +0 -0
  65. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/fix-messy-tables/index.md +0 -0
  66. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/fix-messy-tables/table_1.csv +0 -0
  67. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/fix-messy-tables/table_2.csv +0 -0
  68. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/fix-messy-tables/table_3.csv +0 -0
  69. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/index.md +0 -0
  70. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/installation/index.md +0 -0
  71. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/interactive-widget/index.md +0 -0
  72. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/layout-analysis/index.md +0 -0
  73. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/loops-and-groups/index.md +0 -0
  74. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/ocr/index.md +0 -0
  75. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/pdf-navigation/index.md +0 -0
  76. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  77. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/process-forms-and-invoices/index.md +0 -0
  78. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/quick-reference/index.md +0 -0
  79. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/reflowing-pages/index.md +0 -0
  80. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/regions/index.md +0 -0
  81. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tables/index.md +0 -0
  82. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/text-analysis/index.md +0 -0
  83. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/01-loading-and-extraction.md +0 -0
  84. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/02-finding-elements.md +0 -0
  85. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/03-extracting-blocks.md +0 -0
  86. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/04-table-extraction.md +0 -0
  87. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/05-excluding-content.md +0 -0
  88. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/06-document-qa.md +0 -0
  89. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/07-layout-analysis.md +0 -0
  90. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/07-working-with-regions.md +0 -0
  91. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/08-spatial-navigation.md +0 -0
  92. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/09-section-extraction.md +0 -0
  93. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/10-form-field-extraction.md +0 -0
  94. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  95. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/12-ocr-integration.md +0 -0
  96. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/13-semantic-search.md +0 -0
  97. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/tutorials/14-categorizing-documents.md +0 -0
  98. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/visual-debugging/index.md +0 -0
  99. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/docs/visual-debugging/region.png +0 -0
  100. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/mkdocs.yml +0 -0
  101. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/__init__.py +0 -0
  102. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/__init__.py +0 -0
  103. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/__init__.py +0 -0
  104. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/base.py +0 -0
  105. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/docling.py +0 -0
  106. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/gemini.py +0 -0
  107. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  108. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  109. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  110. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/paddle.py +0 -0
  111. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  112. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/surya.py +0 -0
  113. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  114. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/tatr.py +0 -0
  115. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/layout/yolo.py +0 -0
  116. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  117. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/text_options.py +0 -0
  118. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/text_structure.py +0 -0
  119. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/analyzers/utils.py +0 -0
  120. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/classification/manager.py +0 -0
  121. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/classification/mixin.py +0 -0
  122. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/classification/results.py +0 -0
  123. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/cli.py +0 -0
  124. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/collections/mixins.py +0 -0
  125. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/__init__.py +0 -0
  126. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/highlighting_service.py +0 -0
  127. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/page_collection.py +0 -0
  128. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/core/page_groupby.py +0 -0
  129. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/describe/__init__.py +0 -0
  130. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/describe/base.py +0 -0
  131. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/describe/elements.py +0 -0
  132. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/describe/mixin.py +0 -0
  133. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/describe/summary.py +0 -0
  134. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/__init__.py +0 -0
  135. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/element_collection.py +0 -0
  136. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/image.py +0 -0
  137. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/line.py +0 -0
  138. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/rect.py +0 -0
  139. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/elements/text.py +0 -0
  140. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/export/mixin.py +0 -0
  141. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/__init__.py +0 -0
  142. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/base.py +0 -0
  143. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/data/__init__.py +0 -0
  144. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/data/pdf.ttf +0 -0
  145. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/data/sRGB.icc +0 -0
  146. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/hocr.py +0 -0
  147. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/hocr_font.py +0 -0
  148. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/original_pdf.py +0 -0
  149. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/paddleocr.py +0 -0
  150. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/exporters/searchable_pdf.py +0 -0
  151. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/extraction/manager.py +0 -0
  152. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/extraction/mixin.py +0 -0
  153. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/extraction/result.py +0 -0
  154. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/flows/__init__.py +0 -0
  155. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/flows/collections.py +0 -0
  156. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/flows/element.py +0 -0
  157. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/flows/flow.py +0 -0
  158. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/flows/region.py +0 -0
  159. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/__init__.py +0 -0
  160. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/engine.py +0 -0
  161. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/engine_doctr.py +0 -0
  162. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/engine_easyocr.py +0 -0
  163. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/engine_paddle.py +0 -0
  164. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/engine_surya.py +0 -0
  165. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/ocr_factory.py +0 -0
  166. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/ocr_manager.py +0 -0
  167. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/ocr_options.py +0 -0
  168. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/ocr/utils.py +0 -0
  169. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/qa/__init__.py +0 -0
  170. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/qa/document_qa.py +0 -0
  171. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/qa/qa_result.py +0 -0
  172. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/__init__.py +0 -0
  173. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/lancedb_search_service.py +0 -0
  174. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/numpy_search_service.py +0 -0
  175. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/search_options.py +0 -0
  176. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/search_service_protocol.py +0 -0
  177. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/search/searchable_mixin.py +0 -0
  178. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/selectors/__init__.py +0 -0
  179. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/selectors/parser.py +0 -0
  180. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/tables/__init__.py +0 -0
  181. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/__init__.py +0 -0
  182. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  183. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/spa/css/style.css +0 -0
  184. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/spa/index.html +0 -0
  185. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/spa/js/app.js +0 -0
  186. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/templates/spa/words.txt +0 -0
  187. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/text_mixin.py +0 -0
  188. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/__init__.py +0 -0
  189. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/bidi_mirror.py +0 -0
  190. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/debug.py +0 -0
  191. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/highlighting.py +0 -0
  192. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/identifiers.py +0 -0
  193. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/layout.py +0 -0
  194. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/locks.py +0 -0
  195. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/packaging.py +0 -0
  196. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/reading_order.py +0 -0
  197. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/text_extraction.py +0 -0
  198. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/utils/visualization.py +0 -0
  199. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/widgets/__init__.py +0 -0
  200. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf/widgets/viewer.py +0 -0
  201. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf.egg-info/dependency_links.txt +0 -0
  202. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf.egg-info/entry_points.txt +0 -0
  203. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/natural_pdf.egg-info/requires.txt +0 -0
  204. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/noxfile.py +0 -0
  205. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/memory_comparison.py +0 -0
  206. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/pdf_analyzer.py +0 -0
  207. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/performance_analysis.py +0 -0
  208. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  209. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  210. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  211. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  212. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/test_cleanup_methods.py +0 -0
  213. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/optimization/test_memory_fix.py +0 -0
  214. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/publish.sh +0 -0
  215. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/pyproject.toml +0 -0
  216. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/sample-screen.png +0 -0
  217. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/setup.cfg +0 -0
  218. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/conftest.py +0 -0
  219. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/exporters/test_paddleocr_exporter.py +0 -0
  220. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_annotate.py +0 -0
  221. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_arabic_performance.py +0 -0
  222. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_color_conversion.py +0 -0
  223. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_containment_geometry.py +0 -0
  224. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_elements.py +0 -0
  225. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_loading.py +0 -0
  226. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_spatial.py +0 -0
  227. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_text_extraction.py +0 -0
  228. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_core/test_text_layer.py +0 -0
  229. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_directional_defaults.py +0 -0
  230. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_document_qa.py +0 -0
  231. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_element_collection_slicing.py +0 -0
  232. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_exclusions.py +0 -0
  233. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_expand.py +0 -0
  234. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_extraction_error.py +0 -0
  235. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_extraction_mixin_fix.py +0 -0
  236. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_extraction_text_and_vision.py +0 -0
  237. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_extraction_working.py +0 -0
  238. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_first_last_selectors.py +0 -0
  239. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_flow_region_directional.py +0 -0
  240. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_groupby.py +0 -0
  241. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides.py +0 -0
  242. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides_apply_exclusions.py +0 -0
  243. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides_apply_exclusions_simple.py +0 -0
  244. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides_extract_table.py +0 -0
  245. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides_extract_table_real.py +0 -0
  246. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_guides_integration.py +0 -0
  247. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_highlight_detection.py +0 -0
  248. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_highlight_protocol.py +0 -0
  249. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_highlight_protocol_simple.py +0 -0
  250. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_highlight_regions.py +0 -0
  251. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_loading_original.py +0 -0
  252. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_multi_page_table_discovery.py +0 -0
  253. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_optional_deps.py +0 -0
  254. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_page_exclusion_lists.py +0 -0
  255. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  256. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_region_viewer.py +0 -0
  257. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_sections_end_only.py +0 -0
  258. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_show_column_layout.py +0 -0
  259. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_show_edge_cases.py +0 -0
  260. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_show_exclusions.py +0 -0
  261. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_show_exclusions_feature.py +0 -0
  262. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_show_limit.py +0 -0
  263. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_skip_repeating_headers_multipage.py +0 -0
  264. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_strikethrough_detection.py +0 -0
  265. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_table_result_header_mismatch.py +0 -0
  266. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_table_result_keep_blank.py +0 -0
  267. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_tiny_text_tables.py +0 -0
  268. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_tiny_text_tables_table.py +0 -0
  269. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_tutorials.py +0 -0
  270. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_underline_detection.py +0 -0
  271. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tests/test_update_text.py +0 -0
  272. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/todo/bad_pdf_analysis.md +0 -0
  273. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/todo/evaluation.md +0 -0
  274. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  275. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  276. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  277. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/README.md +0 -0
  278. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/__init__.py +0 -0
  279. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/analyser.py +0 -0
  280. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  281. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  282. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/eval_suite.py +0 -0
  283. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  284. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  285. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  286. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  287. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  288. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/reporter.py +0 -0
  289. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/tools/bad_pdf_eval/utils.py +0 -0
  290. {natural_pdf-0.2.3 → natural_pdf-0.2.5}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -143,7 +143,7 @@ class GuidesList(UserList):
143
143
 
144
144
  def from_content(
145
145
  self,
146
- markers: Union[str, List[str], "ElementCollection", None],
146
+ markers: Union[str, List[str], "ElementCollection", Callable, None],
147
147
  obj: Optional[Union["Page", "Region", "FlowRegion"]] = None,
148
148
  align: Literal["left", "right", "center", "between"] = "left",
149
149
  outer: bool = True,
@@ -160,6 +160,7 @@ class GuidesList(UserList):
160
160
  - str: single selector (e.g., 'text:contains("Name")') or literal text
161
161
  - List[str]: list of selectors or literal text strings
162
162
  - ElementCollection: collection of elements to extract text from
163
+ - Callable: function that takes the target object (e.g. a page or region) and returns markers
163
164
  - None: no markers
164
165
  obj: Page/Region/FlowRegion to search (uses parent's context if None)
165
166
  align: How to align guides relative to found elements
@@ -174,13 +175,22 @@ class GuidesList(UserList):
174
175
  if target_obj is None:
175
176
  raise ValueError("No object provided and no context available")
176
177
 
178
+ # Store callable markers for later evaluation
179
+ if callable(markers):
180
+ self._callable = markers
181
+ # For now, evaluate with the current target object to get initial guides
182
+ actual_markers = markers(target_obj)
183
+ else:
184
+ self._callable = None
185
+ actual_markers = markers
186
+
177
187
  # Check if parent is in flow mode
178
188
  if self._parent.is_flow_region:
179
189
  # Create guides across all constituent regions
180
190
  all_guides = []
181
191
  for region in self._parent.context.constituent_regions:
182
192
  # Normalize markers for this region
183
- marker_texts = _normalize_markers(markers, region)
193
+ marker_texts = _normalize_markers(actual_markers, region)
184
194
 
185
195
  # Create guides for this region
186
196
  region_guides = Guides.from_content(
@@ -263,7 +273,7 @@ class GuidesList(UserList):
263
273
 
264
274
  # Original single-region logic
265
275
  # Normalize markers to list of text strings
266
- marker_texts = _normalize_markers(markers, target_obj)
276
+ marker_texts = _normalize_markers(actual_markers, target_obj)
267
277
 
268
278
  # Create guides for this axis
269
279
  new_guides = Guides.from_content(
@@ -1541,11 +1551,15 @@ class Guides:
1541
1551
  # Add outer guides if requested
1542
1552
  if outer and bounds:
1543
1553
  if axis == "vertical":
1544
- guides_coords.insert(0, bounds[0]) # x0
1545
- guides_coords.append(bounds[2]) # x1
1554
+ if outer == True or outer == "first":
1555
+ guides_coords.insert(0, bounds[0]) # x0
1556
+ if outer == True or outer == "last":
1557
+ guides_coords.append(bounds[2]) # x1
1546
1558
  else:
1547
- guides_coords.insert(0, bounds[1]) # y0
1548
- guides_coords.append(bounds[3]) # y1
1559
+ if outer == True or outer == "first":
1560
+ guides_coords.insert(0, bounds[1]) # y0
1561
+ if outer == True or outer == "last":
1562
+ guides_coords.append(bounds[3]) # y1
1549
1563
 
1550
1564
  # Remove duplicates and sort
1551
1565
  guides_coords = sorted(list(set(guides_coords)))
@@ -3302,7 +3316,7 @@ class Guides:
3302
3316
  markers: Union[str, List[str], "ElementCollection", None] = None,
3303
3317
  obj: Optional[Union["Page", "Region"]] = None,
3304
3318
  align: Literal["left", "right", "center", "between"] = "left",
3305
- outer: bool = True,
3319
+ outer: Union[str, bool] = True,
3306
3320
  tolerance: float = 5,
3307
3321
  apply_exclusions: bool = True,
3308
3322
  ) -> "Guides":
@@ -3319,7 +3333,10 @@ class Guides:
3319
3333
  - None: no markers
3320
3334
  obj: Page or Region to search (uses self.context if None)
3321
3335
  align: How to align guides relative to found elements
3322
- outer: Whether to add outer boundary guides
3336
+ outer: Whether to add outer boundary guides. Can be:
3337
+ - bool: True/False to add/not add both
3338
+ - "first": To add boundary before the first element
3339
+ - "last": To add boundary after the last element
3323
3340
  tolerance: Tolerance for snapping to element edges
3324
3341
  apply_exclusions: Whether to apply exclusion zones when searching for text
3325
3342
 
@@ -3457,6 +3474,7 @@ class Guides:
3457
3474
  cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
3458
3475
  show_progress: bool = False,
3459
3476
  content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
3477
+ apply_exclusions: bool = True,
3460
3478
  *,
3461
3479
  multi_page: Literal["auto", True, False] = "auto",
3462
3480
  ) -> "TableResult":
@@ -3482,6 +3500,7 @@ class Guides:
3482
3500
  cell_extraction_func: Optional callable for custom cell text extraction
3483
3501
  show_progress: Controls progress bar for text method
3484
3502
  content_filter: Content filtering function or patterns
3503
+ apply_exclusions: Whether to apply exclusion regions during text extraction (default: True)
3485
3504
  multi_page: Controls multi-region table creation for FlowRegions
3486
3505
 
3487
3506
  Returns:
@@ -3552,6 +3571,7 @@ class Guides:
3552
3571
  cell_extraction_func=cell_extraction_func,
3553
3572
  show_progress=show_progress,
3554
3573
  content_filter=content_filter,
3574
+ apply_exclusions=apply_exclusions,
3555
3575
  )
3556
3576
 
3557
3577
  return table_result
@@ -3577,6 +3597,162 @@ class Guides:
3577
3597
  except Exception as cleanup_err:
3578
3598
  logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3579
3599
 
3600
def extract_table_from_pages(
    self,
    pages: Union["PageCollection", List["Page"]],
    header: Union[str, List[str], None] = "first",
    skip_repeating_headers: Optional[bool] = None,
    method: Optional[str] = None,
    table_settings: Optional[dict] = None,
    use_ocr: bool = False,
    ocr_config: Optional[dict] = None,
    text_options: Optional[Dict] = None,
    cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
    show_progress: bool = True,
    content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
    apply_exclusions: bool = True,
) -> "TableResult":
    """
    Extract tables from multiple pages using this guide pattern.

    A fresh ``Guides`` object is built for every page. Each axis is either
    re-evaluated (when the axis stores a callable in ``_callable``) or
    copied from this object's static positions. The per-page tables are
    then concatenated into a single TableResult.

    Args:
        pages: PageCollection or any iterable of Pages to extract from
        header: How to handle headers:
            - "first": Use the first extracted row as headers (default).
              The row is taken from the first page that yields any rows.
            - "all": Expect headers on each page, use the ones from the
              first page that yields rows
            - None: No headers, rows are returned as-is
            - List[str]: Custom column names; they become the first row
              of the returned TableResult
        skip_repeating_headers: Whether to remove repeated header rows.
            Defaults to True when header is "first", "all" or a list of
            names, False otherwise. With "all" the first row of every
            later page is dropped; otherwise a page's first row is
            dropped only when it equals the header row.
        method: Table extraction method (passed to extract_table)
        table_settings: Settings for pdfplumber table extraction
        use_ocr: Whether to use OCR for text extraction
        ocr_config: OCR configuration parameters
        text_options: Dictionary of options for the 'text' method
        cell_extraction_func: Optional callable for custom cell text extraction
        show_progress: Show progress bar for multi-page extraction (default: True)
        content_filter: Content filtering function or patterns
        apply_exclusions: Whether to apply exclusion regions during extraction

    Returns:
        TableResult: Combined table data from all pages. When a header is
        known (discovered or custom) it is the first row of the result.

    Example:
        ```python
        # Create guide with static vertical, dynamic horizontal
        guide = Guides(pages[0])
        guide.vertical.from_content(columns, outer="last")
        guide.horizontal.from_content(lambda p: p.find_all('text:starts-with(NF-)'))

        # Extract from all pages
        table_result = guide.extract_table_from_pages(pages, header=columns)
        df = table_result.to_df()
        ```
    """
    from natural_pdf.tables.result import TableResult

    # Materialise once: works for PageCollection, lists, tuples and
    # generators, and makes len() below safe.
    page_list = list(pages)
    if not page_list:
        return TableResult([])

    custom_header = isinstance(header, list)
    header_from_rows = (not custom_header) and header in ("first", "all")

    if skip_repeating_headers is None:
        skip_repeating_headers = header_from_rows or custom_header

    # A custom header is known up front; a discovered one is filled in below.
    header_row = list(header) if custom_header else None
    all_rows = []

    # Wrap the pages in a progress bar when tqdm is installed.
    iterator = page_list
    if show_progress and len(page_list) > 1:
        try:
            from tqdm.auto import tqdm
        except ImportError:
            pass
        else:
            iterator = tqdm(page_list, desc="Extracting tables from pages", unit="page")

    for page in iterator:
        page_guide = Guides(page)

        # Rebuild both axes for this page: evaluate dynamic guides,
        # copy static ones so the per-page object never shares our list.
        for axis in ("vertical", "horizontal"):
            source_axis = getattr(self, axis)
            target_axis = getattr(page_guide, axis)
            dynamic = getattr(source_axis, "_callable", None)
            if dynamic is not None:
                target_axis.from_content(dynamic(page))
            else:
                target_axis.data = source_axis.data.copy()

        table_result = page_guide.extract_table(
            method=method,
            table_settings=table_settings,
            use_ocr=use_ocr,
            ocr_config=ocr_config,
            text_options=text_options,
            cell_extraction_func=cell_extraction_func,
            show_progress=False,  # Don't show nested progress
            content_filter=content_filter,
            apply_exclusions=apply_exclusions,
        )
        rows = list(table_result)

        if header_from_rows and header_row is None:
            # First page that produced rows: its first row is the header.
            if rows:
                header_row, rows = rows[0], rows[1:]
        elif skip_repeating_headers and rows:
            if header == "all":
                # Every page is expected to start with a header row.
                rows = rows[1:]
            elif header_row is not None and list(rows[0]) == list(header_row):
                # Header repeated at the top of this page.
                rows = rows[1:]

        all_rows.extend(rows)

    if header_row is not None:
        # Discovered or custom header goes first so to_df() picks it up.
        return TableResult([header_row] + all_rows)
    return TableResult(all_rows)
3755
+
3580
3756
  def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
3581
3757
  """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
3582
3758
  if not self.is_flow_region or len(self.context.constituent_regions) < 2:
@@ -939,6 +939,11 @@ class ElementManager:
939
939
  self.load_elements()
940
940
  return self._elements.get("chars", [])
941
941
 
942
def invalidate_cache(self):
    """Drop the cached elements so they are loaded again on next access."""
    self._elements = None
    page_number = self._page.number
    logger.debug(f"Page {page_number}: ElementManager cache invalidated")
946
+
942
947
  @property
943
948
  def words(self):
944
949
  """Get all word elements."""
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
78
78
 
79
79
  # # Import new utils
80
80
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
81
+ from natural_pdf.vision.mixin import VisualSearchMixin
81
82
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
82
83
 
83
84
  # --- End Classification Imports --- #
@@ -101,6 +102,7 @@ class Page(
101
102
  ExtractionMixin,
102
103
  ShapeDetectionMixin,
103
104
  DescribeMixin,
105
+ VisualSearchMixin,
104
106
  Visualizable,
105
107
  ):
106
108
  """Enhanced Page wrapper built on top of pdfplumber.Page.
@@ -492,6 +494,9 @@ class Page(
492
494
  exc_info=False,
493
495
  )
494
496
  raise
497
+ # Invalidate ElementManager cache since exclusions affect element filtering
498
+ if hasattr(self, "_element_mgr") and self._element_mgr:
499
+ self._element_mgr.invalidate_cache()
495
500
  return self # Completed processing for selector input
496
501
 
497
502
  # ElementCollection -----------------------------------------------
@@ -524,6 +529,9 @@ class Page(
524
529
  exc_info=False,
525
530
  )
526
531
  raise
532
+ # Invalidate ElementManager cache since exclusions affect element filtering
533
+ if hasattr(self, "_element_mgr") and self._element_mgr:
534
+ self._element_mgr.invalidate_cache()
527
535
  return self # Completed processing for ElementCollection input
528
536
 
529
537
  # ------------------------------------------------------------------
@@ -616,6 +624,9 @@ class Page(
616
624
  f"Page {self.index}: Failed to convert list item to Region: {e}"
617
625
  )
618
626
  continue
627
+ # Invalidate ElementManager cache since exclusions affect element filtering
628
+ if hasattr(self, "_element_mgr") and self._element_mgr:
629
+ self._element_mgr.invalidate_cache()
619
630
  return self
620
631
  else:
621
632
  # Reject invalid types
@@ -627,6 +638,10 @@ class Page(
627
638
  if exclusion_data:
628
639
  self._exclusions.append(exclusion_data)
629
640
 
641
+ # Invalidate ElementManager cache since exclusions affect element filtering
642
+ if hasattr(self, "_element_mgr") and self._element_mgr:
643
+ self._element_mgr.invalidate_cache()
644
+
630
645
  return self
631
646
 
632
647
  def add_region(self, region: "Region", name: Optional[str] = None) -> "Page":
@@ -697,10 +712,26 @@ class Page(
697
712
  """
698
713
  regions = []
699
714
 
715
+ # Combine page-specific exclusions with PDF-level exclusions
716
+ all_exclusions = list(self._exclusions) # Start with page-specific
717
+
718
+ # Add PDF-level exclusions if we have a parent PDF
719
+ if hasattr(self, "_parent") and self._parent and hasattr(self._parent, "_exclusions"):
720
+ for pdf_exclusion in self._parent._exclusions:
721
+ # Check if this exclusion is already in our list (avoid duplicates)
722
+ if pdf_exclusion not in all_exclusions:
723
+ # Ensure consistent format (PDF exclusions might be 2-tuples, need to be 3-tuples)
724
+ if len(pdf_exclusion) == 2:
725
+ # Convert to 3-tuple format with default method
726
+ pdf_exclusion = (pdf_exclusion[0], pdf_exclusion[1], "region")
727
+ all_exclusions.append(pdf_exclusion)
728
+
700
729
  if debug:
701
- print(f"\nPage {self.index}: Evaluating {len(self._exclusions)} exclusions")
730
+ print(
731
+ f"\nPage {self.index}: Evaluating {len(all_exclusions)} exclusions ({len(self._exclusions)} page-specific, {len(all_exclusions) - len(self._exclusions)} from PDF)"
732
+ )
702
733
 
703
- for i, exclusion_data in enumerate(self._exclusions):
734
+ for i, exclusion_data in enumerate(all_exclusions):
704
735
  # Handle both old format (2-tuple) and new format (3-tuple) for backward compatibility
705
736
  if len(exclusion_data) == 2:
706
737
  # Old format: (exclusion_item, label)
@@ -1596,7 +1627,14 @@ class Page(
1596
1627
  return ""
1597
1628
 
1598
1629
  # 2. Apply element-based exclusions if enabled
1599
- if use_exclusions and self._exclusions:
1630
+ # Check both page-level and PDF-level exclusions
1631
+ has_exclusions = bool(self._exclusions) or (
1632
+ hasattr(self, "_parent")
1633
+ and self._parent
1634
+ and hasattr(self._parent, "_exclusions")
1635
+ and self._parent._exclusions
1636
+ )
1637
+ if use_exclusions and has_exclusions:
1600
1638
  # Filter word elements through _filter_elements_by_exclusions
1601
1639
  # This handles both element-based and region-based exclusions
1602
1640
  word_elements = self._filter_elements_by_exclusions(
@@ -1610,7 +1648,7 @@ class Page(
1610
1648
  # 3. Get region-based exclusions for spatial filtering
1611
1649
  apply_exclusions_flag = kwargs.get("use_exclusions", use_exclusions)
1612
1650
  exclusion_regions = []
1613
- if apply_exclusions_flag and self._exclusions:
1651
+ if apply_exclusions_flag and has_exclusions:
1614
1652
  exclusion_regions = self._get_exclusion_regions(include_callable=True, debug=debug)
1615
1653
  if debug:
1616
1654
  logger.debug(
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
42
42
  from natural_pdf.selectors.parser import parse_selector
43
43
  from natural_pdf.text_mixin import TextMixin
44
44
  from natural_pdf.utils.locks import pdf_render_lock
45
+ from natural_pdf.vision.mixin import VisualSearchMixin
45
46
 
46
47
  if TYPE_CHECKING:
47
48
  from natural_pdf.elements.element_collection import ElementCollection
@@ -172,11 +173,26 @@ class _LazyPageList(Sequence):
172
173
  """Create and cache a page at the given index within this list."""
173
174
  cached = self._cache[index]
174
175
  if cached is None:
176
+ # Get the actual page index in the full PDF
177
+ actual_page_index = self._indices[index]
178
+
179
+ # First check if this page is already cached in the parent PDF's main page list
180
+ if (
181
+ hasattr(self._parent_pdf, "_pages")
182
+ and hasattr(self._parent_pdf._pages, "_cache")
183
+ and actual_page_index < len(self._parent_pdf._pages._cache)
184
+ and self._parent_pdf._pages._cache[actual_page_index] is not None
185
+ ):
186
+ # Reuse the already-cached page from the parent PDF
187
+ # This ensures we get any exclusions that were already applied
188
+ cached = self._parent_pdf._pages._cache[actual_page_index]
189
+ self._cache[index] = cached
190
+ return cached
191
+
175
192
  # Import here to avoid circular import problems
176
193
  from natural_pdf.core.page import Page
177
194
 
178
- # Get the actual page index in the full PDF
179
- actual_page_index = self._indices[index]
195
+ # Create new page
180
196
  plumber_page = self._plumber_pdf.pages[actual_page_index]
181
197
  cached = Page(
182
198
  plumber_page,
@@ -195,6 +211,30 @@ class _LazyPageList(Sequence):
195
211
  except Exception as e:
196
212
  logger.warning(f"Failed to apply exclusion to page {cached.number}: {e}")
197
213
 
214
+ # Check if the parent PDF already has a cached page with page-specific exclusions
215
+ if hasattr(self._parent_pdf, "_pages") and hasattr(self._parent_pdf._pages, "_cache"):
216
+ parent_cache = self._parent_pdf._pages._cache
217
+ if (
218
+ actual_page_index < len(parent_cache)
219
+ and parent_cache[actual_page_index] is not None
220
+ ):
221
+ existing_page = parent_cache[actual_page_index]
222
+ # Copy over any page-specific exclusions from the existing page
223
+ # Only copy non-callable exclusions (regions/elements) to avoid duplicating PDF-level exclusions
224
+ if hasattr(existing_page, "_exclusions") and existing_page._exclusions:
225
+ for exclusion_data in existing_page._exclusions:
226
+ exclusion_item = exclusion_data[0]
227
+ # Skip callable exclusions as they're PDF-level and already applied above
228
+ if not callable(exclusion_item):
229
+ try:
230
+ cached.add_exclusion(
231
+ *exclusion_data[:2]
232
+ ) # exclusion_item and label
233
+ except Exception as e:
234
+ logger.warning(
235
+ f"Failed to copy page-specific exclusion to page {cached.number}: {e}"
236
+ )
237
+
198
238
  # Apply any stored regions to the newly created page
199
239
  if hasattr(self._parent_pdf, "_regions"):
200
240
  for region_data in self._parent_pdf._regions:
@@ -252,7 +292,9 @@ class _LazyPageList(Sequence):
252
292
  # --- End Lazy Page List Helper --- #
253
293
 
254
294
 
255
- class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
295
+ class PDF(
296
+ TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
297
+ ):
256
298
  """Enhanced PDF wrapper built on top of pdfplumber.
257
299
 
258
300
  This class provides a fluent interface for working with PDF documents,
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
40
40
  from natural_pdf.core.pdf import PDF
41
41
  from natural_pdf.elements.region import Region
42
42
  from natural_pdf.export.mixin import ExportMixin
43
+ from natural_pdf.vision.mixin import VisualSearchMixin
43
44
 
44
45
  # --- Search Imports ---
45
46
  try:
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
69
70
 
70
71
 
71
72
  class PDFCollection(
72
- SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
73
- ): # Add ExportMixin and ShapeDetectionMixin
73
+ SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
74
+ ):
74
75
  def __init__(
75
76
  self,
76
77
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -258,8 +259,6 @@ class PDFCollection(
258
259
  return iter(self._pdfs)
259
260
 
260
261
  def __repr__(self) -> str:
261
- # Removed search status
262
- return f"<PDFCollection(count={len(self._pdfs)})>"
263
262
  return f"<PDFCollection(count={len(self._pdfs)})>"
264
263
 
265
264
  @property
@@ -267,6 +266,134 @@ class PDFCollection(
267
266
  """Returns the list of PDF objects held by the collection."""
268
267
  return self._pdfs
269
268
 
269
def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
    """
    Render all PDFs in the collection into one labelled image.

    Each PDF is rendered with its pages in a grid layout (6 columns unless
    ``columns`` is passed), gets a caption bar with its name and page
    count, and the per-PDF images are stacked vertically.

    Args:
        limit: Maximum total pages to show across all PDFs (default: 30).
            A falsy value disables the total limit.
        per_pdf_limit: Maximum pages to show per PDF (default: 10). When
            falsy and ``limit`` is set, the total is split evenly.
        **kwargs: Additional arguments forwarded to each PDF's render
            calls (e.g., columns, layout, exclusions, resolution, etc.).
            ``columns`` and ``layout`` override the defaults used here.

    Returns:
        The combined PIL image (Jupyter displays it automatically), or
        None when the collection is empty or nothing could be rendered.
    """
    if not self._pdfs:
        print("Empty collection")
        return None

    # Function-scoped import, as in the original implementation.
    from PIL import Image, ImageDraw, ImageFont

    # Split the total budget evenly when no per-PDF limit was given.
    if limit and not per_pdf_limit:
        per_pdf_limit = max(1, limit // len(self._pdfs))

    label_height = 40
    label_bg_color = (240, 240, 240)
    label_text_color = (0, 0, 0)

    all_images = []
    total_pages_shown = 0

    for pdf in self._pdfs:
        if limit and total_pages_shown >= limit:
            break

        # Page budget for this PDF: never exceed what is left of the total.
        pdf_limit = per_pdf_limit
        if limit:
            remaining = limit - total_pages_shown
            pdf_limit = min(per_pdf_limit or remaining, remaining)

        # Short display name for the caption.
        pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
        if isinstance(pdf_name, Path):
            pdf_name = pdf_name.name
        elif "/" in str(pdf_name):
            pdf_name = str(pdf_name).rsplit("/", 1)[-1]

        try:
            render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
            if not render_specs:
                continue

            # Caller-supplied layout/columns win over the defaults; passing
            # them as literal keywords next to **kwargs would raise
            # "got multiple values for keyword argument".
            render_kwargs = dict(kwargs)
            render_kwargs.setdefault("layout", "grid" if len(render_specs) > 1 else "single")
            render_kwargs.setdefault("columns", 6)

            highlighter = pdf._get_highlighter()
            pdf_image = highlighter.unified_render(specs=render_specs, **render_kwargs)
            if not pdf_image:
                continue

            # New canvas with room for the caption bar above the render.
            width, height = pdf_image.size
            labeled_image = Image.new("RGB", (width, height + label_height), "white")
            draw = ImageDraw.Draw(labeled_image)
            draw.rectangle([0, 0, width, label_height], fill=label_bg_color)

            try:
                font = ImageFont.truetype("Arial", 20)
            except Exception:
                # Best-effort: any font-loading failure falls back to the
                # built-in font, without trapping KeyboardInterrupt/SystemExit.
                font = ImageFont.load_default()

            page_count = len(pdf.pages)
            label_text = f"{pdf_name} ({page_count} pages)"
            draw.text((10, 10), label_text, fill=label_text_color, font=font)
            labeled_image.paste(pdf_image, (0, label_height))

            all_images.append(labeled_image)
            # pdf_limit is None when both limits are disabled; min(None, n)
            # would raise TypeError.
            total_pages_shown += page_count if pdf_limit is None else min(pdf_limit, page_count)

        except Exception as e:
            logger.warning(f"Failed to render PDF {pdf_name}: {e}")
            continue

    if not all_images:
        print("No PDFs could be rendered")
        return None

    if len(all_images) == 1:
        return all_images[0]

    # Stack the labelled images vertically with a gap between PDFs.
    spacing = 20
    total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
    max_width = max(img.width for img in all_images)
    combined = Image.new("RGB", (max_width, total_height), "white")

    y_offset = 0
    for img in all_images:
        # Center images that are narrower than the widest one.
        combined.paste(img, ((max_width - img.width) // 2, y_offset))
        y_offset += img.height + spacing

    return combined
396
+
270
397
  @overload
271
398
  def find_all(
272
399
  self,
@@ -186,7 +186,7 @@ class Visualizable:
186
186
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
187
187
  labels: bool = True,
188
188
  label_format: Optional[str] = None,
189
- highlights: Optional[List[Dict[str, Any]]] = None,
189
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
190
190
  legend_position: str = "right",
191
191
  annotate: Optional[Union[str, List[str]]] = None,
192
192
  # Layout options for multi-page/region
@@ -211,7 +211,7 @@ class Visualizable:
211
211
  color: Default highlight color
212
212
  labels: Whether to show labels for highlights
213
213
  label_format: Format string for labels (e.g., "Element {index}")
214
- highlights: Additional highlight groups to show
214
+ highlights: Additional highlight groups to show, or False to disable all highlights
215
215
  legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
216
216
  annotate: Attribute name(s) to display on highlights (string or list)
217
217
  layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)