natural-pdf 0.2.1.dev0__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (277)
  1. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.gitignore +1 -1
  2. natural_pdf-0.2.3/CLAUDE.md +85 -0
  3. {natural_pdf-0.2.1.dev0/natural_pdf.egg-info → natural_pdf-0.2.3}/PKG-INFO +2 -2
  4. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/layout-analysis/index.md +1 -1
  5. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/quick-reference/index.md +15 -1
  6. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/visual-debugging/index.md +63 -1
  7. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/guides.py +159 -3
  8. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/collections/mixins.py +16 -3
  9. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/highlighting_service.py +33 -9
  10. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/page.py +138 -7
  11. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/page_collection.py +51 -14
  12. natural_pdf-0.2.3/natural_pdf/core/page_groupby.py +229 -0
  13. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/render_spec.py +62 -4
  14. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/base.py +102 -20
  15. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/element_collection.py +11 -10
  16. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/region.py +21 -21
  17. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/text.py +5 -0
  18. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/manager.py +8 -14
  19. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/mixin.py +35 -21
  20. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/selectors/parser.py +2 -2
  21. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/tables/result.py +37 -0
  22. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3/natural_pdf.egg-info}/PKG-INFO +2 -2
  23. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/SOURCES.txt +24 -0
  24. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/requires.txt +1 -1
  25. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/top_level.txt +1 -1
  26. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_analysis.py +1 -1
  27. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/pyproject.toml +1 -1
  28. natural_pdf-0.2.3/tests/test_color_conversion.py +193 -0
  29. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_containment_geometry.py +6 -6
  30. natural_pdf-0.2.3/tests/test_directional_defaults.py +248 -0
  31. natural_pdf-0.2.3/tests/test_expand.py +150 -0
  32. natural_pdf-0.2.3/tests/test_extraction_error.py +85 -0
  33. natural_pdf-0.2.3/tests/test_extraction_mixin_fix.py +131 -0
  34. natural_pdf-0.2.3/tests/test_extraction_text_and_vision.py +250 -0
  35. natural_pdf-0.2.3/tests/test_extraction_working.py +147 -0
  36. natural_pdf-0.2.3/tests/test_first_last_selectors.py +99 -0
  37. natural_pdf-0.2.3/tests/test_groupby.py +307 -0
  38. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_guides.py +1 -1
  39. natural_pdf-0.2.3/tests/test_guides_apply_exclusions.py +216 -0
  40. natural_pdf-0.2.3/tests/test_guides_apply_exclusions_simple.py +72 -0
  41. natural_pdf-0.2.3/tests/test_guides_extract_table.py +252 -0
  42. natural_pdf-0.2.3/tests/test_guides_extract_table_real.py +247 -0
  43. natural_pdf-0.2.3/tests/test_highlight_regions.py +161 -0
  44. natural_pdf-0.2.3/tests/test_page_exclusion_lists.py +220 -0
  45. natural_pdf-0.2.3/tests/test_pdf_add_exclusion_elementcollection.py +170 -0
  46. natural_pdf-0.2.3/tests/test_show_column_layout.py +180 -0
  47. natural_pdf-0.2.3/tests/test_show_edge_cases.py +191 -0
  48. natural_pdf-0.2.3/tests/test_show_exclusions.py +77 -0
  49. natural_pdf-0.2.3/tests/test_show_exclusions_feature.py +125 -0
  50. natural_pdf-0.2.3/tests/test_show_limit.py +173 -0
  51. natural_pdf-0.2.3/tests/test_table_result_header_mismatch.py +138 -0
  52. natural_pdf-0.2.3/tests/test_table_result_keep_blank.py +198 -0
  53. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/todo/evaluation.md +1 -1
  54. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/analyser.py +1 -1
  55. natural_pdf-0.2.1.dev0/CLAUDE.md +0 -524
  56. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/analysis_framework.mdc +0 -0
  57. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/coding-style.mdc +0 -0
  58. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  59. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/minimal-comments.mdc +0 -0
  60. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  61. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  62. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/ci.yml +0 -0
  63. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/docs.yml +0 -0
  64. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.github/workflows/nightly-tutorials.yml +0 -0
  65. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/.pre-commit-config.yaml +0 -0
  66. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/01-execute_notebooks.py +0 -0
  67. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/02-run_all_tutorials.sh +0 -0
  68. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/LICENSE +0 -0
  69. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/MANIFEST.in +0 -0
  70. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/README.md +0 -0
  71. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/audit_packaging.py +0 -0
  72. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/check_run_md.sh +0 -0
  73. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/api/index.md +0 -0
  74. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/favicon.png +0 -0
  75. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/favicon.svg +0 -0
  76. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/javascripts/custom.js +0 -0
  77. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/logo.svg +0 -0
  78. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/sample-screen.png +0 -0
  79. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/social-preview.png +0 -0
  80. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/social-preview.svg +0 -0
  81. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/assets/stylesheets/custom.css +0 -0
  82. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/categorizing-documents/index.md +0 -0
  83. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/data-extraction/index.md +0 -0
  84. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/describe/index.md +0 -0
  85. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/document-qa/index.md +0 -0
  86. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/element-selection/index.md +0 -0
  87. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/extracting-clean-text/index.md +0 -0
  88. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/finetuning/index.md +0 -0
  89. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/index.md +0 -0
  90. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_1.csv +0 -0
  91. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_2.csv +0 -0
  92. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/fix-messy-tables/table_3.csv +0 -0
  93. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/index.md +0 -0
  94. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/installation/index.md +0 -0
  95. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/interactive-widget/index.md +0 -0
  96. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/loops-and-groups/index.md +0 -0
  97. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/ocr/index.md +0 -0
  98. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/pdf-navigation/index.md +0 -0
  99. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  100. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/process-forms-and-invoices/index.md +0 -0
  101. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/reflowing-pages/index.md +0 -0
  102. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/regions/index.md +0 -0
  103. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tables/index.md +0 -0
  104. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/text-analysis/index.md +0 -0
  105. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/01-loading-and-extraction.md +0 -0
  106. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/02-finding-elements.md +0 -0
  107. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/03-extracting-blocks.md +0 -0
  108. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/04-table-extraction.md +0 -0
  109. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/05-excluding-content.md +0 -0
  110. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/06-document-qa.md +0 -0
  111. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/07-layout-analysis.md +0 -0
  112. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/07-working-with-regions.md +0 -0
  113. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/08-spatial-navigation.md +0 -0
  114. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/09-section-extraction.md +0 -0
  115. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/10-form-field-extraction.md +0 -0
  116. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  117. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/12-ocr-integration.md +0 -0
  118. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/13-semantic-search.md +0 -0
  119. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/tutorials/14-categorizing-documents.md +0 -0
  120. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/docs/visual-debugging/region.png +0 -0
  121. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/mkdocs.yml +0 -0
  122. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/__init__.py +0 -0
  123. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/__init__.py +0 -0
  124. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/__init__.py +0 -0
  125. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/base.py +0 -0
  126. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/docling.py +0 -0
  127. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/gemini.py +0 -0
  128. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  129. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  130. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  131. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/paddle.py +0 -0
  132. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  133. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/surya.py +0 -0
  134. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  135. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/tatr.py +0 -0
  136. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/layout/yolo.py +0 -0
  137. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  138. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/text_options.py +0 -0
  139. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/text_structure.py +0 -0
  140. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/analyzers/utils.py +0 -0
  141. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/manager.py +0 -0
  142. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/mixin.py +0 -0
  143. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/classification/results.py +0 -0
  144. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/cli.py +0 -0
  145. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/__init__.py +0 -0
  146. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/element_manager.py +0 -0
  147. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/pdf.py +0 -0
  148. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/core/pdf_collection.py +0 -0
  149. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/__init__.py +0 -0
  150. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/base.py +0 -0
  151. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/elements.py +0 -0
  152. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/mixin.py +0 -0
  153. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/describe/summary.py +0 -0
  154. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/__init__.py +0 -0
  155. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/image.py +0 -0
  156. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/line.py +0 -0
  157. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/elements/rect.py +0 -0
  158. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/export/mixin.py +0 -0
  159. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/__init__.py +0 -0
  160. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/base.py +0 -0
  161. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/__init__.py +0 -0
  162. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/pdf.ttf +0 -0
  163. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/data/sRGB.icc +0 -0
  164. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/hocr.py +0 -0
  165. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/hocr_font.py +0 -0
  166. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/original_pdf.py +0 -0
  167. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/paddleocr.py +0 -0
  168. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/exporters/searchable_pdf.py +0 -0
  169. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/extraction/result.py +0 -0
  170. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/__init__.py +0 -0
  171. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/collections.py +0 -0
  172. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/element.py +0 -0
  173. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/flow.py +0 -0
  174. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/flows/region.py +0 -0
  175. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/__init__.py +0 -0
  176. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine.py +0 -0
  177. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_doctr.py +0 -0
  178. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_easyocr.py +0 -0
  179. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_paddle.py +0 -0
  180. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/engine_surya.py +0 -0
  181. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_factory.py +0 -0
  182. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_manager.py +0 -0
  183. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/ocr_options.py +0 -0
  184. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/ocr/utils.py +0 -0
  185. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/__init__.py +0 -0
  186. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/document_qa.py +0 -0
  187. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/qa/qa_result.py +0 -0
  188. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/__init__.py +0 -0
  189. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/lancedb_search_service.py +0 -0
  190. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/numpy_search_service.py +0 -0
  191. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/search_options.py +0 -0
  192. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/search_service_protocol.py +0 -0
  193. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/search/searchable_mixin.py +0 -0
  194. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/selectors/__init__.py +0 -0
  195. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/tables/__init__.py +0 -0
  196. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/__init__.py +0 -0
  197. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  198. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/css/style.css +0 -0
  199. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/index.html +0 -0
  200. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/js/app.js +0 -0
  201. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/templates/spa/words.txt +0 -0
  202. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/text_mixin.py +0 -0
  203. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/__init__.py +0 -0
  204. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/bidi_mirror.py +0 -0
  205. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/debug.py +0 -0
  206. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/highlighting.py +0 -0
  207. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/identifiers.py +0 -0
  208. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/layout.py +0 -0
  209. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/locks.py +0 -0
  210. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/packaging.py +0 -0
  211. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/reading_order.py +0 -0
  212. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/text_extraction.py +0 -0
  213. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/utils/visualization.py +0 -0
  214. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/widgets/__init__.py +0 -0
  215. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf/widgets/viewer.py +0 -0
  216. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/dependency_links.txt +0 -0
  217. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/natural_pdf.egg-info/entry_points.txt +0 -0
  218. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/noxfile.py +0 -0
  219. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/memory_comparison.py +0 -0
  220. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/pdf_analyzer.py +0 -0
  221. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  222. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  223. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  224. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  225. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/test_cleanup_methods.py +0 -0
  226. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/optimization/test_memory_fix.py +0 -0
  227. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/publish.sh +0 -0
  228. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/sample-screen.png +0 -0
  229. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/setup.cfg +0 -0
  230. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/test_install.sh +0 -0
  231. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/conftest.py +0 -0
  232. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/exporters/test_paddleocr_exporter.py +0 -0
  233. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_annotate.py +0 -0
  234. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_arabic_performance.py +0 -0
  235. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_arabic_real_world.py +0 -0
  236. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_elements.py +0 -0
  237. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_loading.py +0 -0
  238. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_spatial.py +0 -0
  239. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_text_extraction.py +0 -0
  240. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_core/test_text_layer.py +0 -0
  241. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_document_qa.py +0 -0
  242. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_element_collection_slicing.py +0 -0
  243. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_exclusions.py +0 -0
  244. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_flow_region_directional.py +0 -0
  245. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_guides_integration.py +0 -0
  246. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_detection.py +0 -0
  247. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_protocol.py +0 -0
  248. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_highlight_protocol_simple.py +0 -0
  249. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_loading_original.py +0 -0
  250. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_multi_page_table_discovery.py +0 -0
  251. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_optional_deps.py +0 -0
  252. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_region_viewer.py +0 -0
  253. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_sections_end_only.py +0 -0
  254. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_skip_repeating_headers_multipage.py +0 -0
  255. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_strikethrough_detection.py +0 -0
  256. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tiny_text_tables.py +0 -0
  257. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tiny_text_tables_table.py +0 -0
  258. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_tutorials.py +0 -0
  259. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_underline_detection.py +0 -0
  260. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tests/test_update_text.py +0 -0
  261. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/todo/bad_pdf_analysis.md +0 -0
  262. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  263. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  264. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  265. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/README.md +0 -0
  266. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/__init__.py +0 -0
  267. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  268. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  269. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/eval_suite.py +0 -0
  270. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  271. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  272. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  273. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  274. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  275. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/reporter.py +0 -0
  276. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/tools/bad_pdf_eval/utils.py +0 -0
  277. {natural_pdf-0.2.1.dev0 → natural_pdf-0.2.3}/uv.lock +0 -0
@@ -292,4 +292,4 @@ build/
292
292
 
293
293
  # Ignore evaluation results generated by bad_pdf_eval suite
294
294
  eval_results/
295
- bad_pdf_analysis
295
+ bad_pdf_analysis
@@ -0,0 +1,85 @@
1
+ # Natural PDF Library Analysis
2
+
3
+ ## Library Overview
4
+ Natural PDF is a Python library for intelligent PDF document processing that combines traditional PDF parsing with modern AI capabilities. It provides a jQuery-like API for selecting and manipulating PDF elements with spatial awareness.
5
+
6
+ ## Core Goals & Purpose
7
+ - **Intelligent PDF Processing**: Goes beyond simple text extraction to understand document structure and spatial relationships
8
+ - **AI-Enhanced Workflows**: Integrates OCR, document Q&A, classification, and LLM-based data extraction
9
+ - **Spatial Navigation**: Provides methods like `.below()`, `.above()`, `.left()` for intuitive element selection
10
+ - **Multi-format Support**: Handles both text-based PDFs and image-based (OCR-required) documents
11
+
12
+ ## Key Use Cases & Workflows
13
+
14
+ ### 1. Basic Text and Table Extraction
15
+ - Load PDFs from local files or URLs
16
+ - Extract text with layout preservation
17
+ - Find and extract tables automatically
18
+ - Use spatial selectors: `page.find('text:contains(Violations)').below()`
19
+
20
+ ### 2. OCR Integration
21
+ - Multiple OCR engines supported: EasyOCR (default), Surya, PaddleOCR, DocTR
22
+ - Configurable resolution and detection modes
23
+ - OCR correction using LLMs
24
+ - Human-in-the-loop correction workflows with exportable packages
25
+
26
+ ### 3. AI-Powered Data Extraction
27
+ - **Document Q&A**: Extractive question answering with confidence scores
28
+ - **Structured Data**: Extract specific fields with schema validation using Pydantic
29
+ - **LLM Integration**: OpenAI/Gemini compatible for advanced extraction
30
+ - **Classification**: Document/page categorization using text or vision models
31
+
32
+ ### 4. Advanced Document Processing
33
+ - **Multi-column/Page Flows**: Reflow content across columns or pages for proper reading order
34
+ - **Layout Analysis**: YOLO and TATR models for automatic document structure detection
35
+ - **Visual Element Detection**: Checkbox classification, form field extraction
36
+ - **Table Structure Detection**: Manual line detection for complex tables
37
+
38
+ ### 5. Visualization and Display
39
+ - **Page Limit for show()**: By default, `pdf.show()` displays only the first 30 pages to prevent overwhelming displays
40
+ - Use `pdf.show(limit=10)` to show fewer pages
41
+ - Use `pdf.show(limit=None)` to display all pages
42
+ - Works with all layout options: `pdf.show(limit=20, layout='grid', columns=4)`
43
+ - **Exclusion Zone Visualization**: Use `exclusions='red'` parameter to visualize exclusion zones
44
+ - `page.show(exclusions='red')` highlights exclusions in red
45
+ - `page.show(exclusions='blue')` highlights exclusions in blue
46
+ - `page.show(exclusions=True)` uses default red color
47
+ - Works at PDF level too: `pdf.show(exclusions='green')`
48
+
49
+ ### 6. Directional Navigation Improvements
50
+ - **Smart defaults for spatial methods**:
51
+ - `.left()` and `.right()` now default to `height='element'` (matches element height)
52
+ - `.above()` and `.below()` continue to default to `width='full'` (full page width)
53
+ - This matches common use cases: looking sideways usually calls for the element's own height, while looking up or down usually calls for the full page width
54
+ - **Enhanced discoverability**:
55
+ - Docstrings include examples showing different height/width options
56
+ - Clear parameter names ('height' for left/right, 'width' for above/below)
57
+
58
+ ### 6a. Enhanced Exclusion Support
59
+ - **ElementCollection support in callable exclusions**: `pdf.add_exclusion(lambda page: page.find_all('text:contains("Header")'))` now works
60
+ - **List/iterable support**: Callable exclusions can return lists or other iterables of elements
61
+ - **Automatic conversion**: Elements from iterables are automatically converted to exclusion regions
62
+ - **Backward compatibility**: Existing Region and callable exclusions continue to work unchanged
63
+
64
+ ### 7. Page Grouping with groupby()
65
+ - **Simple grouping by selector text**: `pages.groupby('text[size=16]')` groups by header text
66
+ - **Callable functions for complex logic**: `pages.groupby(lambda p: p.find('text:contains("CITY")').extract_text())`
67
+ - **Pandas-style iteration**: `for title, pages in grouped:` (no `.items()` needed)
68
+ - **Dict-like access**: `grouped.get('CITY OF MADISON')` or `grouped.get_group('key')`
69
+ - **Index-based access**: `grouped[0]` (first group), `grouped[-1]` (last group), `grouped['key']` (by name)
70
+ - **Group exploration**: `grouped.info()` shows all groups with indexes and page counts
71
+ - **Batch operations**: `grouped.apply(lambda pages: len(pages.find_all('table')))`
72
+ - **Visual inspection**: `grouped.show(limit=2)` shows first 2 pages of each group
73
+ - **Progress bar support**: Automatic progress bars for large collections, disable with `show_progress=False`
74
+ - **None handling**: Pages with no matching elements group under `None` key
75
+
76
+ ## Development Best Practices
77
+
78
+ ### File and Resource Management
79
+ - When making temp files, put them in temp/
80
+ - When creating test files, put them in tests/
81
+ - Most fixes and changes need a test, and should be done with test-driven development
82
+
83
+ ### Environment and Tooling
84
+ - Always use the virtual environment in .venv
85
+ - Use uv when possible for efficient package management
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.1.dev0
3
+ Version: 0.2.3
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -14,7 +14,7 @@ License-File: LICENSE
14
14
  Requires-Dist: scikit-learn
15
15
  Requires-Dist: markdown
16
16
  Requires-Dist: pandas
17
- Requires-Dist: pdfplumber
17
+ Requires-Dist: pdfplumber>=0.11.7
18
18
  Requires-Dist: colormath2
19
19
  Requires-Dist: pillow
20
20
  Requires-Dist: colour
@@ -105,7 +105,7 @@ page.find_all('region[model=tatr]').show(group_by='region_type', width=700)
105
105
 
106
106
  # page.analyze_layout(engine="docling")
107
107
  # page.find_all('region[model=docling]').show(group_by='region_type')
108
- # page.to_image(width=700)
108
+ # page.render(width=700)
109
109
  ```
110
110
 
111
111
  ```python
@@ -156,11 +156,25 @@ elements.show(color="red") # Single collection
156
156
  elements.show(color="blue", label="Headers") # With label
157
157
  elements.show(group_by='type') # Color by type
158
158
 
159
- # Multiple collections together
159
+ # Quick highlighting (one-liner)
160
+ page.highlight(elements1, elements2, elements3) # Multiple elements
161
+ page.highlight( # With custom colors
162
+ (elements1, 'red'),
163
+ (elements2, 'blue'),
164
+ (elements3, 'green')
165
+ )
166
+
167
+ # Multiple collections with context manager
160
168
  with page.highlights() as h:
161
169
  h.add(elements1, color="red", label="Type 1")
162
170
  h.add(elements2, color="blue", label="Type 2")
163
171
  h.show()
172
+
173
+ # Auto-display in Jupyter/Colab
174
+ with page.highlights(show=True) as h:
175
+ h.add(elements1, label="Headers")
176
+ h.add(elements2, label="Content")
177
+ # Displays automatically when exiting context
164
178
  ```
165
179
 
166
180
  ### Viewing
@@ -83,6 +83,47 @@ with page.highlights() as h:
83
83
  h.show()
84
84
  ```
85
85
 
86
+ ### Jupyter/Colab Support
87
+
88
+ In Jupyter notebooks and Google Colab, you can use `show=True` to automatically display the highlights when exiting the context:
89
+
90
+ ```python
91
+ # Automatically displays the image in Jupyter/Colab
92
+ with page.highlights(show=True) as h:
93
+ h.add(summary_elements, label='Summary')
94
+ h.add(date_elements, label='Date')
95
+ h.add(line_elements, label='Lines')
96
+ # No need to call h.show() - displays automatically!
97
+ ```
98
+
99
+ ### Quick Highlighting with `.highlight()`
100
+
101
+ For simple highlighting tasks, use the `.highlight()` convenience method:
102
+
103
+ ```python
104
+ # Highlight multiple elements in one line
105
+ page.highlight(summary_elements, date_elements, line_elements)
106
+
107
+ # With custom colors
108
+ page.highlight(
109
+ (summary_elements, 'red'),
110
+ (date_elements, 'blue'),
111
+ (line_elements, 'green')
112
+ )
113
+
114
+ # With colors and labels
115
+ page.highlight(
116
+ (summary_elements, 'red', 'Summary Text'),
117
+ (date_elements, 'blue', 'Date Fields'),
118
+ (line_elements, 'green', 'Separator Lines')
119
+ )
120
+
121
+ # Pass additional parameters like width or resolution
122
+ page.highlight(summary_elements, date_elements, width=800, labels=True)
123
+ ```
124
+
125
+ This method is particularly useful in Jupyter/Colab environments where the image displays automatically as the cell output.
126
+
86
127
  ## Customizing Multiple Highlights
87
128
 
88
129
  Customize the appearance of multiple highlights using the context manager:
@@ -133,7 +174,7 @@ content = title.below(height=200)
133
174
  content.show()
134
175
  ```
135
176
 
136
- Or look at just the region by itself
177
+ Or look at just the region by itself:
137
178
 
138
179
  ```python
139
180
  # Find a title and create a region below it
@@ -144,6 +185,27 @@ content = title.below(height=200)
144
185
  content.show(crop=True)
145
186
  ```
146
187
 
188
+ ### Highlighting Multiple Regions
189
+
190
+ The `.highlight()` method works with regions too:
191
+
192
+ ```python
193
+ # Create multiple regions
194
+ left = page.region(left=0, right=page.width/3, top=0, bottom=page.height)
195
+ mid = page.region(left=page.width/3, right=page.width/3*2, top=0, bottom=page.height)
196
+ right = page.region(left=page.width/3*2, right=page.width, top=0, bottom=page.height)
197
+
198
+ # Highlight all three regions
199
+ page.highlight(left, mid, right)
200
+
201
+ # Or with custom colors
202
+ page.highlight(
203
+ (left, 'red', 'Left Column'),
204
+ (mid, 'green', 'Middle Column'),
205
+ (right, 'blue', 'Right Column')
206
+ )
207
+ ```
208
+
147
209
  ## Working with Text Styles
148
210
 
149
211
  Visualize text styles to understand the document structure:
@@ -3,7 +3,7 @@
3
3
  import json
4
4
  import logging
5
5
  from collections import UserList
6
- from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union
6
+ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, Tuple, Union
7
7
 
8
8
  import numpy as np
9
9
  from PIL import Image, ImageDraw
@@ -16,6 +16,7 @@ if TYPE_CHECKING:
16
16
  from natural_pdf.elements.element_collection import ElementCollection
17
17
  from natural_pdf.elements.region import Region
18
18
  from natural_pdf.flows.region import FlowRegion
19
+ from natural_pdf.tables.result import TableResult
19
20
 
20
21
  logger = logging.getLogger(__name__)
21
22
 
@@ -131,6 +132,15 @@ class GuidesList(UserList):
131
132
  self._parent = parent_guides
132
133
  self._axis = axis
133
134
 
135
+ def __getitem__(self, i):
136
+ """Override to handle slicing properly."""
137
+ if isinstance(i, slice):
138
+ # Return a new GuidesList with the sliced data
139
+ return self.__class__(self._parent, self._axis, self.data[i])
140
+ else:
141
+ # For single index, return the value directly
142
+ return self.data[i]
143
+
134
144
  def from_content(
135
145
  self,
136
146
  markers: Union[str, List[str], "ElementCollection", None],
@@ -140,6 +150,7 @@ class GuidesList(UserList):
140
150
  tolerance: float = 5,
141
151
  *,
142
152
  append: bool = False,
153
+ apply_exclusions: bool = True,
143
154
  ) -> "Guides":
144
155
  """
145
156
  Create guides from content markers and add to this axis.
@@ -154,6 +165,7 @@ class GuidesList(UserList):
154
165
  align: How to align guides relative to found elements
155
166
  outer: Whether to add outer boundary guides
156
167
  tolerance: Tolerance for snapping to element edges
168
+ apply_exclusions: Whether to apply exclusion zones when searching for text
157
169
 
158
170
  Returns:
159
171
  Parent Guides object for chaining
@@ -178,6 +190,7 @@ class GuidesList(UserList):
178
190
  align=align,
179
191
  outer=outer,
180
192
  tolerance=tolerance,
193
+ apply_exclusions=apply_exclusions,
181
194
  )
182
195
 
183
196
  # Collect guides from this region
@@ -260,6 +273,7 @@ class GuidesList(UserList):
260
273
  align=align,
261
274
  outer=outer,
262
275
  tolerance=tolerance,
276
+ apply_exclusions=apply_exclusions,
263
277
  )
264
278
 
265
279
  # Replace or append based on parameter
@@ -1398,6 +1412,7 @@ class Guides:
1398
1412
  align: Literal["left", "right", "center", "between"] = "left",
1399
1413
  outer: bool = True,
1400
1414
  tolerance: float = 5,
1415
+ apply_exclusions: bool = True,
1401
1416
  ) -> "Guides":
1402
1417
  """
1403
1418
  Create guides based on text content positions.
@@ -1413,6 +1428,7 @@ class Guides:
1413
1428
  align: Where to place guides relative to found text
1414
1429
  outer: Whether to add guides at the boundaries
1415
1430
  tolerance: Maximum distance to search for text
1431
+ apply_exclusions: Whether to apply exclusion zones when searching for text
1416
1432
 
1417
1433
  Returns:
1418
1434
  New Guides object aligned to text content
@@ -1431,6 +1447,7 @@ class Guides:
1431
1447
  align=align,
1432
1448
  outer=outer,
1433
1449
  tolerance=tolerance,
1450
+ apply_exclusions=apply_exclusions,
1434
1451
  )
1435
1452
 
1436
1453
  # Store in flow guides
@@ -1469,7 +1486,7 @@ class Guides:
1469
1486
  # Find each marker and determine guide position
1470
1487
  for marker in marker_texts:
1471
1488
  if hasattr(obj, "find"):
1472
- element = obj.find(f'text:contains("{marker}")')
1489
+ element = obj.find(f'text:contains("{marker}")', apply_exclusions=apply_exclusions)
1473
1490
  if element:
1474
1491
  if axis == "vertical":
1475
1492
  if align == "left":
@@ -1498,7 +1515,9 @@ class Guides:
1498
1515
  marker_bounds = []
1499
1516
  for marker in marker_texts:
1500
1517
  if hasattr(obj, "find"):
1501
- element = obj.find(f'text:contains("{marker}")')
1518
+ element = obj.find(
1519
+ f'text:contains("{marker}")', apply_exclusions=apply_exclusions
1520
+ )
1502
1521
  if element:
1503
1522
  if axis == "vertical":
1504
1523
  marker_bounds.append((element.x0, element.x1))
@@ -3285,6 +3304,7 @@ class Guides:
3285
3304
  align: Literal["left", "right", "center", "between"] = "left",
3286
3305
  outer: bool = True,
3287
3306
  tolerance: float = 5,
3307
+ apply_exclusions: bool = True,
3288
3308
  ) -> "Guides":
3289
3309
  """
3290
3310
  Instance method: Add guides from content, allowing chaining.
@@ -3301,6 +3321,7 @@ class Guides:
3301
3321
  align: How to align guides relative to found elements
3302
3322
  outer: Whether to add outer boundary guides
3303
3323
  tolerance: Tolerance for snapping to element edges
3324
+ apply_exclusions: Whether to apply exclusion zones when searching for text
3304
3325
 
3305
3326
  Returns:
3306
3327
  Self for method chaining
@@ -3318,6 +3339,7 @@ class Guides:
3318
3339
  align=align,
3319
3340
  outer=outer,
3320
3341
  tolerance=tolerance,
3342
+ apply_exclusions=apply_exclusions,
3321
3343
  )
3322
3344
 
3323
3345
  # Add the appropriate coordinates to this object
@@ -3421,6 +3443,140 @@ class Guides:
3421
3443
 
3422
3444
  return self
3423
3445
 
3446
+ def extract_table(
3447
+ self,
3448
+ target: Optional[Union["Page", "Region"]] = None,
3449
+ source: str = "guides_temp",
3450
+ cell_padding: float = 0.5,
3451
+ include_outer_boundaries: bool = False,
3452
+ method: Optional[str] = None,
3453
+ table_settings: Optional[dict] = None,
3454
+ use_ocr: bool = False,
3455
+ ocr_config: Optional[dict] = None,
3456
+ text_options: Optional[Dict] = None,
3457
+ cell_extraction_func: Optional[Callable[["Region"], Optional[str]]] = None,
3458
+ show_progress: bool = False,
3459
+ content_filter: Optional[Union[str, Callable[[str], bool], List[str]]] = None,
3460
+ *,
3461
+ multi_page: Literal["auto", True, False] = "auto",
3462
+ ) -> "TableResult":
3463
+ """
3464
+ Extract table data directly from guides without leaving temporary regions.
3465
+
3466
+ This method:
3467
+ 1. Creates table structure using build_grid()
3468
+ 2. Extracts table data from the created table region
3469
+ 3. Cleans up all temporary regions
3470
+ 4. Returns the TableResult
3471
+
3472
+ Args:
3473
+ target: Page or Region to create regions on (uses self.context if None)
3474
+ source: Source label for temporary regions (will be cleaned up)
3475
+ cell_padding: Internal padding for cell regions in points
3476
+ include_outer_boundaries: Whether to add boundaries at edges if missing
3477
+ method: Table extraction method ('tatr', 'pdfplumber', 'text', etc.)
3478
+ table_settings: Settings for pdfplumber table extraction
3479
+ use_ocr: Whether to use OCR for text extraction
3480
+ ocr_config: OCR configuration parameters
3481
+ text_options: Dictionary of options for the 'text' method
3482
+ cell_extraction_func: Optional callable for custom cell text extraction
3483
+ show_progress: Controls progress bar for text method
3484
+ content_filter: Content filtering function or patterns
3485
+ multi_page: Controls multi-region table creation for FlowRegions
3486
+
3487
+ Returns:
3488
+ TableResult: Extracted table data
3489
+
3490
+ Raises:
3491
+ ValueError: If no table region is created from the guides
3492
+
3493
+ Example:
3494
+ ```python
3495
+ from natural_pdf.analyzers import Guides
3496
+
3497
+ # Create guides from detected lines
3498
+ guides = Guides.from_lines(page, source_label="detected")
3499
+
3500
+ # Extract table directly - no temporary regions left behind
3501
+ table_data = guides.extract_table()
3502
+
3503
+ # Convert to pandas DataFrame
3504
+ df = table_data.to_df()
3505
+ ```
3506
+ """
3507
+ target_obj = target or self.context
3508
+ if not target_obj:
3509
+ raise ValueError("No target object available. Provide target parameter or context.")
3510
+
3511
+ # Get the page for cleanup later
3512
+ if hasattr(target_obj, "x0") and hasattr(target_obj, "top"): # Region
3513
+ page = target_obj._page
3514
+ element_manager = page._element_mgr
3515
+ elif hasattr(target_obj, "_element_mgr"): # Page
3516
+ page = target_obj
3517
+ element_manager = page._element_mgr
3518
+ else:
3519
+ raise ValueError(f"Target object {target_obj} is not a Page or Region")
3520
+
3521
+ try:
3522
+ # Step 1: Build grid structure (creates temporary regions)
3523
+ grid_result = self.build_grid(
3524
+ target=target_obj,
3525
+ source=source,
3526
+ cell_padding=cell_padding,
3527
+ include_outer_boundaries=include_outer_boundaries,
3528
+ multi_page=multi_page,
3529
+ )
3530
+
3531
+ # Step 2: Get the table region and extract table data
3532
+ table_region = grid_result["regions"]["table"]
3533
+ if table_region is None:
3534
+ raise ValueError(
3535
+ "No table region was created from the guides. Check that you have both vertical and horizontal guides."
3536
+ )
3537
+
3538
+ # Handle multi-page case where table_region might be a list
3539
+ if isinstance(table_region, list):
3540
+ if not table_region:
3541
+ raise ValueError("No table regions were created from the guides.")
3542
+ # Use the first table region for extraction
3543
+ table_region = table_region[0]
3544
+
3545
+ # Step 3: Extract table data using the region's extract_table method
3546
+ table_result = table_region.extract_table(
3547
+ method=method,
3548
+ table_settings=table_settings,
3549
+ use_ocr=use_ocr,
3550
+ ocr_config=ocr_config,
3551
+ text_options=text_options,
3552
+ cell_extraction_func=cell_extraction_func,
3553
+ show_progress=show_progress,
3554
+ content_filter=content_filter,
3555
+ )
3556
+
3557
+ return table_result
3558
+
3559
+ finally:
3560
+ # Step 4: Clean up all temporary regions created by build_grid
3561
+ # This ensures no regions are left behind regardless of success/failure
3562
+ try:
3563
+ regions_to_remove = [
3564
+ r
3565
+ for r in element_manager.regions
3566
+ if getattr(r, "source", None) == source
3567
+ and getattr(r, "region_type", None)
3568
+ in {"table", "table_row", "table_column", "table_cell"}
3569
+ ]
3570
+
3571
+ for region in regions_to_remove:
3572
+ element_manager.remove_element(region, element_type="regions")
3573
+
3574
+ if regions_to_remove:
3575
+ logger.debug(f"Cleaned up {len(regions_to_remove)} temporary regions")
3576
+
3577
+ except Exception as cleanup_err:
3578
+ logger.warning(f"Failed to clean up temporary regions: {cleanup_err}")
3579
+
3424
3580
  def _get_flow_orientation(self) -> Literal["vertical", "horizontal", "unknown"]:
3425
3581
  """Determines if a FlowRegion's constituent parts are arranged vertically or horizontally."""
3426
3582
  if not self.is_flow_region or len(self.context.constituent_regions) < 2:
@@ -29,9 +29,22 @@ class DirectionalCollectionMixin:
29
29
  """Find regions to the right of all elements in this collection."""
30
30
  return self.apply(lambda element: element.right(**kwargs))
31
31
 
32
- def expand(self, **kwargs) -> "ElementCollection":
33
- """Expand all elements in this collection."""
34
- return self.apply(lambda element: element.expand(**kwargs))
32
+ def expand(self, *args, **kwargs) -> "ElementCollection":
33
+ """Expand all elements in this collection.
34
+
35
+ Args:
36
+ *args: If a single positional argument is provided, expands all elements
37
+ by that amount in all directions.
38
+ **kwargs: Keyword arguments for directional expansion (left, right, top, bottom, etc.)
39
+
40
+ Examples:
41
+ # Expand all elements by 5 pixels in all directions
42
+ collection.expand(5)
43
+
44
+ # Expand with different amounts in each direction
45
+ collection.expand(left=10, right=5, top=3, bottom=7)
46
+ """
47
+ return self.apply(lambda element: element.expand(*args, **kwargs))
35
48
 
36
49
 
37
50
  class ApplyMixin:
@@ -335,6 +335,7 @@ class HighlightContext:
335
335
  self.show_on_exit = show_on_exit
336
336
  self.highlight_groups = []
337
337
  self._color_manager = ColorManager()
338
+ self._exit_image = None # Store image for Jupyter display
338
339
 
339
340
  def add(
340
341
  self,
@@ -421,6 +422,11 @@ class HighlightContext:
421
422
  )
422
423
  return None
423
424
 
425
+ @property
426
+ def image(self) -> Optional[Image.Image]:
427
+ """Get the last generated image (useful after context exit)."""
428
+ return self._exit_image
429
+
424
430
  def __enter__(self) -> "HighlightContext":
425
431
  """Enter the context."""
426
432
  return self
@@ -428,7 +434,25 @@ class HighlightContext:
428
434
  def __exit__(self, exc_type, exc_val, exc_tb):
429
435
  """Exit the context, optionally showing highlights."""
430
436
  if self.show_on_exit and not exc_type:
431
- self.show()
437
+ self._exit_image = self.show()
438
+
439
+ # Check if we're in a Jupyter/IPython environment
440
+ try:
441
+ # Try to get IPython instance
442
+ from IPython import get_ipython
443
+
444
+ ipython = get_ipython()
445
+ if ipython is not None:
446
+ # We're in IPython/Jupyter
447
+ from IPython.display import display
448
+
449
+ if self._exit_image is not None:
450
+ display(self._exit_image)
451
+ except (ImportError, NameError):
452
+ # Not in Jupyter or IPython not available - that's OK
453
+ pass
454
+
455
+ # __exit__ must return False to not suppress exceptions
432
456
  return False
433
457
 
434
458
 
@@ -689,7 +713,7 @@ class HighlightingService:
689
713
  logger.debug(f"Added highlight to page {page_index}: {highlight}")
690
714
 
691
715
  # --- Invalidate page-level image cache --------------------------------
692
- # The Page.to_image method maintains an internal cache keyed by rendering
716
+ # The Page.render method maintains an internal cache keyed by rendering
693
717
  # parameters. Because the cache key currently does **not** incorporate
694
718
  # any information about the highlights themselves, it can return stale
695
719
  # images after highlights are added or removed. To ensure the next
@@ -700,11 +724,11 @@ class HighlightingService:
700
724
  if hasattr(page_obj, "_to_image_cache"):
701
725
  page_obj._to_image_cache.clear()
702
726
  logger.debug(
703
- f"Cleared cached to_image renders for page {page_index} after adding a highlight."
727
+ f"Cleared cached render images for page {page_index} after adding a highlight."
704
728
  )
705
729
  except Exception as cache_err: # pragma: no cover – never fail highlight creation
706
730
  logger.warning(
707
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
731
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
708
732
  exc_info=True,
709
733
  )
710
734
 
@@ -737,11 +761,11 @@ class HighlightingService:
737
761
  if hasattr(page_obj, "_to_image_cache"):
738
762
  page_obj._to_image_cache.clear()
739
763
  logger.debug(
740
- f"Cleared cached to_image renders for page {page_index} after removing highlights."
764
+ f"Cleared cached render images for page {page_index} after removing highlights."
741
765
  )
742
766
  except Exception as cache_err: # pragma: no cover
743
767
  logger.warning(
744
- f"Failed to invalidate to_image cache for page {page_index}: {cache_err}",
768
+ f"Failed to invalidate render cache for page {page_index}: {cache_err}",
745
769
  exc_info=True,
746
770
  )
747
771
 
@@ -760,7 +784,7 @@ class HighlightingService:
760
784
  labels: bool = True,
761
785
  legend_position: str = "right",
762
786
  render_ocr: bool = False,
763
- **kwargs, # Pass other args to pdfplumber.page.to_image if needed
787
+ **kwargs, # Pass other args to pdfplumber.page.to_image if needed (internal API)
764
788
  ) -> Optional[Image.Image]:
765
789
  """
766
790
  Renders a specific page with its highlights.
@@ -773,7 +797,7 @@ class HighlightingService:
773
797
  labels: Whether to include a legend for highlights.
774
798
  legend_position: Position of the legend.
775
799
  render_ocr: Whether to render OCR text on the image.
776
- kwargs: Additional keyword arguments for pdfplumber's page.to_image (e.g., width, height).
800
+ kwargs: Additional keyword arguments for pdfplumber's internal page.to_image (e.g., width, height).
777
801
 
778
802
  Returns:
779
803
  A PIL Image object of the rendered page, or None if rendering fails.
@@ -957,7 +981,7 @@ class HighlightingService:
957
981
  crop_bbox: Optional bounding box (x0, top, x1, bottom) in PDF coordinate
958
982
  space to crop the output image to, before legends or other overlays are
959
983
  applied. If None, no cropping is performed.
960
- **kwargs: Additional args for pdfplumber's to_image (e.g., width, height).
984
+ **kwargs: Additional args for pdfplumber's internal to_image (e.g., width, height).
961
985
 
962
986
  Returns:
963
987
  PIL Image of the preview, or None if rendering fails.