natural-pdf 0.2.13__tar.gz → 0.2.16__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (355) hide show
  1. {natural_pdf-0.2.13/natural_pdf.egg-info → natural_pdf-0.2.16}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/base.py +4 -1
  3. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/element_collection.py +153 -15
  4. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/rect.py +34 -0
  5. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/region.py +23 -1
  6. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/text.py +20 -2
  7. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/flows/element.py +47 -46
  8. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/selectors/parser.py +28 -1
  9. {natural_pdf-0.2.13 → natural_pdf-0.2.16/natural_pdf.egg-info}/PKG-INFO +1 -1
  10. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/analysis_framework.mdc +0 -0
  11. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/coding-style.mdc +0 -0
  12. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  13. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/minimal-comments.mdc +0 -0
  14. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  15. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  16. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.github/workflows/ci.yml +0 -0
  17. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.github/workflows/docs.yml +0 -0
  18. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.github/workflows/nightly-tutorials.yml +0 -0
  19. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.gitignore +0 -0
  20. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/.pre-commit-config.yaml +0 -0
  21. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/01-execute_notebooks.py +0 -0
  22. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/02-run_all_tutorials.sh +0 -0
  23. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/CLAUDE.md +0 -0
  24. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/LICENSE +0 -0
  25. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/MANIFEST.in +0 -0
  26. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/README.md +0 -0
  27. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/audit_packaging.py +0 -0
  28. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/check_run_md.sh +0 -0
  29. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/api/index.md +0 -0
  30. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/favicon.png +0 -0
  31. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/favicon.svg +0 -0
  32. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/javascripts/custom.js +0 -0
  33. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/logo.svg +0 -0
  34. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/sample-screen.png +0 -0
  35. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/social-preview.png +0 -0
  36. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/social-preview.svg +0 -0
  37. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/assets/stylesheets/custom.css +0 -0
  38. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/categorizing-documents/index.md +0 -0
  39. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/data-extraction/index.md +0 -0
  40. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/describe/index.md +0 -0
  41. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/document-qa/index.md +0 -0
  42. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/element-selection/index.md +0 -0
  43. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/extracting-clean-text/index.md +0 -0
  44. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/finetuning/index.md +0 -0
  45. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/fix-messy-tables/index.md +0 -0
  46. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/fix-messy-tables/table_1.csv +0 -0
  47. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/fix-messy-tables/table_2.csv +0 -0
  48. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/fix-messy-tables/table_3.csv +0 -0
  49. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/index.md +0 -0
  50. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/installation/index.md +0 -0
  51. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/interactive-widget/index.md +0 -0
  52. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/layout-analysis/index.md +0 -0
  53. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/loops-and-groups/index.md +0 -0
  54. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/ocr/index.md +0 -0
  55. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/pdf-navigation/index.md +0 -0
  56. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  57. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/process-forms-and-invoices/index.md +0 -0
  58. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/quick-reference/index.md +0 -0
  59. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/reflowing-pages/index.md +0 -0
  60. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/regions/index.md +0 -0
  61. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tables/index.md +0 -0
  62. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/text-analysis/index.md +0 -0
  63. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/01-loading-and-extraction.md +0 -0
  64. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/02-finding-elements.md +0 -0
  65. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/03-extracting-blocks.md +0 -0
  66. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/04-table-extraction.md +0 -0
  67. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/05-excluding-content.md +0 -0
  68. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/06-document-qa.md +0 -0
  69. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/07-layout-analysis.md +0 -0
  70. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/07-working-with-regions.md +0 -0
  71. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/08-spatial-navigation.md +0 -0
  72. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/09-section-extraction.md +0 -0
  73. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/10-form-field-extraction.md +0 -0
  74. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  75. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/12-ocr-integration.md +0 -0
  76. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/13-semantic-search.md +0 -0
  77. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/tutorials/14-categorizing-documents.md +0 -0
  78. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/visual-debugging/index.md +0 -0
  79. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/docs/visual-debugging/region.png +0 -0
  80. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/mkdocs.yml +0 -0
  81. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/__init__.py +0 -0
  82. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/__init__.py +0 -0
  83. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/guides.py +0 -0
  84. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/__init__.py +0 -0
  85. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/base.py +0 -0
  86. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/docling.py +0 -0
  87. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/gemini.py +0 -0
  88. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  89. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  90. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  91. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/paddle.py +0 -0
  92. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  93. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/surya.py +0 -0
  94. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  95. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/tatr.py +0 -0
  96. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/layout/yolo.py +0 -0
  97. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  98. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/text_options.py +0 -0
  99. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/text_structure.py +0 -0
  100. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/analyzers/utils.py +0 -0
  101. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/classification/manager.py +0 -0
  102. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/classification/mixin.py +0 -0
  103. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/classification/results.py +0 -0
  104. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/cli.py +0 -0
  105. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/collections/mixins.py +0 -0
  106. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/__init__.py +0 -0
  107. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/element_manager.py +0 -0
  108. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/highlighting_service.py +0 -0
  109. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/page.py +0 -0
  110. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/page_collection.py +0 -0
  111. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/page_groupby.py +0 -0
  112. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/pdf.py +0 -0
  113. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/pdf_collection.py +0 -0
  114. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/core/render_spec.py +0 -0
  115. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/describe/__init__.py +0 -0
  116. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/describe/base.py +0 -0
  117. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/describe/elements.py +0 -0
  118. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/describe/mixin.py +0 -0
  119. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/describe/summary.py +0 -0
  120. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/__init__.py +0 -0
  121. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/image.py +0 -0
  122. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/elements/line.py +0 -0
  123. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/export/mixin.py +0 -0
  124. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/__init__.py +0 -0
  125. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/base.py +0 -0
  126. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/data/__init__.py +0 -0
  127. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/data/pdf.ttf +0 -0
  128. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/data/sRGB.icc +0 -0
  129. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/hocr.py +0 -0
  130. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/hocr_font.py +0 -0
  131. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/original_pdf.py +0 -0
  132. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/paddleocr.py +0 -0
  133. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/exporters/searchable_pdf.py +0 -0
  134. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/extraction/manager.py +0 -0
  135. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/extraction/mixin.py +0 -0
  136. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/extraction/result.py +0 -0
  137. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/flows/__init__.py +0 -0
  138. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/flows/collections.py +0 -0
  139. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/flows/flow.py +0 -0
  140. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/flows/region.py +0 -0
  141. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/__init__.py +0 -0
  142. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/engine.py +0 -0
  143. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/engine_doctr.py +0 -0
  144. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/engine_easyocr.py +0 -0
  145. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/engine_paddle.py +0 -0
  146. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/engine_surya.py +0 -0
  147. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/ocr_factory.py +0 -0
  148. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/ocr_manager.py +0 -0
  149. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/ocr_options.py +0 -0
  150. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/ocr/utils.py +0 -0
  151. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/qa/__init__.py +0 -0
  152. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/qa/document_qa.py +0 -0
  153. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/qa/qa_result.py +0 -0
  154. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/__init__.py +0 -0
  155. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/lancedb_search_service.py +0 -0
  156. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/numpy_search_service.py +0 -0
  157. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/search_options.py +0 -0
  158. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/search_service_protocol.py +0 -0
  159. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/search/searchable_mixin.py +0 -0
  160. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/selectors/__init__.py +0 -0
  161. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/tables/__init__.py +0 -0
  162. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/tables/result.py +0 -0
  163. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/__init__.py +0 -0
  164. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  165. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/spa/css/style.css +0 -0
  166. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/spa/index.html +0 -0
  167. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/spa/js/app.js +0 -0
  168. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/templates/spa/words.txt +0 -0
  169. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/text_mixin.py +0 -0
  170. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/__init__.py +0 -0
  171. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/bidi_mirror.py +0 -0
  172. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/color_utils.py +0 -0
  173. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/debug.py +0 -0
  174. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/highlighting.py +0 -0
  175. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/identifiers.py +0 -0
  176. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/layout.py +0 -0
  177. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/locks.py +0 -0
  178. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/packaging.py +0 -0
  179. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/reading_order.py +0 -0
  180. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/text_extraction.py +0 -0
  181. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/utils/visualization.py +0 -0
  182. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/vision/__init__.py +0 -0
  183. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/vision/mixin.py +0 -0
  184. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/vision/results.py +0 -0
  185. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/vision/similarity.py +0 -0
  186. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/vision/template_matching.py +0 -0
  187. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/widgets/__init__.py +0 -0
  188. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf/widgets/viewer.py +0 -0
  189. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf.egg-info/SOURCES.txt +0 -0
  190. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf.egg-info/dependency_links.txt +0 -0
  191. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf.egg-info/entry_points.txt +0 -0
  192. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf.egg-info/requires.txt +0 -0
  193. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/natural_pdf.egg-info/top_level.txt +0 -0
  194. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/noxfile.py +0 -0
  195. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/memory_comparison.py +0 -0
  196. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/pdf_analyzer.py +0 -0
  197. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/performance_analysis.py +0 -0
  198. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  199. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  200. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  201. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  202. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/test_cleanup_methods.py +0 -0
  203. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/optimization/test_memory_fix.py +0 -0
  204. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/publish.sh +0 -0
  205. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/pyproject.toml +0 -0
  206. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/sample-screen.png +0 -0
  207. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/setup.cfg +0 -0
  208. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/fix_page_exclusions.py +0 -0
  209. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_draw_guides.py +0 -0
  210. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_draw_guides_interactive.py +0 -0
  211. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_exclusion_with_debug.py +0 -0
  212. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_find_exclusions_fix.py +0 -0
  213. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
  214. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_fix_real_pdf.py +0 -0
  215. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_fix_working.py +0 -0
  216. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_fixed_pdf_exclusions.py +0 -0
  217. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_guide_draw_notebook.py +0 -0
  218. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_horizontal_top_bottom.py +0 -0
  219. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_inline_js.py +0 -0
  220. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_marker_order.py +0 -0
  221. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_original_exclusions_now_work.py +0 -0
  222. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_pdf_exclusions_with_guides.py +0 -0
  223. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_region_exclusions_detailed.py +0 -0
  224. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_stripes_real_pdf.py +0 -0
  225. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_vertical_stripes.py +0 -0
  226. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_widget_functionality.py +0 -0
  227. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/temp/test_widget_simple.py +0 -0
  228. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/conftest.py +0 -0
  229. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/exporters/test_paddleocr_exporter.py +0 -0
  230. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_annotate.py +0 -0
  231. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_arabic_performance.py +0 -0
  232. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_arabic_real_world.py +0 -0
  233. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_color_conversion.py +0 -0
  234. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_color_hex_display.py +0 -0
  235. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_containment_geometry.py +0 -0
  236. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_elements.py +0 -0
  237. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_loading.py +0 -0
  238. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_spatial.py +0 -0
  239. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_text_extraction.py +0 -0
  240. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_core/test_text_layer.py +0 -0
  241. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_crop_enhancements.py +0 -0
  242. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_crop_region_highlights.py +0 -0
  243. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_directional_defaults.py +0 -0
  244. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve.py +0 -0
  245. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve_cross_page_bug.py +0 -0
  246. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve_debug_issue.py +0 -0
  247. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve_real_world_issue.py +0 -0
  248. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve_single_elements.py +0 -0
  249. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  250. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_document_qa.py +0 -0
  251. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_addition.py +0 -0
  252. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_collection_guides.py +0 -0
  253. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_collection_show_cols.py +0 -0
  254. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_collection_slicing.py +0 -0
  255. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_exclusions.py +0 -0
  256. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_element_show_crop_highlights.py +0 -0
  257. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_empty_pseudo_class.py +0 -0
  258. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_exclusions.py +0 -0
  259. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_expand.py +0 -0
  260. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_extraction_error.py +0 -0
  261. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_extraction_mixin_fix.py +0 -0
  262. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_extraction_text_and_vision.py +0 -0
  263. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_extraction_working.py +0 -0
  264. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_find_similar.py +0 -0
  265. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_first_last_selectors.py +0 -0
  266. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_fix_get_sections_zero_height.py +0 -0
  267. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_flow_region_directional.py +0 -0
  268. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_get_sections_fix_comprehensive.py +0 -0
  269. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_get_sections_zero_height.py +0 -0
  270. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_groupby.py +0 -0
  271. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides.py +0 -0
  272. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_apply_exclusions.py +0 -0
  273. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_apply_exclusions_simple.py +0 -0
  274. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_extract_table.py +0 -0
  275. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_extract_table_collections.py +0 -0
  276. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_extract_table_exclusions.py +0 -0
  277. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_extract_table_real.py +0 -0
  278. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_from_stripes.py +0 -0
  279. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_integration.py +0 -0
  280. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_guides_marker_sorting.py +0 -0
  281. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_detection.py +0 -0
  282. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_detection_comprehensive.py +0 -0
  283. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_offset.py +0 -0
  284. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_protocol.py +0 -0
  285. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_protocol_simple.py +0 -0
  286. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_highlight_regions.py +0 -0
  287. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_horizontal_guides_alignment.py +0 -0
  288. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_comprehensive.py +0 -0
  289. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_debug.py +0 -0
  290. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_final.py +0 -0
  291. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_final_verification.py +0 -0
  292. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_fix.py +0 -0
  293. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_mock.py +0 -0
  294. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_simple.py +0 -0
  295. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_types_pdf.py +0 -0
  296. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_verification.py +0 -0
  297. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_include_boundaries_with_real_text.py +0 -0
  298. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_loading_original.py +0 -0
  299. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_match_results_sorting.py +0 -0
  300. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_merge_connected.py +0 -0
  301. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_merge_connected_real_world.py +0 -0
  302. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_merge_method.py +0 -0
  303. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_multi_page_table_discovery.py +0 -0
  304. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_negative_bounds_pdf.py +0 -0
  305. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_optional_deps.py +0 -0
  306. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_page_exclusion_lists.py +0 -0
  307. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  308. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
  309. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_phash_masking.py +0 -0
  310. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_region_find_similar.py +0 -0
  311. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_region_show_crop_highlights.py +0 -0
  312. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_region_viewer.py +0 -0
  313. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_sections_end_only.py +0 -0
  314. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_sections_with_start_and_end.py +0 -0
  315. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_show_column_layout.py +0 -0
  316. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_show_edge_cases.py +0 -0
  317. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_show_exclusions.py +0 -0
  318. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_show_exclusions_feature.py +0 -0
  319. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_show_limit.py +0 -0
  320. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_skip_repeating_headers_multipage.py +0 -0
  321. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_slice_cache_reuse.py +0 -0
  322. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_slice_exclusion_fix.py +0 -0
  323. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_slice_exclusion_issue.py +0 -0
  324. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_slice_exclusion_mock.py +0 -0
  325. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_sliced_collection_exclusions.py +0 -0
  326. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_spatial_offset.py +0 -0
  327. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_strikethrough_detection.py +0 -0
  328. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_table_result_header_mismatch.py +0 -0
  329. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_table_result_keep_blank.py +0 -0
  330. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_template_matching.py +0 -0
  331. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_template_white_masking.py +0 -0
  332. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_tiny_text_tables.py +0 -0
  333. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_tiny_text_tables_table.py +0 -0
  334. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_tutorials.py +0 -0
  335. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_underline_detection.py +0 -0
  336. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tests/test_update_text.py +0 -0
  337. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/todo/bad_pdf_analysis.md +0 -0
  338. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/todo/evaluation.md +0 -0
  339. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  340. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  341. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  342. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/README.md +0 -0
  343. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/__init__.py +0 -0
  344. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/analyser.py +0 -0
  345. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  346. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  347. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/eval_suite.py +0 -0
  348. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  349. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  350. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  351. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  352. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  353. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/reporter.py +0 -0
  354. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/tools/bad_pdf_eval/utils.py +0 -0
  355. {natural_pdf-0.2.13 → natural_pdf-0.2.16}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.13
3
+ Version: 0.2.16
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -106,7 +106,7 @@ class DirectionalMixin:
106
106
  include_source: bool = False,
107
107
  until: Optional[str] = None,
108
108
  include_endpoint: bool = True,
109
- offset: float = 0.1,
109
+ offset: float = 0.0,
110
110
  **kwargs,
111
111
  ) -> "Region":
112
112
  """
@@ -1209,6 +1209,9 @@ class Element(
1209
1209
 
1210
1210
  return self
1211
1211
 
1212
+ def exclude(self):
1213
+ self.page.add_exclusion(self)
1214
+
1212
1215
  def _get_render_specs(
1213
1216
  self,
1214
1217
  mode: Literal["show", "render"] = "show",
@@ -888,6 +888,9 @@ class ElementCollection(
888
888
  self._elements.sort(key=key, reverse=reverse)
889
889
  return self
890
890
 
891
+ def exclude(self):
892
+ self.page.add_exclusion(self)
893
+
891
894
  def highlight(
892
895
  self,
893
896
  label: Optional[str] = None,
@@ -1902,13 +1905,87 @@ class ElementCollection(
1902
1905
 
1903
1906
  return ElementCollection(all_found_elements)
1904
1907
 
1905
- def extract_each_text(self, **kwargs) -> List[str]:
1906
- """
1907
- Extract text from each element in this region.
1908
+ def extract_each_text(
1909
+ self,
1910
+ order: Optional[Union[str, Callable[[T], Any]]] = None,
1911
+ *,
1912
+ newlines: bool = True,
1913
+ **kwargs,
1914
+ ) -> List[str]:
1915
+ """Return a list with the extracted text for every element.
1916
+
1917
+ Parameters
1918
+ ----------
1919
+ order
1920
+ Controls the ordering of elements **before** extraction:
1921
+
1922
+ * ``None`` (default) – keep the collection's current order.
1923
+ * ``callable`` – a function that will be used as the sort ``key`` (as for :py:func:`sorted`).
1924
+ * ``"ltr"`` – left-to-right ordering (page, then x0, then y-top).
1925
+ * ``"rtl"`` – right-to-left ordering (page, then −x0, then y-top).
1926
+ * ``"natural"`` – natural reading order (page, then y-top, then x0).
1927
+
1928
+ Remaining keyword arguments are forwarded to each element's
1929
+ :py:meth:`extract_text` method.
1908
1930
  """
1909
- return self.apply(
1910
- lambda element: element.extract_text(**kwargs) if element is not None else None
1911
- )
1931
+
1932
+ # -- Determine ordering --------------------------------------------------
1933
+ elements: List[T] = list(self._elements) # make a shallow copy we can sort
1934
+
1935
+ if order is not None and len(elements) > 1:
1936
+ try:
1937
+ if callable(order):
1938
+ elements.sort(key=order)
1939
+ elif isinstance(order, str):
1940
+ preset = order.lower()
1941
+ if preset in {"ltr", "left-to-right"}:
1942
+ elements.sort(
1943
+ key=lambda el: (
1944
+ (
1945
+ getattr(el, "page", None).index
1946
+ if hasattr(el, "page") and el.page
1947
+ else 0
1948
+ ),
1949
+ getattr(el, "x0", 0),
1950
+ getattr(el, "top", 0),
1951
+ )
1952
+ )
1953
+ elif preset in {"rtl", "right-to-left"}:
1954
+ elements.sort(
1955
+ key=lambda el: (
1956
+ (
1957
+ getattr(el, "page", None).index
1958
+ if hasattr(el, "page") and el.page
1959
+ else 0
1960
+ ),
1961
+ -getattr(el, "x0", 0),
1962
+ getattr(el, "top", 0),
1963
+ )
1964
+ )
1965
+ elif preset in {"natural", "tdlr", "top-down"}:
1966
+ elements.sort(
1967
+ key=lambda el: (
1968
+ (
1969
+ getattr(el, "page", None).index
1970
+ if hasattr(el, "page") and el.page
1971
+ else 0
1972
+ ),
1973
+ getattr(el, "top", 0),
1974
+ getattr(el, "x0", 0),
1975
+ )
1976
+ )
1977
+ else:
1978
+ # Unknown preset – silently ignore to keep original order
1979
+ pass
1980
+ except Exception:
1981
+ # If anything goes wrong, fall back to original order
1982
+ pass
1983
+
1984
+ # -- Extract ----------------------------------------------------------------
1985
+ return [
1986
+ el.extract_text(newlines=newlines, **kwargs) if el is not None else None # type: ignore[arg-type]
1987
+ for el in elements
1988
+ ]
1912
1989
 
1913
1990
  def correct_ocr(
1914
1991
  self,
@@ -2673,10 +2750,17 @@ class ElementCollection(
2673
2750
  else:
2674
2751
  v_dist = 0 # Vertically overlapping
2675
2752
 
2676
- # Use Chebyshev distance (max of horizontal and vertical)
2677
- # This creates a square proximity zone
2678
- distance = max(h_dist, v_dist)
2753
+ # ------------------------------------------------------------------
2754
+ # Decide connection logic based on vertical_gap parameter
2755
+ # ------------------------------------------------------------------
2756
+ if vertical_gap is not None:
2757
+ # Consider elements connected when they vertically stack within
2758
+ # the allowed gap **and** have some horizontal overlap
2759
+ horizontal_overlap = not (h_dist > 0)
2760
+ return horizontal_overlap and v_dist <= vertical_gap
2679
2761
 
2762
+ # Fallback to legacy Chebyshev distance using ``threshold``
2763
+ distance = max(h_dist, v_dist)
2680
2764
  return distance <= threshold
2681
2765
 
2682
2766
  def _merge_region_group(
@@ -2752,6 +2836,9 @@ class ElementCollection(
2752
2836
  def dissolve(
2753
2837
  self,
2754
2838
  padding: float = 2.0,
2839
+ *,
2840
+ vertical_gap: Optional[float] = None,
2841
+ vertical: Optional[bool] = False,
2755
2842
  geometry: Literal["rect", "polygon"] = "rect",
2756
2843
  group_by: List[str] = None,
2757
2844
  ) -> "ElementCollection":
@@ -2764,8 +2851,19 @@ class ElementCollection(
2764
2851
  bounding boxes.
2765
2852
 
2766
2853
  Args:
2767
- padding: Maximum distance in points between elements to consider
2768
- them connected. Default is 2.0 points.
2854
+ padding: Maximum Chebyshev distance (in any direction) between
2855
+ elements to consider them connected **when ``vertical_gap`` is
2856
+ not provided**. Default 2.0 pt.
2857
+
2858
+ vertical_gap: If given, switches to *stack-aware* dissolve:
2859
+ two elements are connected when their horizontal projections
2860
+ overlap (any amount) **and** the vertical distance between them
2861
+ is ≤ ``vertical_gap``. This lets you combine multi-line labels
2862
+ that share the same column but have blank space between lines.
2863
+
2864
+ vertical: If True, sets vertical_gap to infinity so that horizontally
2865
+ overlapping elements are stacked regardless of vertical distance.
2866
+
2769
2867
  geometry: Type of geometry to use for merged regions. Currently only
2770
2868
  "rect" (bounding box) is supported. "polygon" will raise
2771
2869
  NotImplementedError.
@@ -2807,6 +2905,9 @@ class ElementCollection(
2807
2905
  if geometry not in ["rect", "polygon"]:
2808
2906
  raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")
2809
2907
 
2908
+ if vertical:
2909
+ vertical_gap = float("inf")
2910
+
2810
2911
  from natural_pdf.elements.region import Region
2811
2912
 
2812
2913
  # Filter to elements with bbox (all elements that can be dissolved)
@@ -2835,7 +2936,9 @@ class ElementCollection(
2835
2936
  logger.debug(f"Processing group {group_key} with {len(group_elements)} elements")
2836
2937
 
2837
2938
  # Find connected components within this group
2838
- components = self._find_connected_components_elements(group_elements, padding)
2939
+ components = self._find_connected_components_elements(
2940
+ group_elements, padding, vertical_gap
2941
+ )
2839
2942
 
2840
2943
  # Merge each component
2841
2944
  for component_elements in components:
@@ -2894,7 +2997,7 @@ class ElementCollection(
2894
2997
  return groups
2895
2998
 
2896
2999
  def _find_connected_components_elements(
2897
- self, elements: List["Element"], padding: float
3000
+ self, elements: List["Element"], padding: float, vertical_gap: Optional[float] = None
2898
3001
  ) -> List[List["Element"]]:
2899
3002
  """Find connected components among elements using union-find."""
2900
3003
  if not elements:
@@ -2919,7 +3022,7 @@ class ElementCollection(
2919
3022
  # Check all pairs of elements for connectivity
2920
3023
  for i in range(len(elements)):
2921
3024
  for j in range(i + 1, len(elements)):
2922
- if self._are_elements_connected(elements[i], elements[j], padding):
3025
+ if self._are_elements_connected(elements[i], elements[j], padding, vertical_gap):
2923
3026
  union(i, j)
2924
3027
 
2925
3028
  # Group elements by their connected component
@@ -3004,7 +3107,9 @@ class ElementCollection(
3004
3107
 
3005
3108
  return merged_region
3006
3109
 
3007
- def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
3110
+ def _are_elements_connected(
3111
+ self, elem1: "Element", elem2: "Element", threshold: float, vertical_gap: float | None
3112
+ ) -> bool:
3008
3113
  """Check if two elements are connected (adjacent or overlapping)."""
3009
3114
  # Check if elements are on the same page
3010
3115
  # Handle edge cases where elements might not have a page attribute
@@ -3057,6 +3162,12 @@ class ElementCollection(
3057
3162
  # This creates a square proximity zone
3058
3163
  distance = max(h_dist, v_dist)
3059
3164
 
3165
+ if vertical_gap is not None:
3166
+ # 1. vertical distance ≤ vertical_gap
3167
+ # 2. horizontal ranges overlap OR touch
3168
+ h_overlap = (min(x1_1, x1_2) - max(x0_1, x0_2)) >= 0
3169
+ return h_overlap and v_dist <= vertical_gap
3170
+
3060
3171
  return distance <= threshold
3061
3172
 
3062
3173
  def _copy_element_attributes_to_region(
@@ -3163,3 +3274,30 @@ class ElementCollection(
3163
3274
  return self
3164
3275
 
3165
3276
  # ------------------------------------------------------------------
3277
+
3278
+ # ------------------------------------------------------------------
3279
+ # Public alias: combine
3280
+ # ------------------------------------------------------------------
3281
+ def combine(
3282
+ self,
3283
+ padding: float = 2.0,
3284
+ *,
3285
+ vertical_gap: Optional[float] = None,
3286
+ vertical: Optional[bool] = False,
3287
+ geometry: Literal["rect", "polygon"] = "rect",
3288
+ group_by: List[str] = None,
3289
+ ) -> "ElementCollection":
3290
+ """Alias for :py:meth:`dissolve` – retained for discoverability.
3291
+
3292
+ Many users find the verb *combine* more intuitive than *dissolve* when
3293
+ merging nearby or stacked elements into unified Regions. The parameters
3294
+ are identical; see :py:meth:`dissolve` for full documentation.
3295
+ """
3296
+
3297
+ return self.dissolve(
3298
+ padding=padding,
3299
+ vertical_gap=vertical_gap,
3300
+ vertical=vertical,
3301
+ geometry=geometry,
3302
+ group_by=group_by,
3303
+ )
@@ -88,6 +88,40 @@ class RectangleElement(Element):
88
88
  """Get the stroke width of the rectangle."""
89
89
  return self._obj.get("linewidth", 0)
90
90
 
91
+ @property
92
+ def is_horizontal(self) -> bool:
93
+ """Check if this is a horizontal line based on coordinates."""
94
+ # Calculate absolute difference in coordinates
95
+ dx = abs(self.x1 - self.x0)
96
+ dy = abs(self.top - self.bottom)
97
+
98
+ # Define a tolerance for near-horizontal lines (e.g., 1 point)
99
+ tolerance = 1.0
100
+
101
+ # Horizontal if y-change is within tolerance and x-change is significant
102
+ return dy <= tolerance and dx > tolerance
103
+
104
+ @property
105
+ def is_vertical(self) -> bool:
106
+ """Check if this is a vertical line based on coordinates."""
107
+ # Calculate absolute difference in coordinates
108
+ dx = abs(self.x1 - self.x0)
109
+ dy = abs(self.top - self.bottom)
110
+
111
+ # Define a tolerance for near-vertical lines (e.g., 1 point)
112
+ tolerance = 1.0
113
+
114
+ # Vertical if x-change is within tolerance and y-change is significant
115
+ return dx <= tolerance and dy > tolerance
116
+
117
+ @property
118
+ def orientation(self) -> str:
119
+ """Get the orientation of the line ('horizontal', 'vertical', or 'diagonal')."""
120
+ if self.is_horizontal:
121
+ return "horizontal"
122
+ elif self.is_vertical:
123
+ return "vertical"
124
+
91
125
  def extract_text(self, **kwargs) -> str:
92
126
  """
93
127
  Extract text from inside this rectangle.
@@ -738,6 +738,9 @@ class Region(
738
738
  and self.bottom > element.top
739
739
  )
740
740
 
741
+ def exclude(self):
742
+ self.page.add_exclusion(self)
743
+
741
744
  def highlight(
742
745
  self,
743
746
  label: Optional[str] = None,
@@ -1229,7 +1232,13 @@ class Region(
1229
1232
  return [e for e in page_elements if self._is_element_in_region(e)]
1230
1233
 
1231
1234
  def extract_text(
1232
- self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
1235
+ self,
1236
+ apply_exclusions: bool = True,
1237
+ debug: bool = False,
1238
+ *,
1239
+ newlines: Union[bool, str] = True,
1240
+ content_filter=None,
1241
+ **kwargs,
1233
1242
  ) -> str:
1234
1243
  """
1235
1244
  Extract text from this region, respecting page exclusions and using pdfplumber's
@@ -1238,6 +1247,7 @@ class Region(
1238
1247
  Args:
1239
1248
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
1240
1249
  debug: Enable verbose debugging output for filtering steps.
1250
+ newlines: Newline handling: True keeps newlines, False replaces each with a space, and a string replaces each with that string.
1241
1251
  content_filter: Optional content filter to exclude specific text patterns. Can be:
1242
1252
  - A regex pattern string (characters matching the pattern are EXCLUDED)
1243
1253
  - A callable that takes text and returns True to KEEP the character
@@ -1311,6 +1321,18 @@ class Region(
1311
1321
  user_kwargs=final_kwargs, # Pass kwargs including content_filter
1312
1322
  )
1313
1323
 
1324
+ # Flexible newline handling (same logic as TextElement)
1325
+ if isinstance(newlines, bool):
1326
+ if newlines is False:
1327
+ replacement = " "
1328
+ else:
1329
+ replacement = None
1330
+ else:
1331
+ replacement = str(newlines)
1332
+
1333
+ if replacement is not None:
1334
+ result = result.replace("\n", replacement).replace("\r", replacement)
1335
+
1314
1336
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
1315
1337
  return result
1316
1338
 
@@ -2,7 +2,7 @@
2
2
  Text element classes for natural-pdf.
3
3
  """
4
4
 
5
- from typing import TYPE_CHECKING, Any, Dict, Optional
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union
6
6
 
7
7
  from natural_pdf.elements.base import Element
8
8
 
@@ -236,7 +236,13 @@ class TextElement(Element):
236
236
  return (0, 0, 0)
237
237
 
238
238
  def extract_text(
239
- self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
239
+ self,
240
+ keep_blank_chars: bool = True,
241
+ strip: Optional[bool] = True,
242
+ *,
243
+ newlines: Union[bool, str] = True,
244
+ content_filter=None,
245
+ **kwargs,
240
246
  ) -> str:
241
247
  """
242
248
  Extract text from this element.
@@ -292,6 +298,18 @@ class TextElement(Element):
292
298
  if strip:
293
299
  result = result.strip()
294
300
 
301
+ # Flexible newline handling
302
+ if isinstance(newlines, bool):
303
+ if newlines is False:
304
+ replacement = " " # single space when False
305
+ else:
306
+ replacement = None # keep as-is when True
307
+ else:
308
+ replacement = str(newlines)
309
+
310
+ if replacement is not None:
311
+ result = result.replace("\n", replacement).replace("\r", replacement)
312
+
295
313
  return result
296
314
 
297
315
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
@@ -106,6 +106,7 @@ class FlowElement:
106
106
  cross_size_absolute: Optional[float] = None,
107
107
  cross_alignment: str = "center", # "start", "center", "end"
108
108
  until: Optional[str] = None,
109
+ include_source: bool = False,
109
110
  include_endpoint: bool = True,
110
111
  **kwargs,
111
112
  ) -> "FlowRegion":
@@ -178,13 +179,9 @@ class FlowElement:
178
179
  is_forward = False
179
180
  segment_iterator = range(start_segment_index, -1, -1)
180
181
  elif direction == "right":
181
- if is_primary_vertical:
182
- raise NotImplementedError("'right' is for horizontal flows.")
183
182
  is_forward = True
184
183
  segment_iterator = range(start_segment_index, len(self.flow.segments))
185
184
  elif direction == "left":
186
- if is_primary_vertical:
187
- raise NotImplementedError("'left' is for horizontal flows.")
188
185
  is_forward = False
189
186
  segment_iterator = range(start_segment_index, -1, -1)
190
187
  else:
@@ -206,28 +203,34 @@ class FlowElement:
206
203
  "direction": direction,
207
204
  "until": until,
208
205
  "include_endpoint": include_endpoint,
206
+ "include_source": include_source,
209
207
  **kwargs,
210
208
  }
211
209
 
212
- # --- Cross-size logic: Default to "full" if no specific ratio or absolute is given ---
210
+ # --- Cross-size logic: Default based on direction ---
213
211
  cross_size_for_op: Union[str, float]
214
212
  if cross_size_absolute is not None:
215
213
  cross_size_for_op = cross_size_absolute
216
214
  elif cross_size_ratio is not None: # User explicitly provided a ratio
215
+ # Cross dimension depends on direction, not flow arrangement
217
216
  base_cross_dim = (
218
217
  self.physical_object.width
219
- if is_primary_vertical
218
+ if direction in ["above", "below"]
220
219
  else self.physical_object.height
221
220
  )
222
221
  cross_size_for_op = base_cross_dim * cross_size_ratio
223
- else: # Default case: neither absolute nor ratio provided, so use "full"
224
- cross_size_for_op = "full"
222
+ else: # Default case: neither absolute nor ratio provided
223
+ # Default to element size for left/right, full for above/below
224
+ if direction in ["left", "right"]:
225
+ cross_size_for_op = self.physical_object.height
226
+ else:
227
+ cross_size_for_op = "full"
225
228
  op_direction_params["cross_size"] = cross_size_for_op
226
229
 
227
230
  if current_segment_idx == start_segment_index:
228
231
  op_source = self.physical_object
229
232
  op_direction_params["size"] = remaining_size if size is not None else None
230
- op_direction_params["include_source"] = False
233
+ op_direction_params["include_source"] = include_source
231
234
 
232
235
  source_for_op_call = op_source
233
236
  if not isinstance(source_for_op_call, PhysicalRegion_Class):
@@ -245,7 +248,7 @@ class FlowElement:
245
248
  "size": remaining_size if size is not None else None,
246
249
  "cross_size": cross_size_for_op,
247
250
  "cross_alignment": cross_alignment, # Pass alignment
248
- "include_source": False,
251
+ "include_source": include_source,
249
252
  # Pass other relevant kwargs if Region._direction uses them (e.g. strict_type)
250
253
  **{k: v for k, v in kwargs.items() if k in ["strict_type", "first_match_only"]},
251
254
  }
@@ -283,7 +286,7 @@ class FlowElement:
283
286
  if potential_hit:
284
287
  boundary_element_hit = potential_hit # Set the overall boundary flag
285
288
  # Adjust segment_contribution to stop at this boundary_element_hit.
286
- if is_primary_vertical:
289
+ if direction in ["below", "above"]:
287
290
  if direction == "below":
288
291
  edge = (
289
292
  boundary_element_hit.bottom
@@ -300,7 +303,7 @@ class FlowElement:
300
303
  bottom=edge if direction == "below" else None,
301
304
  top=edge if direction == "above" else None,
302
305
  )
303
- else:
306
+ else: # direction in ["right", "left"]
304
307
  if direction == "right":
305
308
  edge = (
306
309
  boundary_element_hit.x1
@@ -338,7 +341,7 @@ class FlowElement:
338
341
 
339
342
  if potential_hit:
340
343
  boundary_element_hit = potential_hit
341
- if is_primary_vertical:
344
+ if direction in ["below", "above"]:
342
345
  if direction == "below":
343
346
  edge = (
344
347
  boundary_element_hit.bottom
@@ -355,7 +358,7 @@ class FlowElement:
355
358
  bottom=edge if direction == "below" else None,
356
359
  top=edge if direction == "above" else None,
357
360
  )
358
- else:
361
+ else: # direction in ["right", "left"]
359
362
  if direction == "right":
360
363
  edge = (
361
364
  boundary_element_hit.x1
@@ -381,7 +384,7 @@ class FlowElement:
381
384
  and size is not None
382
385
  ):
383
386
  current_part_consumed_size = 0.0
384
- if is_primary_vertical:
387
+ if direction in ["below", "above"]:
385
388
  current_part_consumed_size = segment_contribution.height
386
389
  if current_part_consumed_size > remaining_size:
387
390
  new_edge = (
@@ -394,7 +397,7 @@ class FlowElement:
394
397
  top=new_edge if not is_forward else None,
395
398
  )
396
399
  current_part_consumed_size = remaining_size
397
- else:
400
+ else: # direction in ["left", "right"]
398
401
  current_part_consumed_size = segment_contribution.width
399
402
  if current_part_consumed_size > remaining_size:
400
403
  new_edge = (
@@ -451,6 +454,7 @@ class FlowElement:
451
454
  width_absolute: Optional[float] = None,
452
455
  width_alignment: str = "center",
453
456
  until: Optional[str] = None,
457
+ include_source: bool = False,
454
458
  include_endpoint: bool = True,
455
459
  **kwargs,
456
460
  ) -> "FlowRegion": # Stringized
@@ -462,6 +466,7 @@ class FlowElement:
462
466
  cross_size_absolute=width_absolute,
463
467
  cross_alignment=width_alignment,
464
468
  until=until,
469
+ include_source=include_source,
465
470
  include_endpoint=include_endpoint,
466
471
  **kwargs,
467
472
  )
@@ -477,6 +482,7 @@ class FlowElement:
477
482
  width_absolute: Optional[float] = None,
478
483
  width_alignment: str = "center",
479
484
  until: Optional[str] = None,
485
+ include_source: bool = False,
480
486
  include_endpoint: bool = True,
481
487
  **kwargs,
482
488
  ) -> "FlowRegion": # Stringized
@@ -488,6 +494,7 @@ class FlowElement:
488
494
  cross_size_absolute=width_absolute,
489
495
  cross_alignment=width_alignment,
490
496
  until=until,
497
+ include_source=include_source,
491
498
  include_endpoint=include_endpoint,
492
499
  **kwargs,
493
500
  )
@@ -503,24 +510,21 @@ class FlowElement:
503
510
  height_absolute: Optional[float] = None,
504
511
  height_alignment: str = "center",
505
512
  until: Optional[str] = None,
513
+ include_source: bool = False,
506
514
  include_endpoint: bool = True,
507
515
  **kwargs,
508
516
  ) -> "FlowRegion": # Stringized
509
- if self.flow.arrangement == "horizontal":
510
- return self._flow_direction(
511
- direction="left",
512
- size=width,
513
- cross_size_ratio=height_ratio,
514
- cross_size_absolute=height_absolute,
515
- cross_alignment=height_alignment,
516
- until=until,
517
- include_endpoint=include_endpoint,
518
- **kwargs,
519
- )
520
- else:
521
- raise NotImplementedError(
522
- "'left' in a vertical flow is ambiguous with current 1D flow logic and not yet implemented."
523
- )
517
+ return self._flow_direction(
518
+ direction="left",
519
+ size=width,
520
+ cross_size_ratio=height_ratio,
521
+ cross_size_absolute=height_absolute,
522
+ cross_alignment=height_alignment,
523
+ until=until,
524
+ include_source=include_source,
525
+ include_endpoint=include_endpoint,
526
+ **kwargs,
527
+ )
524
528
 
525
529
  def right(
526
530
  self,
@@ -529,24 +533,21 @@ class FlowElement:
529
533
  height_absolute: Optional[float] = None,
530
534
  height_alignment: str = "center",
531
535
  until: Optional[str] = None,
536
+ include_source: bool = False,
532
537
  include_endpoint: bool = True,
533
538
  **kwargs,
534
539
  ) -> "FlowRegion": # Stringized
535
- if self.flow.arrangement == "horizontal":
536
- return self._flow_direction(
537
- direction="right",
538
- size=width,
539
- cross_size_ratio=height_ratio,
540
- cross_size_absolute=height_absolute,
541
- cross_alignment=height_alignment,
542
- until=until,
543
- include_endpoint=include_endpoint,
544
- **kwargs,
545
- )
546
- else:
547
- raise NotImplementedError(
548
- "'right' in a vertical flow is ambiguous with current 1D flow logic and not yet implemented."
549
- )
540
+ return self._flow_direction(
541
+ direction="right",
542
+ size=width,
543
+ cross_size_ratio=height_ratio,
544
+ cross_size_absolute=height_absolute,
545
+ cross_alignment=height_alignment,
546
+ until=until,
547
+ include_source=include_source,
548
+ include_endpoint=include_endpoint,
549
+ **kwargs,
550
+ )
550
551
 
551
552
  def __repr__(self) -> str:
552
553
  return f"<FlowElement for {self.physical_object.__class__.__name__} {self.bbox} in {self.flow}>"