natural-pdf 0.2.13__tar.gz → 0.2.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (355)
  1. {natural_pdf-0.2.13/natural_pdf.egg-info → natural_pdf-0.2.15}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/base.py +4 -1
  3. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/element_collection.py +153 -15
  4. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/rect.py +34 -0
  5. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/region.py +23 -1
  6. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/text.py +20 -2
  7. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/selectors/parser.py +28 -1
  8. {natural_pdf-0.2.13 → natural_pdf-0.2.15/natural_pdf.egg-info}/PKG-INFO +1 -1
  9. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/analysis_framework.mdc +0 -0
  10. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/coding-style.mdc +0 -0
  11. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  12. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/minimal-comments.mdc +0 -0
  13. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  14. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  15. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.github/workflows/ci.yml +0 -0
  16. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.github/workflows/docs.yml +0 -0
  17. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.github/workflows/nightly-tutorials.yml +0 -0
  18. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.gitignore +0 -0
  19. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/.pre-commit-config.yaml +0 -0
  20. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/01-execute_notebooks.py +0 -0
  21. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/02-run_all_tutorials.sh +0 -0
  22. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/CLAUDE.md +0 -0
  23. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/LICENSE +0 -0
  24. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/MANIFEST.in +0 -0
  25. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/README.md +0 -0
  26. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/audit_packaging.py +0 -0
  27. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/check_run_md.sh +0 -0
  28. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/api/index.md +0 -0
  29. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/favicon.png +0 -0
  30. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/favicon.svg +0 -0
  31. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/javascripts/custom.js +0 -0
  32. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/logo.svg +0 -0
  33. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/sample-screen.png +0 -0
  34. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/social-preview.png +0 -0
  35. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/social-preview.svg +0 -0
  36. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/assets/stylesheets/custom.css +0 -0
  37. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/categorizing-documents/index.md +0 -0
  38. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/data-extraction/index.md +0 -0
  39. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/describe/index.md +0 -0
  40. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/document-qa/index.md +0 -0
  41. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/element-selection/index.md +0 -0
  42. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/extracting-clean-text/index.md +0 -0
  43. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/finetuning/index.md +0 -0
  44. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/fix-messy-tables/index.md +0 -0
  45. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/fix-messy-tables/table_1.csv +0 -0
  46. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/fix-messy-tables/table_2.csv +0 -0
  47. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/fix-messy-tables/table_3.csv +0 -0
  48. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/index.md +0 -0
  49. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/installation/index.md +0 -0
  50. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/interactive-widget/index.md +0 -0
  51. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/layout-analysis/index.md +0 -0
  52. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/loops-and-groups/index.md +0 -0
  53. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/ocr/index.md +0 -0
  54. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/pdf-navigation/index.md +0 -0
  55. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  56. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/process-forms-and-invoices/index.md +0 -0
  57. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/quick-reference/index.md +0 -0
  58. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/reflowing-pages/index.md +0 -0
  59. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/regions/index.md +0 -0
  60. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tables/index.md +0 -0
  61. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/text-analysis/index.md +0 -0
  62. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/01-loading-and-extraction.md +0 -0
  63. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/02-finding-elements.md +0 -0
  64. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/03-extracting-blocks.md +0 -0
  65. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/04-table-extraction.md +0 -0
  66. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/05-excluding-content.md +0 -0
  67. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/06-document-qa.md +0 -0
  68. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/07-layout-analysis.md +0 -0
  69. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/07-working-with-regions.md +0 -0
  70. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/08-spatial-navigation.md +0 -0
  71. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/09-section-extraction.md +0 -0
  72. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/10-form-field-extraction.md +0 -0
  73. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  74. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/12-ocr-integration.md +0 -0
  75. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/13-semantic-search.md +0 -0
  76. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/tutorials/14-categorizing-documents.md +0 -0
  77. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/visual-debugging/index.md +0 -0
  78. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/docs/visual-debugging/region.png +0 -0
  79. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/mkdocs.yml +0 -0
  80. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/__init__.py +0 -0
  81. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/__init__.py +0 -0
  82. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/guides.py +0 -0
  83. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/__init__.py +0 -0
  84. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/base.py +0 -0
  85. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/docling.py +0 -0
  86. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/gemini.py +0 -0
  87. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  88. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  89. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  90. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/paddle.py +0 -0
  91. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  92. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/surya.py +0 -0
  93. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  94. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/tatr.py +0 -0
  95. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/layout/yolo.py +0 -0
  96. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  97. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/text_options.py +0 -0
  98. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/text_structure.py +0 -0
  99. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/analyzers/utils.py +0 -0
  100. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/classification/manager.py +0 -0
  101. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/classification/mixin.py +0 -0
  102. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/classification/results.py +0 -0
  103. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/cli.py +0 -0
  104. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/collections/mixins.py +0 -0
  105. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/__init__.py +0 -0
  106. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/element_manager.py +0 -0
  107. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/highlighting_service.py +0 -0
  108. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/page.py +0 -0
  109. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/page_collection.py +0 -0
  110. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/page_groupby.py +0 -0
  111. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/pdf.py +0 -0
  112. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/pdf_collection.py +0 -0
  113. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/core/render_spec.py +0 -0
  114. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/describe/__init__.py +0 -0
  115. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/describe/base.py +0 -0
  116. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/describe/elements.py +0 -0
  117. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/describe/mixin.py +0 -0
  118. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/describe/summary.py +0 -0
  119. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/__init__.py +0 -0
  120. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/image.py +0 -0
  121. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/elements/line.py +0 -0
  122. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/export/mixin.py +0 -0
  123. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/__init__.py +0 -0
  124. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/base.py +0 -0
  125. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/data/__init__.py +0 -0
  126. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/data/pdf.ttf +0 -0
  127. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/data/sRGB.icc +0 -0
  128. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/hocr.py +0 -0
  129. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/hocr_font.py +0 -0
  130. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/original_pdf.py +0 -0
  131. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/paddleocr.py +0 -0
  132. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/exporters/searchable_pdf.py +0 -0
  133. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/extraction/manager.py +0 -0
  134. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/extraction/mixin.py +0 -0
  135. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/extraction/result.py +0 -0
  136. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/flows/__init__.py +0 -0
  137. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/flows/collections.py +0 -0
  138. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/flows/element.py +0 -0
  139. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/flows/flow.py +0 -0
  140. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/flows/region.py +0 -0
  141. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/__init__.py +0 -0
  142. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/engine.py +0 -0
  143. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/engine_doctr.py +0 -0
  144. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/engine_easyocr.py +0 -0
  145. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/engine_paddle.py +0 -0
  146. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/engine_surya.py +0 -0
  147. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/ocr_factory.py +0 -0
  148. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/ocr_manager.py +0 -0
  149. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/ocr_options.py +0 -0
  150. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/ocr/utils.py +0 -0
  151. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/qa/__init__.py +0 -0
  152. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/qa/document_qa.py +0 -0
  153. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/qa/qa_result.py +0 -0
  154. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/__init__.py +0 -0
  155. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/lancedb_search_service.py +0 -0
  156. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/numpy_search_service.py +0 -0
  157. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/search_options.py +0 -0
  158. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/search_service_protocol.py +0 -0
  159. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/search/searchable_mixin.py +0 -0
  160. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/selectors/__init__.py +0 -0
  161. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/tables/__init__.py +0 -0
  162. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/tables/result.py +0 -0
  163. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/__init__.py +0 -0
  164. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  165. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/spa/css/style.css +0 -0
  166. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/spa/index.html +0 -0
  167. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/spa/js/app.js +0 -0
  168. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/templates/spa/words.txt +0 -0
  169. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/text_mixin.py +0 -0
  170. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/__init__.py +0 -0
  171. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/bidi_mirror.py +0 -0
  172. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/color_utils.py +0 -0
  173. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/debug.py +0 -0
  174. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/highlighting.py +0 -0
  175. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/identifiers.py +0 -0
  176. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/layout.py +0 -0
  177. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/locks.py +0 -0
  178. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/packaging.py +0 -0
  179. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/reading_order.py +0 -0
  180. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/text_extraction.py +0 -0
  181. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/utils/visualization.py +0 -0
  182. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/vision/__init__.py +0 -0
  183. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/vision/mixin.py +0 -0
  184. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/vision/results.py +0 -0
  185. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/vision/similarity.py +0 -0
  186. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/vision/template_matching.py +0 -0
  187. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/widgets/__init__.py +0 -0
  188. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf/widgets/viewer.py +0 -0
  189. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf.egg-info/SOURCES.txt +0 -0
  190. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf.egg-info/dependency_links.txt +0 -0
  191. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf.egg-info/entry_points.txt +0 -0
  192. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf.egg-info/requires.txt +0 -0
  193. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/natural_pdf.egg-info/top_level.txt +0 -0
  194. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/noxfile.py +0 -0
  195. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/memory_comparison.py +0 -0
  196. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/pdf_analyzer.py +0 -0
  197. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/performance_analysis.py +0 -0
  198. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  199. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  200. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  201. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  202. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/test_cleanup_methods.py +0 -0
  203. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/optimization/test_memory_fix.py +0 -0
  204. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/publish.sh +0 -0
  205. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/pyproject.toml +0 -0
  206. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/sample-screen.png +0 -0
  207. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/setup.cfg +0 -0
  208. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/fix_page_exclusions.py +0 -0
  209. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_draw_guides.py +0 -0
  210. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_draw_guides_interactive.py +0 -0
  211. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_exclusion_with_debug.py +0 -0
  212. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_find_exclusions_fix.py +0 -0
  213. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
  214. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_fix_real_pdf.py +0 -0
  215. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_fix_working.py +0 -0
  216. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_fixed_pdf_exclusions.py +0 -0
  217. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_guide_draw_notebook.py +0 -0
  218. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_horizontal_top_bottom.py +0 -0
  219. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_inline_js.py +0 -0
  220. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_marker_order.py +0 -0
  221. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_original_exclusions_now_work.py +0 -0
  222. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_pdf_exclusions_with_guides.py +0 -0
  223. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_region_exclusions_detailed.py +0 -0
  224. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_stripes_real_pdf.py +0 -0
  225. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_vertical_stripes.py +0 -0
  226. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_widget_functionality.py +0 -0
  227. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/temp/test_widget_simple.py +0 -0
  228. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/conftest.py +0 -0
  229. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/exporters/test_paddleocr_exporter.py +0 -0
  230. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_annotate.py +0 -0
  231. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_arabic_performance.py +0 -0
  232. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_arabic_real_world.py +0 -0
  233. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_color_conversion.py +0 -0
  234. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_color_hex_display.py +0 -0
  235. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_containment_geometry.py +0 -0
  236. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_elements.py +0 -0
  237. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_loading.py +0 -0
  238. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_spatial.py +0 -0
  239. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_text_extraction.py +0 -0
  240. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_core/test_text_layer.py +0 -0
  241. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_crop_enhancements.py +0 -0
  242. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_crop_region_highlights.py +0 -0
  243. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_directional_defaults.py +0 -0
  244. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve.py +0 -0
  245. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve_cross_page_bug.py +0 -0
  246. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve_debug_issue.py +0 -0
  247. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve_real_world_issue.py +0 -0
  248. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve_single_elements.py +0 -0
  249. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  250. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_document_qa.py +0 -0
  251. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_addition.py +0 -0
  252. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_collection_guides.py +0 -0
  253. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_collection_show_cols.py +0 -0
  254. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_collection_slicing.py +0 -0
  255. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_exclusions.py +0 -0
  256. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_element_show_crop_highlights.py +0 -0
  257. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_empty_pseudo_class.py +0 -0
  258. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_exclusions.py +0 -0
  259. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_expand.py +0 -0
  260. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_extraction_error.py +0 -0
  261. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_extraction_mixin_fix.py +0 -0
  262. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_extraction_text_and_vision.py +0 -0
  263. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_extraction_working.py +0 -0
  264. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_find_similar.py +0 -0
  265. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_first_last_selectors.py +0 -0
  266. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_fix_get_sections_zero_height.py +0 -0
  267. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_flow_region_directional.py +0 -0
  268. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_get_sections_fix_comprehensive.py +0 -0
  269. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_get_sections_zero_height.py +0 -0
  270. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_groupby.py +0 -0
  271. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides.py +0 -0
  272. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_apply_exclusions.py +0 -0
  273. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_apply_exclusions_simple.py +0 -0
  274. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_extract_table.py +0 -0
  275. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_extract_table_collections.py +0 -0
  276. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_extract_table_exclusions.py +0 -0
  277. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_extract_table_real.py +0 -0
  278. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_from_stripes.py +0 -0
  279. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_integration.py +0 -0
  280. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_guides_marker_sorting.py +0 -0
  281. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_detection.py +0 -0
  282. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_detection_comprehensive.py +0 -0
  283. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_offset.py +0 -0
  284. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_protocol.py +0 -0
  285. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_protocol_simple.py +0 -0
  286. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_highlight_regions.py +0 -0
  287. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_horizontal_guides_alignment.py +0 -0
  288. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_comprehensive.py +0 -0
  289. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_debug.py +0 -0
  290. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_final.py +0 -0
  291. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_final_verification.py +0 -0
  292. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_fix.py +0 -0
  293. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_mock.py +0 -0
  294. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_simple.py +0 -0
  295. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_types_pdf.py +0 -0
  296. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_verification.py +0 -0
  297. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_include_boundaries_with_real_text.py +0 -0
  298. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_loading_original.py +0 -0
  299. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_match_results_sorting.py +0 -0
  300. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_merge_connected.py +0 -0
  301. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_merge_connected_real_world.py +0 -0
  302. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_merge_method.py +0 -0
  303. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_multi_page_table_discovery.py +0 -0
  304. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_negative_bounds_pdf.py +0 -0
  305. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_optional_deps.py +0 -0
  306. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_page_exclusion_lists.py +0 -0
  307. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  308. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
  309. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_phash_masking.py +0 -0
  310. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_region_find_similar.py +0 -0
  311. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_region_show_crop_highlights.py +0 -0
  312. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_region_viewer.py +0 -0
  313. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_sections_end_only.py +0 -0
  314. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_sections_with_start_and_end.py +0 -0
  315. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_show_column_layout.py +0 -0
  316. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_show_edge_cases.py +0 -0
  317. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_show_exclusions.py +0 -0
  318. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_show_exclusions_feature.py +0 -0
  319. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_show_limit.py +0 -0
  320. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_skip_repeating_headers_multipage.py +0 -0
  321. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_slice_cache_reuse.py +0 -0
  322. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_slice_exclusion_fix.py +0 -0
  323. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_slice_exclusion_issue.py +0 -0
  324. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_slice_exclusion_mock.py +0 -0
  325. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_sliced_collection_exclusions.py +0 -0
  326. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_spatial_offset.py +0 -0
  327. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_strikethrough_detection.py +0 -0
  328. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_table_result_header_mismatch.py +0 -0
  329. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_table_result_keep_blank.py +0 -0
  330. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_template_matching.py +0 -0
  331. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_template_white_masking.py +0 -0
  332. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_tiny_text_tables.py +0 -0
  333. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_tiny_text_tables_table.py +0 -0
  334. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_tutorials.py +0 -0
  335. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_underline_detection.py +0 -0
  336. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tests/test_update_text.py +0 -0
  337. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/todo/bad_pdf_analysis.md +0 -0
  338. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/todo/evaluation.md +0 -0
  339. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  340. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  341. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  342. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/README.md +0 -0
  343. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/__init__.py +0 -0
  344. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/analyser.py +0 -0
  345. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  346. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  347. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/eval_suite.py +0 -0
  348. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  349. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  350. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  351. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  352. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  353. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/reporter.py +0 -0
  354. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/tools/bad_pdf_eval/utils.py +0 -0
  355. {natural_pdf-0.2.13 → natural_pdf-0.2.15}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.13
3
+ Version: 0.2.15
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -106,7 +106,7 @@ class DirectionalMixin:
106
106
  include_source: bool = False,
107
107
  until: Optional[str] = None,
108
108
  include_endpoint: bool = True,
109
- offset: float = 0.1,
109
+ offset: float = 0.0,
110
110
  **kwargs,
111
111
  ) -> "Region":
112
112
  """
@@ -1209,6 +1209,9 @@ class Element(
1209
1209
 
1210
1210
  return self
1211
1211
 
1212
+ def exclude(self):
1213
+ self.page.add_exclusion(self)
1214
+
1212
1215
  def _get_render_specs(
1213
1216
  self,
1214
1217
  mode: Literal["show", "render"] = "show",
@@ -888,6 +888,9 @@ class ElementCollection(
888
888
  self._elements.sort(key=key, reverse=reverse)
889
889
  return self
890
890
 
891
+ def exclude(self):
892
+ self.page.add_exclusion(self)
893
+
891
894
  def highlight(
892
895
  self,
893
896
  label: Optional[str] = None,
@@ -1902,13 +1905,87 @@ class ElementCollection(
1902
1905
 
1903
1906
  return ElementCollection(all_found_elements)
1904
1907
 
1905
- def extract_each_text(self, **kwargs) -> List[str]:
1906
- """
1907
- Extract text from each element in this region.
1908
+ def extract_each_text(
1909
+ self,
1910
+ order: Optional[Union[str, Callable[[T], Any]]] = None,
1911
+ *,
1912
+ newlines: bool = True,
1913
+ **kwargs,
1914
+ ) -> List[str]:
1915
+ """Return a list with the extracted text for every element.
1916
+
1917
+ Parameters
1918
+ ----------
1919
+ order
1920
+ Controls the ordering of elements **before** extraction:
1921
+
1922
+ * ``None`` (default) – keep the collection's current order.
1923
+ * ``callable`` – a function that will be used as ``key`` for :py:func:`sorted`.
1924
+ * ``"ltr"`` – left-to-right ordering (x0, then y-top).
1925
+ * ``"rtl"`` – right-to-left ordering (−x0, then y-top).
1926
+ * ``"natural"`` – natural reading order (y-top, then x0).
1927
+
1928
+ Remaining keyword arguments are forwarded to each element's
1929
+ :py:meth:`extract_text` method.
1908
1930
  """
1909
- return self.apply(
1910
- lambda element: element.extract_text(**kwargs) if element is not None else None
1911
- )
1931
+
1932
+ # -- Determine ordering --------------------------------------------------
1933
+ elements: List[T] = list(self._elements) # make a shallow copy we can sort
1934
+
1935
+ if order is not None and len(elements) > 1:
1936
+ try:
1937
+ if callable(order):
1938
+ elements.sort(key=order)
1939
+ elif isinstance(order, str):
1940
+ preset = order.lower()
1941
+ if preset in {"ltr", "left-to-right"}:
1942
+ elements.sort(
1943
+ key=lambda el: (
1944
+ (
1945
+ getattr(el, "page", None).index
1946
+ if hasattr(el, "page") and el.page
1947
+ else 0
1948
+ ),
1949
+ getattr(el, "x0", 0),
1950
+ getattr(el, "top", 0),
1951
+ )
1952
+ )
1953
+ elif preset in {"rtl", "right-to-left"}:
1954
+ elements.sort(
1955
+ key=lambda el: (
1956
+ (
1957
+ getattr(el, "page", None).index
1958
+ if hasattr(el, "page") and el.page
1959
+ else 0
1960
+ ),
1961
+ -getattr(el, "x0", 0),
1962
+ getattr(el, "top", 0),
1963
+ )
1964
+ )
1965
+ elif preset in {"natural", "tdlr", "top-down"}:
1966
+ elements.sort(
1967
+ key=lambda el: (
1968
+ (
1969
+ getattr(el, "page", None).index
1970
+ if hasattr(el, "page") and el.page
1971
+ else 0
1972
+ ),
1973
+ getattr(el, "top", 0),
1974
+ getattr(el, "x0", 0),
1975
+ )
1976
+ )
1977
+ else:
1978
+ # Unknown preset – silently ignore to keep original order
1979
+ pass
1980
+ except Exception:
1981
+ # If anything goes wrong, fall back to original order
1982
+ pass
1983
+
1984
+ # -- Extract ----------------------------------------------------------------
1985
+ return [
1986
+ el.extract_text(newlines=newlines, **kwargs) if el is not None else None # type: ignore[arg-type]
1987
+ for el in elements
1988
+ ]
1912
1989
 
1913
1990
  def correct_ocr(
1914
1991
  self,
@@ -2673,10 +2750,17 @@ class ElementCollection(
2673
2750
  else:
2674
2751
  v_dist = 0 # Vertically overlapping
2675
2752
 
2676
- # Use Chebyshev distance (max of horizontal and vertical)
2677
- # This creates a square proximity zone
2678
- distance = max(h_dist, v_dist)
2753
+ # ------------------------------------------------------------------
2754
+ # Decide connection logic based on vertical_gap parameter
2755
+ # ------------------------------------------------------------------
2756
+ if vertical_gap is not None:
2757
+ # Consider elements connected when they vertically stack within
2758
+ # the allowed gap **and** have some horizontal overlap
2759
+ horizontal_overlap = not (h_dist > 0)
2760
+ return horizontal_overlap and v_dist <= vertical_gap
2679
2761
 
2762
+ # Fallback to legacy Chebyshev distance using ``threshold``
2763
+ distance = max(h_dist, v_dist)
2680
2764
  return distance <= threshold
2681
2765
 
2682
2766
  def _merge_region_group(
@@ -2752,6 +2836,9 @@ class ElementCollection(
2752
2836
  def dissolve(
2753
2837
  self,
2754
2838
  padding: float = 2.0,
2839
+ *,
2840
+ vertical_gap: Optional[float] = None,
2841
+ vertical: Optional[bool] = False,
2755
2842
  geometry: Literal["rect", "polygon"] = "rect",
2756
2843
  group_by: List[str] = None,
2757
2844
  ) -> "ElementCollection":
@@ -2764,8 +2851,19 @@ class ElementCollection(
2764
2851
  bounding boxes.
2765
2852
 
2766
2853
  Args:
2767
- padding: Maximum distance in points between elements to consider
2768
- them connected. Default is 2.0 points.
2854
+ padding: Maximum Chebyshev distance (in any direction) between
2855
+ elements to consider them connected **when ``vertical_gap`` is
2856
+ not provided**. Default 2.0 pt.
2857
+
2858
+ vertical_gap: If given, switches to *stack-aware* dissolve:
2859
+ two elements are connected when their horizontal projections
2860
+ overlap (any amount) **and** the vertical distance between them
2861
+ is ≤ ``vertical_gap``. This lets you combine multi-line labels
2862
+ that share the same column but have blank space between lines.
2863
+
2864
+ vertical: If True, overrides ``vertical_gap`` with infinity so that
2865
+ horizontally overlapping elements merge at any vertical distance.
2866
+
2769
2867
  geometry: Type of geometry to use for merged regions. Currently only
2770
2868
  "rect" (bounding box) is supported. "polygon" will raise
2771
2869
  NotImplementedError.
@@ -2807,6 +2905,9 @@ class ElementCollection(
2807
2905
  if geometry not in ["rect", "polygon"]:
2808
2906
  raise ValueError(f"Invalid geometry type: {geometry}. Must be 'rect' or 'polygon'")
2809
2907
 
2908
+ if vertical:
2909
+ vertical_gap = float("inf")
2910
+
2810
2911
  from natural_pdf.elements.region import Region
2811
2912
 
2812
2913
  # Filter to elements with bbox (all elements that can be dissolved)
@@ -2835,7 +2936,9 @@ class ElementCollection(
2835
2936
  logger.debug(f"Processing group {group_key} with {len(group_elements)} elements")
2836
2937
 
2837
2938
  # Find connected components within this group
2838
- components = self._find_connected_components_elements(group_elements, padding)
2939
+ components = self._find_connected_components_elements(
2940
+ group_elements, padding, vertical_gap
2941
+ )
2839
2942
 
2840
2943
  # Merge each component
2841
2944
  for component_elements in components:
@@ -2894,7 +2997,7 @@ class ElementCollection(
2894
2997
  return groups
2895
2998
 
2896
2999
  def _find_connected_components_elements(
2897
- self, elements: List["Element"], padding: float
3000
+ self, elements: List["Element"], padding: float, vertical_gap: Optional[float] = None
2898
3001
  ) -> List[List["Element"]]:
2899
3002
  """Find connected components among elements using union-find."""
2900
3003
  if not elements:
@@ -2919,7 +3022,7 @@ class ElementCollection(
2919
3022
  # Check all pairs of elements for connectivity
2920
3023
  for i in range(len(elements)):
2921
3024
  for j in range(i + 1, len(elements)):
2922
- if self._are_elements_connected(elements[i], elements[j], padding):
3025
+ if self._are_elements_connected(elements[i], elements[j], padding, vertical_gap):
2923
3026
  union(i, j)
2924
3027
 
2925
3028
  # Group elements by their connected component
@@ -3004,7 +3107,9 @@ class ElementCollection(
3004
3107
 
3005
3108
  return merged_region
3006
3109
 
3007
- def _are_elements_connected(self, elem1: "Element", elem2: "Element", threshold: float) -> bool:
3110
+ def _are_elements_connected(
3111
+ self, elem1: "Element", elem2: "Element", threshold: float, vertical_gap: float | None
3112
+ ) -> bool:
3008
3113
  """Check if two elements are connected (adjacent or overlapping)."""
3009
3114
  # Check if elements are on the same page
3010
3115
  # Handle edge cases where elements might not have a page attribute
@@ -3057,6 +3162,12 @@ class ElementCollection(
3057
3162
  # This creates a square proximity zone
3058
3163
  distance = max(h_dist, v_dist)
3059
3164
 
3165
+ if vertical_gap is not None:
3166
+ # 1. vertical distance ≤ vertical_gap
3167
+ # 2. horizontal ranges overlap OR touch
3168
+ h_overlap = (min(x1_1, x1_2) - max(x0_1, x0_2)) >= 0
3169
+ return h_overlap and v_dist <= vertical_gap
3170
+
3060
3171
  return distance <= threshold
3061
3172
 
3062
3173
  def _copy_element_attributes_to_region(
@@ -3163,3 +3274,30 @@ class ElementCollection(
3163
3274
  return self
3164
3275
 
3165
3276
  # ------------------------------------------------------------------
3277
+
3278
+ # ------------------------------------------------------------------
3279
+ # Public alias: combine
3280
+ # ------------------------------------------------------------------
3281
+ def combine(
3282
+ self,
3283
+ padding: float = 2.0,
3284
+ *,
3285
+ vertical_gap: Optional[float] = None,
3286
+ vertical: Optional[bool] = False,
3287
+ geometry: Literal["rect", "polygon"] = "rect",
3288
+ group_by: List[str] = None,
3289
+ ) -> "ElementCollection":
3290
+ """Alias for :py:meth:`dissolve` – retained for discoverability.
3291
+
3292
+ Many users find the verb *combine* more intuitive than *dissolve* when
3293
+ merging nearby or stacked elements into unified Regions. The parameters
3294
+ are identical; see :py:meth:`dissolve` for full documentation.
3295
+ """
3296
+
3297
+ return self.dissolve(
3298
+ padding=padding,
3299
+ vertical_gap=vertical_gap,
3300
+ vertical=vertical,
3301
+ geometry=geometry,
3302
+ group_by=group_by,
3303
+ )
@@ -88,6 +88,40 @@ class RectangleElement(Element):
88
88
  """Get the stroke width of the rectangle."""
89
89
  return self._obj.get("linewidth", 0)
90
90
 
91
+ @property
92
+ def is_horizontal(self) -> bool:
93
+ """Check if this is a horizontal line based on coordinates."""
94
+ # Calculate absolute difference in coordinates
95
+ dx = abs(self.x1 - self.x0)
96
+ dy = abs(self.top - self.bottom)
97
+
98
+ # Define a tolerance for near-horizontal lines (e.g., 1 point)
99
+ tolerance = 1.0
100
+
101
+ # Horizontal if y-change is within tolerance and x-change is significant
102
+ return dy <= tolerance and dx > tolerance
103
+
104
+ @property
105
+ def is_vertical(self) -> bool:
106
+ """Check if this is a vertical line based on coordinates."""
107
+ # Calculate absolute difference in coordinates
108
+ dx = abs(self.x1 - self.x0)
109
+ dy = abs(self.top - self.bottom)
110
+
111
+ # Define a tolerance for near-vertical lines (e.g., 1 point)
112
+ tolerance = 1.0
113
+
114
+ # Vertical if x-change is within tolerance and y-change is significant
115
+ return dx <= tolerance and dy > tolerance
116
+
117
+ @property
118
+ def orientation(self) -> str:
119
+ """Get the orientation of the line ('horizontal', 'vertical', or 'diagonal')."""
120
+ if self.is_horizontal:
121
+ return "horizontal"
122
+ elif self.is_vertical:
123
+ return "vertical"
124
+
91
125
  def extract_text(self, **kwargs) -> str:
92
126
  """
93
127
  Extract text from inside this rectangle.
@@ -738,6 +738,9 @@ class Region(
738
738
  and self.bottom > element.top
739
739
  )
740
740
 
741
+ def exclude(self):
742
+ self.page.add_exclusion(self)
743
+
741
744
  def highlight(
742
745
  self,
743
746
  label: Optional[str] = None,
@@ -1229,7 +1232,13 @@ class Region(
1229
1232
  return [e for e in page_elements if self._is_element_in_region(e)]
1230
1233
 
1231
1234
  def extract_text(
1232
- self, apply_exclusions=True, debug=False, content_filter=None, **kwargs
1235
+ self,
1236
+ apply_exclusions: bool = True,
1237
+ debug: bool = False,
1238
+ *,
1239
+ newlines: Union[bool, str] = True,
1240
+ content_filter=None,
1241
+ **kwargs,
1233
1242
  ) -> str:
1234
1243
  """
1235
1244
  Extract text from this region, respecting page exclusions and using pdfplumber's
@@ -1238,6 +1247,7 @@ class Region(
1238
1247
  Args:
1239
1248
  apply_exclusions: Whether to apply exclusion regions defined on the parent page.
1240
1249
  debug: Enable verbose debugging output for filtering steps.
1250
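+ newlines: Newline handling. True (default) keeps newlines as-is; False replaces each newline or carriage return with a single space; a string replaces each with that string.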
1241
1251
  content_filter: Optional content filter to exclude specific text patterns. Can be:
1242
1252
  - A regex pattern string (characters matching the pattern are EXCLUDED)
1243
1253
  - A callable that takes text and returns True to KEEP the character
@@ -1311,6 +1321,18 @@ class Region(
1311
1321
  user_kwargs=final_kwargs, # Pass kwargs including content_filter
1312
1322
  )
1313
1323
 
1324
+ # Flexible newline handling (same logic as TextElement)
1325
+ if isinstance(newlines, bool):
1326
+ if newlines is False:
1327
+ replacement = " "
1328
+ else:
1329
+ replacement = None
1330
+ else:
1331
+ replacement = str(newlines)
1332
+
1333
+ if replacement is not None:
1334
+ result = result.replace("\n", replacement).replace("\r", replacement)
1335
+
1314
1336
  logger.debug(f"Region {self.bbox}: extract_text finished, result length: {len(result)}.")
1315
1337
  return result
1316
1338
 
@@ -2,7 +2,7 @@
2
2
  Text element classes for natural-pdf.
3
3
  """
4
4
 
5
- from typing import TYPE_CHECKING, Any, Dict, Optional
5
+ from typing import TYPE_CHECKING, Any, Dict, Optional, Union
6
6
 
7
7
  from natural_pdf.elements.base import Element
8
8
 
@@ -236,7 +236,13 @@ class TextElement(Element):
236
236
  return (0, 0, 0)
237
237
 
238
238
  def extract_text(
239
- self, keep_blank_chars=True, strip: Optional[bool] = True, content_filter=None, **kwargs
239
+ self,
240
+ keep_blank_chars: bool = True,
241
+ strip: Optional[bool] = True,
242
+ *,
243
+ newlines: Union[bool, str] = True,
244
+ content_filter=None,
245
+ **kwargs,
240
246
  ) -> str:
241
247
  """
242
248
  Extract text from this element.
@@ -292,6 +298,18 @@ class TextElement(Element):
292
298
  if strip:
293
299
  result = result.strip()
294
300
 
301
+ # Flexible newline handling
302
+ if isinstance(newlines, bool):
303
+ if newlines is False:
304
+ replacement = " " # single space when False
305
+ else:
306
+ replacement = None # keep as-is when True
307
+ else:
308
+ replacement = str(newlines)
309
+
310
+ if replacement is not None:
311
+ result = result.replace("\n", replacement).replace("\r", replacement)
312
+
295
313
  return result
296
314
 
297
315
  def contains(self, substring: str, case_sensitive: bool = True) -> bool:
@@ -423,7 +423,33 @@ def parse_selector(selector: str) -> Dict[str, Any]:
423
423
  # Check for other pseudo-class blocks `:name` or `:name(...)`
424
424
  pseudo_match = pseudo_pattern.match(selector)
425
425
  if pseudo_match:
426
+ # --- NEW: robustly capture arguments that may contain nested parentheses --- #
426
427
  name, args_str = pseudo_match.groups()
428
+ match_end_idx = pseudo_match.end()
429
+
430
+ # If the args_str contains unmatched opening parens, continue scanning the
431
+ # selector until parentheses are balanced. This allows patterns like
432
+ # :contains((Tre) Ofertu) or complex regex with grouping.
433
+ if args_str is not None and args_str.count("(") > args_str.count(")"):
434
+ balance = args_str.count("(") - args_str.count(")")
435
+ i = match_end_idx
436
+ while i < len(selector) and balance > 0:
437
+ char = selector[i]
438
+ # Append char to args_str as we extend the capture
439
+ args_str += char
440
+ if char == "(":
441
+ balance += 1
442
+ elif char == ")":
443
+ balance -= 1
444
+ i += 1
445
+ # After loop, ensure parentheses are balanced; otherwise raise error
446
+ if balance != 0:
447
+ raise ValueError(
448
+ f"Mismatched parentheses in pseudo-class :{name}(). Full selector: '{original_selector_for_error}'"
449
+ )
450
+ # Update where the selector should be sliced off from
451
+ match_end_idx = i
452
+
427
453
  name = name.lower() # Normalize pseudo-class name
428
454
  processed_args = args_str # Keep as string initially, or None
429
455
 
@@ -436,7 +462,8 @@ def parse_selector(selector: str) -> Dict[str, Any]:
436
462
  # else: args remain None
437
463
 
438
464
  result["pseudo_classes"].append({"name": name, "args": processed_args})
439
- selector = selector[pseudo_match.end() :].strip()
465
+ # IMPORTANT: use match_end_idx (may have been extended)
466
+ selector = selector[match_end_idx:].strip()
440
467
  processed_chunk = True
441
468
  continue
442
469
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.13
3
+ Version: 0.2.15
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes