natural-pdf 0.2.12__tar.gz → 0.2.13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (361) hide show
  1. {natural_pdf-0.2.12/natural_pdf.egg-info → natural_pdf-0.2.13}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/highlighting_service.py +40 -10
  3. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/base.py +15 -1
  4. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/region.py +32 -2
  5. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/vision/__init__.py +1 -2
  6. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/vision/mixin.py +67 -27
  7. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/vision/results.py +49 -5
  8. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/vision/similarity.py +195 -23
  9. natural_pdf-0.2.13/natural_pdf/vision/template_matching.py +209 -0
  10. {natural_pdf-0.2.12 → natural_pdf-0.2.13/natural_pdf.egg-info}/PKG-INFO +1 -1
  11. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf.egg-info/SOURCES.txt +15 -6
  12. natural_pdf-0.2.13/temp/test_draw_guides.py +25 -0
  13. natural_pdf-0.2.13/temp/test_draw_guides_interactive.py +30 -0
  14. natural_pdf-0.2.13/temp/test_guide_draw_notebook.py +47 -0
  15. natural_pdf-0.2.13/temp/test_inline_js.py +22 -0
  16. natural_pdf-0.2.13/temp/test_widget_functionality.py +68 -0
  17. natural_pdf-0.2.13/temp/test_widget_simple.py +41 -0
  18. natural_pdf-0.2.13/tests/test_highlight_offset.py +102 -0
  19. natural_pdf-0.2.13/tests/test_match_results_sorting.py +192 -0
  20. natural_pdf-0.2.13/tests/test_negative_bounds_pdf.py +53 -0
  21. natural_pdf-0.2.13/tests/test_phash_masking.py +175 -0
  22. natural_pdf-0.2.13/tests/test_region_find_similar.py +145 -0
  23. natural_pdf-0.2.13/tests/test_spatial_offset.py +89 -0
  24. natural_pdf-0.2.13/tests/test_template_matching.py +226 -0
  25. natural_pdf-0.2.13/tests/test_template_white_masking.py +178 -0
  26. natural_pdf-0.2.12/temp/debug_cell_extraction.py +0 -42
  27. natural_pdf-0.2.12/temp/debug_exclusion_overlap.py +0 -43
  28. natural_pdf-0.2.12/temp/debug_exclusions_guides.py +0 -67
  29. natural_pdf-0.2.12/temp/debug_extra_guide.py +0 -41
  30. natural_pdf-0.2.12/temp/debug_outer_boundaries.py +0 -46
  31. natural_pdf-0.2.12/temp/debug_st_search.py +0 -33
  32. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/analysis_framework.mdc +0 -0
  33. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/coding-style.mdc +0 -0
  34. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  35. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/minimal-comments.mdc +0 -0
  36. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  37. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  38. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.github/workflows/ci.yml +0 -0
  39. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.github/workflows/docs.yml +0 -0
  40. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.github/workflows/nightly-tutorials.yml +0 -0
  41. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.gitignore +0 -0
  42. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/.pre-commit-config.yaml +0 -0
  43. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/01-execute_notebooks.py +0 -0
  44. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/02-run_all_tutorials.sh +0 -0
  45. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/CLAUDE.md +0 -0
  46. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/LICENSE +0 -0
  47. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/MANIFEST.in +0 -0
  48. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/README.md +0 -0
  49. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/audit_packaging.py +0 -0
  50. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/check_run_md.sh +0 -0
  51. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/api/index.md +0 -0
  52. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/favicon.png +0 -0
  53. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/favicon.svg +0 -0
  54. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/javascripts/custom.js +0 -0
  55. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/logo.svg +0 -0
  56. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/sample-screen.png +0 -0
  57. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/social-preview.png +0 -0
  58. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/social-preview.svg +0 -0
  59. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/assets/stylesheets/custom.css +0 -0
  60. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/categorizing-documents/index.md +0 -0
  61. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/data-extraction/index.md +0 -0
  62. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/describe/index.md +0 -0
  63. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/document-qa/index.md +0 -0
  64. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/element-selection/index.md +0 -0
  65. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/extracting-clean-text/index.md +0 -0
  66. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/finetuning/index.md +0 -0
  67. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/fix-messy-tables/index.md +0 -0
  68. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/fix-messy-tables/table_1.csv +0 -0
  69. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/fix-messy-tables/table_2.csv +0 -0
  70. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/fix-messy-tables/table_3.csv +0 -0
  71. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/index.md +0 -0
  72. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/installation/index.md +0 -0
  73. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/interactive-widget/index.md +0 -0
  74. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/layout-analysis/index.md +0 -0
  75. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/loops-and-groups/index.md +0 -0
  76. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/ocr/index.md +0 -0
  77. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/pdf-navigation/index.md +0 -0
  78. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  79. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/process-forms-and-invoices/index.md +0 -0
  80. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/quick-reference/index.md +0 -0
  81. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/reflowing-pages/index.md +0 -0
  82. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/regions/index.md +0 -0
  83. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tables/index.md +0 -0
  84. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/text-analysis/index.md +0 -0
  85. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/01-loading-and-extraction.md +0 -0
  86. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/02-finding-elements.md +0 -0
  87. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/03-extracting-blocks.md +0 -0
  88. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/04-table-extraction.md +0 -0
  89. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/05-excluding-content.md +0 -0
  90. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/06-document-qa.md +0 -0
  91. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/07-layout-analysis.md +0 -0
  92. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/07-working-with-regions.md +0 -0
  93. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/08-spatial-navigation.md +0 -0
  94. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/09-section-extraction.md +0 -0
  95. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/10-form-field-extraction.md +0 -0
  96. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  97. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/12-ocr-integration.md +0 -0
  98. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/13-semantic-search.md +0 -0
  99. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/tutorials/14-categorizing-documents.md +0 -0
  100. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/visual-debugging/index.md +0 -0
  101. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/docs/visual-debugging/region.png +0 -0
  102. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/mkdocs.yml +0 -0
  103. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/__init__.py +0 -0
  104. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/__init__.py +0 -0
  105. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/guides.py +0 -0
  106. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/__init__.py +0 -0
  107. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/base.py +0 -0
  108. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/docling.py +0 -0
  109. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/gemini.py +0 -0
  110. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  111. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  112. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  113. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/paddle.py +0 -0
  114. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  115. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/surya.py +0 -0
  116. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  117. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/tatr.py +0 -0
  118. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/layout/yolo.py +0 -0
  119. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  120. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/text_options.py +0 -0
  121. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/text_structure.py +0 -0
  122. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/analyzers/utils.py +0 -0
  123. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/classification/manager.py +0 -0
  124. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/classification/mixin.py +0 -0
  125. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/classification/results.py +0 -0
  126. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/cli.py +0 -0
  127. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/collections/mixins.py +0 -0
  128. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/__init__.py +0 -0
  129. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/element_manager.py +0 -0
  130. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/page.py +0 -0
  131. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/page_collection.py +0 -0
  132. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/page_groupby.py +0 -0
  133. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/pdf.py +0 -0
  134. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/pdf_collection.py +0 -0
  135. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/core/render_spec.py +0 -0
  136. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/describe/__init__.py +0 -0
  137. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/describe/base.py +0 -0
  138. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/describe/elements.py +0 -0
  139. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/describe/mixin.py +0 -0
  140. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/describe/summary.py +0 -0
  141. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/__init__.py +0 -0
  142. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/element_collection.py +0 -0
  143. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/image.py +0 -0
  144. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/line.py +0 -0
  145. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/rect.py +0 -0
  146. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/elements/text.py +0 -0
  147. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/export/mixin.py +0 -0
  148. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/__init__.py +0 -0
  149. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/base.py +0 -0
  150. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/data/__init__.py +0 -0
  151. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/data/pdf.ttf +0 -0
  152. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/data/sRGB.icc +0 -0
  153. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/hocr.py +0 -0
  154. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/hocr_font.py +0 -0
  155. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/original_pdf.py +0 -0
  156. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/paddleocr.py +0 -0
  157. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/exporters/searchable_pdf.py +0 -0
  158. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/extraction/manager.py +0 -0
  159. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/extraction/mixin.py +0 -0
  160. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/extraction/result.py +0 -0
  161. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/flows/__init__.py +0 -0
  162. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/flows/collections.py +0 -0
  163. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/flows/element.py +0 -0
  164. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/flows/flow.py +0 -0
  165. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/flows/region.py +0 -0
  166. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/__init__.py +0 -0
  167. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/engine.py +0 -0
  168. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/engine_doctr.py +0 -0
  169. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/engine_easyocr.py +0 -0
  170. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/engine_paddle.py +0 -0
  171. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/engine_surya.py +0 -0
  172. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/ocr_factory.py +0 -0
  173. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/ocr_manager.py +0 -0
  174. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/ocr_options.py +0 -0
  175. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/ocr/utils.py +0 -0
  176. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/qa/__init__.py +0 -0
  177. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/qa/document_qa.py +0 -0
  178. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/qa/qa_result.py +0 -0
  179. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/__init__.py +0 -0
  180. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/lancedb_search_service.py +0 -0
  181. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/numpy_search_service.py +0 -0
  182. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/search_options.py +0 -0
  183. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/search_service_protocol.py +0 -0
  184. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/search/searchable_mixin.py +0 -0
  185. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/selectors/__init__.py +0 -0
  186. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/selectors/parser.py +0 -0
  187. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/tables/__init__.py +0 -0
  188. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/tables/result.py +0 -0
  189. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/__init__.py +0 -0
  190. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  191. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/spa/css/style.css +0 -0
  192. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/spa/index.html +0 -0
  193. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/spa/js/app.js +0 -0
  194. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/templates/spa/words.txt +0 -0
  195. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/text_mixin.py +0 -0
  196. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/__init__.py +0 -0
  197. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/bidi_mirror.py +0 -0
  198. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/color_utils.py +0 -0
  199. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/debug.py +0 -0
  200. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/highlighting.py +0 -0
  201. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/identifiers.py +0 -0
  202. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/layout.py +0 -0
  203. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/locks.py +0 -0
  204. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/packaging.py +0 -0
  205. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/reading_order.py +0 -0
  206. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/text_extraction.py +0 -0
  207. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/utils/visualization.py +0 -0
  208. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/widgets/__init__.py +0 -0
  209. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf/widgets/viewer.py +0 -0
  210. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf.egg-info/dependency_links.txt +0 -0
  211. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf.egg-info/entry_points.txt +0 -0
  212. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf.egg-info/requires.txt +0 -0
  213. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/natural_pdf.egg-info/top_level.txt +0 -0
  214. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/noxfile.py +0 -0
  215. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/memory_comparison.py +0 -0
  216. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/pdf_analyzer.py +0 -0
  217. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/performance_analysis.py +0 -0
  218. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  219. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  220. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  221. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  222. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/test_cleanup_methods.py +0 -0
  223. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/optimization/test_memory_fix.py +0 -0
  224. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/publish.sh +0 -0
  225. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/pyproject.toml +0 -0
  226. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/sample-screen.png +0 -0
  227. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/setup.cfg +0 -0
  228. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/fix_page_exclusions.py +0 -0
  229. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_exclusion_with_debug.py +0 -0
  230. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_find_exclusions_fix.py +0 -0
  231. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
  232. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_fix_real_pdf.py +0 -0
  233. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_fix_working.py +0 -0
  234. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_fixed_pdf_exclusions.py +0 -0
  235. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_horizontal_top_bottom.py +0 -0
  236. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_marker_order.py +0 -0
  237. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_original_exclusions_now_work.py +0 -0
  238. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_pdf_exclusions_with_guides.py +0 -0
  239. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_region_exclusions_detailed.py +0 -0
  240. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_stripes_real_pdf.py +0 -0
  241. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/temp/test_vertical_stripes.py +0 -0
  242. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/conftest.py +0 -0
  243. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/exporters/test_paddleocr_exporter.py +0 -0
  244. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_annotate.py +0 -0
  245. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_arabic_performance.py +0 -0
  246. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_arabic_real_world.py +0 -0
  247. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_color_conversion.py +0 -0
  248. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_color_hex_display.py +0 -0
  249. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_containment_geometry.py +0 -0
  250. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_elements.py +0 -0
  251. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_loading.py +0 -0
  252. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_spatial.py +0 -0
  253. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_text_extraction.py +0 -0
  254. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_core/test_text_layer.py +0 -0
  255. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_crop_enhancements.py +0 -0
  256. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_crop_region_highlights.py +0 -0
  257. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_directional_defaults.py +0 -0
  258. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve.py +0 -0
  259. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve_cross_page_bug.py +0 -0
  260. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve_debug_issue.py +0 -0
  261. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve_real_world_issue.py +0 -0
  262. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve_single_elements.py +0 -0
  263. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  264. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_document_qa.py +0 -0
  265. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_addition.py +0 -0
  266. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_collection_guides.py +0 -0
  267. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_collection_show_cols.py +0 -0
  268. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_collection_slicing.py +0 -0
  269. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_exclusions.py +0 -0
  270. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_element_show_crop_highlights.py +0 -0
  271. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_empty_pseudo_class.py +0 -0
  272. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_exclusions.py +0 -0
  273. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_expand.py +0 -0
  274. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_extraction_error.py +0 -0
  275. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_extraction_mixin_fix.py +0 -0
  276. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_extraction_text_and_vision.py +0 -0
  277. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_extraction_working.py +0 -0
  278. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_find_similar.py +0 -0
  279. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_first_last_selectors.py +0 -0
  280. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_fix_get_sections_zero_height.py +0 -0
  281. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_flow_region_directional.py +0 -0
  282. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_get_sections_fix_comprehensive.py +0 -0
  283. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_get_sections_zero_height.py +0 -0
  284. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_groupby.py +0 -0
  285. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides.py +0 -0
  286. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_apply_exclusions.py +0 -0
  287. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_apply_exclusions_simple.py +0 -0
  288. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_extract_table.py +0 -0
  289. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_extract_table_collections.py +0 -0
  290. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_extract_table_exclusions.py +0 -0
  291. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_extract_table_real.py +0 -0
  292. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_from_stripes.py +0 -0
  293. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_integration.py +0 -0
  294. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_guides_marker_sorting.py +0 -0
  295. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_highlight_detection.py +0 -0
  296. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_highlight_detection_comprehensive.py +0 -0
  297. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_highlight_protocol.py +0 -0
  298. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_highlight_protocol_simple.py +0 -0
  299. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_highlight_regions.py +0 -0
  300. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_horizontal_guides_alignment.py +0 -0
  301. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_comprehensive.py +0 -0
  302. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_debug.py +0 -0
  303. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_final.py +0 -0
  304. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_final_verification.py +0 -0
  305. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_fix.py +0 -0
  306. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_mock.py +0 -0
  307. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_simple.py +0 -0
  308. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_types_pdf.py +0 -0
  309. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_verification.py +0 -0
  310. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_include_boundaries_with_real_text.py +0 -0
  311. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_loading_original.py +0 -0
  312. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_merge_connected.py +0 -0
  313. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_merge_connected_real_world.py +0 -0
  314. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_merge_method.py +0 -0
  315. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_multi_page_table_discovery.py +0 -0
  316. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_optional_deps.py +0 -0
  317. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_page_exclusion_lists.py +0 -0
  318. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  319. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_pdf_exclusions_in_find_methods.py +0 -0
  320. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_region_show_crop_highlights.py +0 -0
  321. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_region_viewer.py +0 -0
  322. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_sections_end_only.py +0 -0
  323. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_sections_with_start_and_end.py +0 -0
  324. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_show_column_layout.py +0 -0
  325. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_show_edge_cases.py +0 -0
  326. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_show_exclusions.py +0 -0
  327. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_show_exclusions_feature.py +0 -0
  328. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_show_limit.py +0 -0
  329. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_skip_repeating_headers_multipage.py +0 -0
  330. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_slice_cache_reuse.py +0 -0
  331. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_slice_exclusion_fix.py +0 -0
  332. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_slice_exclusion_issue.py +0 -0
  333. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_slice_exclusion_mock.py +0 -0
  334. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_sliced_collection_exclusions.py +0 -0
  335. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_strikethrough_detection.py +0 -0
  336. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_table_result_header_mismatch.py +0 -0
  337. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_table_result_keep_blank.py +0 -0
  338. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_tiny_text_tables.py +0 -0
  339. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_tiny_text_tables_table.py +0 -0
  340. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_tutorials.py +0 -0
  341. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_underline_detection.py +0 -0
  342. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tests/test_update_text.py +0 -0
  343. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/todo/bad_pdf_analysis.md +0 -0
  344. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/todo/evaluation.md +0 -0
  345. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  346. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  347. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  348. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/README.md +0 -0
  349. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/__init__.py +0 -0
  350. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/analyser.py +0 -0
  351. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  352. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  353. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/eval_suite.py +0 -0
  354. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  355. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  356. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  357. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  358. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  359. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/reporter.py +0 -0
  360. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/tools/bad_pdf_eval/utils.py +0 -0
  361. {natural_pdf-0.2.12 → natural_pdf-0.2.13}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.12
3
+ Version: 0.2.13
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -92,6 +92,16 @@ class HighlightRenderer:
92
92
 
93
93
  def _draw_highlights(self):
94
94
  """Draws all highlight shapes, borders, vertices, and attributes."""
95
+ # Get the pdfplumber page offset for coordinate translation
96
+ page_offset_x = 0
97
+ page_offset_y = 0
98
+
99
+ if hasattr(self.page, "_page") and hasattr(self.page._page, "bbox"):
100
+ # PDFPlumber page bbox might have negative offsets
101
+ page_offset_x = -self.page._page.bbox[0]
102
+ page_offset_y = -self.page._page.bbox[1]
103
+ logger.debug(f"Applying highlight offset: x={page_offset_x}, y={page_offset_y}")
104
+
95
105
  for highlight in self.highlights:
96
106
  # Create a transparent overlay for this single highlight
97
107
  overlay = Image.new("RGBA", self.base_image.size, (0, 0, 0, 0))
@@ -101,7 +111,11 @@ class HighlightRenderer:
101
111
 
102
112
  if highlight.is_polygon:
103
113
  scaled_polygon = [
104
- (p[0] * self.scale_factor, p[1] * self.scale_factor) for p in highlight.polygon
114
+ (
115
+ (p[0] + page_offset_x) * self.scale_factor,
116
+ (p[1] + page_offset_y) * self.scale_factor,
117
+ )
118
+ for p in highlight.polygon
105
119
  ]
106
120
  # Draw polygon fill and border
107
121
  draw.polygon(
@@ -117,11 +131,16 @@ class HighlightRenderer:
117
131
  else: # Rectangle
118
132
  x0, top, x1, bottom = highlight.bbox
119
133
  x0_s, top_s, x1_s, bottom_s = (
120
- x0 * self.scale_factor,
121
- top * self.scale_factor,
122
- x1 * self.scale_factor,
123
- bottom * self.scale_factor,
134
+ (x0 + page_offset_x) * self.scale_factor,
135
+ (top + page_offset_y) * self.scale_factor,
136
+ (x1 + page_offset_x) * self.scale_factor,
137
+ (bottom + page_offset_y) * self.scale_factor,
124
138
  )
139
+ logger.debug(f"Original bbox: ({x0}, {top}, {x1}, {bottom})")
140
+ logger.debug(
141
+ f"Offset bbox: ({x0 + page_offset_x}, {top + page_offset_y}, {x1 + page_offset_x}, {bottom + page_offset_y})"
142
+ )
143
+ logger.debug(f"Scaled bbox: ({x0_s}, {top_s}, {x1_s}, {bottom_s})")
125
144
  scaled_bbox = [x0_s, top_s, x1_s, bottom_s]
126
145
  # Draw rectangle fill and border
127
146
  draw.rectangle(
@@ -1482,11 +1501,22 @@ class HighlightingService:
1482
1501
  offset_x = crop_offset[0] * scale_factor
1483
1502
  offset_y = crop_offset[1] * scale_factor
1484
1503
 
1504
+ # Add pdfplumber page offset for coordinate translation
1505
+ page_offset_x = 0
1506
+ page_offset_y = 0
1507
+ if hasattr(page, "_page") and hasattr(page._page, "bbox"):
1508
+ # PDFPlumber page bbox might have negative offsets
1509
+ page_offset_x = -page._page.bbox[0]
1510
+ page_offset_y = -page._page.bbox[1]
1511
+
1485
1512
  # Draw the highlight
1486
1513
  if polygon:
1487
1514
  # Scale polygon points and apply offset
1488
1515
  scaled_polygon = [
1489
- (p[0] * scale_factor - offset_x, p[1] * scale_factor - offset_y)
1516
+ (
1517
+ (p[0] + page_offset_x) * scale_factor - offset_x,
1518
+ (p[1] + page_offset_y) * scale_factor - offset_y,
1519
+ )
1490
1520
  for p in polygon
1491
1521
  ]
1492
1522
  draw.polygon(
@@ -1496,10 +1526,10 @@ class HighlightingService:
1496
1526
  # Scale bbox and apply offset
1497
1527
  x0, y0, x1, y1 = bbox
1498
1528
  scaled_bbox = [
1499
- x0 * scale_factor - offset_x,
1500
- y0 * scale_factor - offset_y,
1501
- x1 * scale_factor - offset_x,
1502
- y1 * scale_factor - offset_y,
1529
+ (x0 + page_offset_x) * scale_factor - offset_x,
1530
+ (y0 + page_offset_y) * scale_factor - offset_y,
1531
+ (x1 + page_offset_x) * scale_factor - offset_x,
1532
+ (y1 + page_offset_y) * scale_factor - offset_y,
1503
1533
  ]
1504
1534
  draw.rectangle(
1505
1535
  scaled_bbox, fill=color, outline=(color[0], color[1], color[2], BORDER_ALPHA)
@@ -106,6 +106,7 @@ class DirectionalMixin:
106
106
  include_source: bool = False,
107
107
  until: Optional[str] = None,
108
108
  include_endpoint: bool = True,
109
+ offset: float = 0.1,
109
110
  **kwargs,
110
111
  ) -> "Region":
111
112
  """
@@ -118,6 +119,7 @@ class DirectionalMixin:
118
119
  include_source: Whether to include this element/region's area in the result
119
120
  until: Optional selector string to specify a boundary element
120
121
  include_endpoint: Whether to include the boundary element found by 'until'
122
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
121
123
  **kwargs: Additional parameters for the 'until' selector search
122
124
 
123
125
  Returns:
@@ -127,7 +129,7 @@ class DirectionalMixin:
127
129
 
128
130
  is_horizontal = direction in ("left", "right")
129
131
  is_positive = direction in ("right", "below") # right/below are positive directions
130
- pixel_offset = 1 # Offset for excluding elements/endpoints
132
+ pixel_offset = offset # Use provided offset for excluding elements/endpoints
131
133
 
132
134
  # 1. Determine initial boundaries based on direction and include_source
133
135
  if is_horizontal:
@@ -260,6 +262,7 @@ class DirectionalMixin:
260
262
  include_source: bool = False,
261
263
  until: Optional[str] = None,
262
264
  include_endpoint: bool = True,
265
+ offset: float = 0.1,
263
266
  **kwargs,
264
267
  ) -> "Region":
265
268
  """
@@ -271,6 +274,7 @@ class DirectionalMixin:
271
274
  include_source: Whether to include this element/region in the result (default: False)
272
275
  until: Optional selector string to specify an upper boundary element
273
276
  include_endpoint: Whether to include the boundary element in the region (default: True)
277
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
274
278
  **kwargs: Additional parameters
275
279
 
276
280
  Returns:
@@ -295,6 +299,7 @@ class DirectionalMixin:
295
299
  include_source=include_source,
296
300
  until=until,
297
301
  include_endpoint=include_endpoint,
302
+ offset=offset,
298
303
  **kwargs,
299
304
  )
300
305
 
@@ -305,6 +310,7 @@ class DirectionalMixin:
305
310
  include_source: bool = False,
306
311
  until: Optional[str] = None,
307
312
  include_endpoint: bool = True,
313
+ offset: float = 0.1,
308
314
  **kwargs,
309
315
  ) -> "Region":
310
316
  """
@@ -316,6 +322,7 @@ class DirectionalMixin:
316
322
  include_source: Whether to include this element/region in the result (default: False)
317
323
  until: Optional selector string to specify a lower boundary element
318
324
  include_endpoint: Whether to include the boundary element in the region (default: True)
325
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
319
326
  **kwargs: Additional parameters
320
327
 
321
328
  Returns:
@@ -340,6 +347,7 @@ class DirectionalMixin:
340
347
  include_source=include_source,
341
348
  until=until,
342
349
  include_endpoint=include_endpoint,
350
+ offset=offset,
343
351
  **kwargs,
344
352
  )
345
353
 
@@ -350,6 +358,7 @@ class DirectionalMixin:
350
358
  include_source: bool = False,
351
359
  until: Optional[str] = None,
352
360
  include_endpoint: bool = True,
361
+ offset: float = 0.1,
353
362
  **kwargs,
354
363
  ) -> "Region":
355
364
  """
@@ -361,6 +370,7 @@ class DirectionalMixin:
361
370
  include_source: Whether to include this element/region in the result (default: False)
362
371
  until: Optional selector string to specify a left boundary element
363
372
  include_endpoint: Whether to include the boundary element in the region (default: True)
373
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
364
374
  **kwargs: Additional parameters
365
375
 
366
376
  Returns:
@@ -385,6 +395,7 @@ class DirectionalMixin:
385
395
  include_source=include_source,
386
396
  until=until,
387
397
  include_endpoint=include_endpoint,
398
+ offset=offset,
388
399
  **kwargs,
389
400
  )
390
401
 
@@ -395,6 +406,7 @@ class DirectionalMixin:
395
406
  include_source: bool = False,
396
407
  until: Optional[str] = None,
397
408
  include_endpoint: bool = True,
409
+ offset: float = 0.1,
398
410
  **kwargs,
399
411
  ) -> "Region":
400
412
  """
@@ -406,6 +418,7 @@ class DirectionalMixin:
406
418
  include_source: Whether to include this element/region in the result (default: False)
407
419
  until: Optional selector string to specify a right boundary element
408
420
  include_endpoint: Whether to include the boundary element in the region (default: True)
421
+ offset: Pixel offset when excluding source/endpoint (default: 0.1)
409
422
  **kwargs: Additional parameters
410
423
 
411
424
  Returns:
@@ -430,6 +443,7 @@ class DirectionalMixin:
430
443
  include_source=include_source,
431
444
  until=until,
432
445
  include_endpoint=include_endpoint,
446
+ offset=offset,
433
447
  **kwargs,
434
448
  )
435
449
 
@@ -45,6 +45,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
45
45
 
46
46
  # Import new utils
47
47
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
48
+ from natural_pdf.vision.mixin import VisualSearchMixin
48
49
 
49
50
  # Import viewer widget support
50
51
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
@@ -80,6 +81,7 @@ class Region(
80
81
  ExtractionMixin,
81
82
  ShapeDetectionMixin,
82
83
  DescribeMixin,
84
+ VisualSearchMixin,
83
85
  Visualizable,
84
86
  ):
85
87
  """Represents a rectangular region on a page.
@@ -1692,7 +1694,21 @@ class Region(
1692
1694
  else:
1693
1695
  filtered_page = base_plumber_page
1694
1696
 
1695
- cropped = filtered_page.crop(self.bbox)
1697
+ # Ensure bbox is within pdfplumber page bounds
1698
+ page_bbox = filtered_page.bbox
1699
+ clipped_bbox = (
1700
+ max(self.bbox[0], page_bbox[0]), # x0
1701
+ max(self.bbox[1], page_bbox[1]), # y0
1702
+ min(self.bbox[2], page_bbox[2]), # x1
1703
+ min(self.bbox[3], page_bbox[3]), # y1
1704
+ )
1705
+
1706
+ # Only crop if the clipped bbox is valid (has positive width and height)
1707
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1708
+ cropped = filtered_page.crop(clipped_bbox)
1709
+ else:
1710
+ # If the region is completely outside the page bounds, return empty list
1711
+ return []
1696
1712
 
1697
1713
  # Extract all tables from the cropped area
1698
1714
  tables = cropped.extract_tables(table_settings)
@@ -1786,7 +1802,21 @@ class Region(
1786
1802
  filtered_page = base_plumber_page
1787
1803
 
1788
1804
  # Now crop the (possibly filtered) page to the region bbox
1789
- cropped = filtered_page.crop(self.bbox)
1805
+ # Ensure bbox is within pdfplumber page bounds
1806
+ page_bbox = filtered_page.bbox
1807
+ clipped_bbox = (
1808
+ max(self.bbox[0], page_bbox[0]), # x0
1809
+ max(self.bbox[1], page_bbox[1]), # y0
1810
+ min(self.bbox[2], page_bbox[2]), # x1
1811
+ min(self.bbox[3], page_bbox[3]), # y1
1812
+ )
1813
+
1814
+ # Only crop if the clipped bbox is valid (has positive width and height)
1815
+ if clipped_bbox[2] > clipped_bbox[0] and clipped_bbox[3] > clipped_bbox[1]:
1816
+ cropped = filtered_page.crop(clipped_bbox)
1817
+ else:
1818
+ # If the region is completely outside the page bounds, return empty table
1819
+ return []
1790
1820
 
1791
1821
  # Extract the single largest table from the cropped area
1792
1822
  table = cropped.extract_table(table_settings)
@@ -1,7 +1,6 @@
1
1
  """Vision module for visual similarity and pattern matching"""
2
2
 
3
3
  from .mixin import VisualSearchMixin
4
- from .results import Match, MatchResults
5
4
  from .similarity import VisualMatcher, compute_phash
6
5
 
7
- __all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
6
+ __all__ = ["VisualMatcher", "compute_phash", "VisualSearchMixin"]
@@ -6,9 +6,6 @@ import numpy as np
6
6
  from PIL import Image
7
7
  from tqdm.auto import tqdm
8
8
 
9
- from .results import Match, MatchResults
10
- from .similarity import VisualMatcher, compute_phash
11
-
12
9
 
13
10
  class VisualSearchMixin:
14
11
  """Add find_similar method to classes that include this mixin"""
@@ -21,11 +18,12 @@ class VisualSearchMixin:
21
18
  sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
22
19
  resolution: int = 72,
23
20
  hash_size: int = 20,
24
- step_factor: float = 0.1,
21
+ step: Optional[int] = None,
22
+ method: str = "phash",
25
23
  max_per_page: Optional[int] = None,
26
24
  show_progress: bool = True,
27
25
  **kwargs,
28
- ) -> MatchResults:
26
+ ) -> "MatchResults":
29
27
  """
30
28
  Find regions visually similar to the given example(s).
31
29
 
@@ -35,15 +33,19 @@ class VisualSearchMixin:
35
33
  confidence: Minimum similarity score (0-1)
36
34
  sizes: Size variations to search. Can be:
37
35
  - float: ±percentage (e.g., 0.2 = 80%-120%)
38
- - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.0))
36
+ - tuple(min, max): search range with smart logarithmic steps (default: (0.8, 1.2))
39
37
  - tuple(min, max, step): explicit step size
40
38
  - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
41
39
  resolution: Resolution for image comparison (DPI) (default: 72)
42
- hash_size: Size of perceptual hash grid (default: 12)
43
- step_factor: Step size as fraction of template size (default: 0.1)
40
+ hash_size: Size of perceptual hash grid (default: 20)
41
+ step: Step size in pixels for sliding window
42
+ method: Matching algorithm - "phash" (default) or "template"
44
43
  max_per_page: Maximum matches to return per page
45
44
  show_progress: Show progress bar for multi-page searches (default: True)
46
- **kwargs: Additional options
45
+ **kwargs: Additional options including:
46
+ mask_threshold: For both template and phash methods, pixels >= this value are masked.
47
+ For template matching: pixels are ignored in matching (e.g., 0.95)
48
+ For phash: pixels are replaced with median before hashing (e.g., 0.95)
47
49
 
48
50
  Returns:
49
51
  MatchResults collection
@@ -55,15 +57,25 @@ class VisualSearchMixin:
55
57
  if not isinstance(examples, list):
56
58
  examples = [examples]
57
59
 
60
+ from .similarity import VisualMatcher, compute_phash
61
+
58
62
  # Initialize matcher with specified hash size
59
63
  matcher = VisualMatcher(hash_size=hash_size)
60
64
 
61
65
  # Prepare templates
62
66
  templates = []
67
+ # Extract mask_threshold from kwargs for phash
68
+ mask_threshold = kwargs.get("mask_threshold")
69
+ mask_threshold_255 = (
70
+ int(mask_threshold * 255) if mask_threshold is not None and method == "phash" else None
71
+ )
72
+
63
73
  for example in examples:
64
74
  # Render the example region/element
65
75
  example_image = example.render(resolution=resolution, crop=True)
66
- template_hash = compute_phash(example_image, hash_size=hash_size)
76
+ template_hash = compute_phash(
77
+ example_image, hash_size=hash_size, mask_threshold=mask_threshold_255
78
+ )
67
79
  templates.append({"image": example_image, "hash": template_hash, "source": example})
68
80
 
69
81
  # Get pages to search based on the object type
@@ -76,6 +88,8 @@ class VisualSearchMixin:
76
88
  pages_to_search = self.pages
77
89
  elif hasattr(self, "number"): # Single page
78
90
  pages_to_search = [self]
91
+ elif hasattr(self, "page") and hasattr(self, "bbox"): # Region
92
+ pages_to_search = [self]
79
93
  else:
80
94
  raise TypeError(f"Cannot search in {type(self)}")
81
95
 
@@ -86,10 +100,16 @@ class VisualSearchMixin:
86
100
  scales = matcher._get_search_scales(sizes)
87
101
 
88
102
  # Pre-calculate for all pages and templates
89
- for page in pages_to_search:
90
- # Estimate page image size
91
- page_w = int(page.width * resolution / 72.0)
92
- page_h = int(page.height * resolution / 72.0)
103
+ for search_obj in pages_to_search:
104
+ # Estimate image size based on object type
105
+ if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
106
+ # Region
107
+ page_w = int(search_obj.width * resolution / 72.0)
108
+ page_h = int(search_obj.height * resolution / 72.0)
109
+ else:
110
+ # Page
111
+ page_w = int(search_obj.width * resolution / 72.0)
112
+ page_h = int(search_obj.height * resolution / 72.0)
93
113
 
94
114
  for template_data in templates:
95
115
  template_w, template_h = template_data["image"].size
@@ -99,11 +119,15 @@ class VisualSearchMixin:
99
119
  scaled_h = int(template_h * scale)
100
120
 
101
121
  if scaled_w <= page_w and scaled_h <= page_h:
102
- step_x = max(1, int(scaled_w * step_factor))
103
- step_y = max(1, int(scaled_h * step_factor))
104
-
105
- x_windows = len(range(0, page_w - scaled_w + 1, step_x))
106
- y_windows = len(range(0, page_h - scaled_h + 1, step_y))
122
+ # Determine step size
123
+ if step is not None:
124
+ actual_step = step
125
+ else:
126
+ # Default to 10% of template size
127
+ actual_step = max(1, int(min(scaled_w, scaled_h) * 0.1))
128
+
129
+ x_windows = len(range(0, page_w - scaled_w + 1, actual_step))
130
+ y_windows = len(range(0, page_h - scaled_h + 1, actual_step))
107
131
  total_operations += x_windows * y_windows
108
132
 
109
133
  # Search each page
@@ -124,9 +148,20 @@ class VisualSearchMixin:
124
148
  mininterval=0.1, # Minimum time between updates (seconds)
125
149
  )
126
150
 
127
- for page_idx, page in enumerate(pages_to_search):
128
- # Render the full page once
129
- page_image = page.render(resolution=resolution)
151
+ for page_idx, search_obj in enumerate(pages_to_search):
152
+ # Determine if we're searching in a page or a region
153
+ if hasattr(search_obj, "page") and hasattr(search_obj, "bbox"):
154
+ # This is a Region - render only the region area
155
+ region = search_obj
156
+ page = region.page
157
+ page_image = region.render(resolution=resolution, crop=True)
158
+ # Region offset for coordinate conversion
159
+ region_x0, region_y0 = region.x0, region.top
160
+ else:
161
+ # This is a Page - render the full page
162
+ page = search_obj
163
+ page_image = page.render(resolution=resolution)
164
+ region_x0, region_y0 = 0, 0
130
165
 
131
166
  # Convert page coordinates to image coordinates
132
167
  scale = resolution / 72.0 # PDF is 72 DPI
@@ -168,7 +203,8 @@ class VisualSearchMixin:
168
203
  template_hash=template_hash,
169
204
  confidence_threshold=confidence,
170
205
  sizes=sizes,
171
- step_factor=step_factor,
206
+ step=step,
207
+ method=method,
172
208
  show_progress=False, # We handle progress ourselves
173
209
  progress_callback=update_progress if progress_bar else None,
174
210
  **kwargs,
@@ -180,10 +216,12 @@ class VisualSearchMixin:
180
216
 
181
217
  # Convert from image pixels to PDF points
182
218
  # No flipping needed! PDF coordinates map directly to PIL coordinates
183
- pdf_x0 = img_x0 / scale
184
- pdf_y0 = img_y0 / scale
185
- pdf_x1 = img_x1 / scale
186
- pdf_y1 = img_y1 / scale
219
+ pdf_x0 = img_x0 / scale + region_x0
220
+ pdf_y0 = img_y0 / scale + region_y0
221
+ pdf_x1 = img_x1 / scale + region_x0
222
+ pdf_y1 = img_y1 / scale + region_y0
223
+
224
+ from .results import Match
187
225
 
188
226
  # Create Match object
189
227
  match = Match(
@@ -206,4 +244,6 @@ class VisualSearchMixin:
206
244
  if progress_bar:
207
245
  progress_bar.close()
208
246
 
247
+ from .results import MatchResults
248
+
209
249
  return MatchResults(all_matches)
@@ -2,7 +2,6 @@
2
2
 
3
3
  from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
4
4
 
5
- # Import Region directly as it's a base class
6
5
  from natural_pdf.elements.region import Region
7
6
 
8
7
  if TYPE_CHECKING:
@@ -39,16 +38,41 @@ class Match(Region):
39
38
 
40
39
 
41
40
  class MatchResults:
42
- """Collection of Match objects with transformation methods"""
41
+ """
42
+ Collection of Match objects with transformation methods.
43
+
44
+ Matches are automatically sorted by confidence (highest first), so:
45
+ - matches[0] is the best match
46
+ - Iteration yields matches from best to worst
47
+ - The .top(n) method returns the n best matches
48
+
49
+ Example:
50
+ >>> matches = page.find_similar(logo_region)
51
+ >>> print(f"Found {len(matches)} matches")
52
+ >>>
53
+ >>> # Best match
54
+ >>> best = matches[0]
55
+ >>> print(f"Best match confidence: {best.confidence:.3f}")
56
+ >>>
57
+ >>> # Top 5 matches
58
+ >>> for match in matches.top(5):
59
+ ... print(f"Confidence: {match.confidence:.3f} at page {match.page.number}")
60
+ >>>
61
+ >>> # All matches above 90% confidence
62
+ >>> high_conf = matches.filter_by_confidence(0.9)
63
+ """
43
64
 
44
65
  def __init__(self, matches: List[Match]):
45
- """Initialize with list of Match objects"""
66
+ """Initialize with list of Match objects, automatically sorted by confidence"""
46
67
  # Import here to avoid circular import
47
68
  from natural_pdf.elements.element_collection import ElementCollection
48
69
 
70
+ # Sort matches by confidence (highest first)
71
+ sorted_matches = sorted(matches, key=lambda m: m.confidence, reverse=True)
72
+
49
73
  # Create a base ElementCollection
50
- self._collection = ElementCollection(matches)
51
- self._matches = matches
74
+ self._collection = ElementCollection(sorted_matches)
75
+ self._matches = sorted_matches
52
76
 
53
77
  def __len__(self):
54
78
  return len(self._matches)
@@ -68,6 +92,26 @@ class MatchResults:
68
92
  """Filter matches by minimum confidence"""
69
93
  return self.filter(lambda m: m.confidence >= min_confidence)
70
94
 
95
+ def top(self, n: int) -> "MatchResults":
96
+ """
97
+ Get the top N matches with highest confidence.
98
+
99
+ Args:
100
+ n: Number of top matches to return
101
+
102
+ Returns:
103
+ New MatchResults with only the top N matches
104
+
105
+ Example:
106
+ >>> matches = page.find_similar(logo)
107
+ >>> best_5 = matches.top(5)
108
+ >>> for match in best_5:
109
+ ... print(f"Confidence: {match.confidence:.3f}")
110
+ """
111
+ # Since matches are already sorted by confidence, just take first n
112
+ top_matches = self._matches[:n]
113
+ return MatchResults(top_matches)
114
+
71
115
  def pages(self):
72
116
  """Get unique pages containing matches"""
73
117
  # Import here to avoid circular import