natural-pdf 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (283)
  1. {natural_pdf-0.2.3/natural_pdf.egg-info → natural_pdf-0.2.4}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page.py +2 -0
  3. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/pdf.py +4 -1
  4. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/pdf_collection.py +131 -4
  5. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/render_spec.py +2 -2
  6. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/base.py +18 -14
  7. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/region.py +10 -8
  8. natural_pdf-0.2.4/natural_pdf/vision/__init__.py +7 -0
  9. natural_pdf-0.2.4/natural_pdf/vision/mixin.py +209 -0
  10. natural_pdf-0.2.4/natural_pdf/vision/results.py +146 -0
  11. natural_pdf-0.2.4/natural_pdf/vision/similarity.py +321 -0
  12. {natural_pdf-0.2.3 → natural_pdf-0.2.4/natural_pdf.egg-info}/PKG-INFO +1 -1
  13. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/SOURCES.txt +7 -0
  14. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/top_level.txt +0 -1
  15. natural_pdf-0.2.4/tests/test_element_show_crop_highlights.py +168 -0
  16. natural_pdf-0.2.4/tests/test_find_similar.py +147 -0
  17. natural_pdf-0.2.4/tests/test_region_show_crop_highlights.py +219 -0
  18. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/analysis_framework.mdc +0 -0
  19. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/coding-style.mdc +0 -0
  20. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  21. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/minimal-comments.mdc +0 -0
  22. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  23. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  24. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/ci.yml +0 -0
  25. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/docs.yml +0 -0
  26. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.github/workflows/nightly-tutorials.yml +0 -0
  27. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.gitignore +0 -0
  28. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/.pre-commit-config.yaml +0 -0
  29. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/01-execute_notebooks.py +0 -0
  30. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/02-run_all_tutorials.sh +0 -0
  31. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/CLAUDE.md +0 -0
  32. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/LICENSE +0 -0
  33. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/MANIFEST.in +0 -0
  34. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/README.md +0 -0
  35. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/audit_packaging.py +0 -0
  36. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/check_run_md.sh +0 -0
  37. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/api/index.md +0 -0
  38. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/favicon.png +0 -0
  39. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/favicon.svg +0 -0
  40. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/javascripts/custom.js +0 -0
  41. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/logo.svg +0 -0
  42. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/sample-screen.png +0 -0
  43. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/social-preview.png +0 -0
  44. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/social-preview.svg +0 -0
  45. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/assets/stylesheets/custom.css +0 -0
  46. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/categorizing-documents/index.md +0 -0
  47. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/data-extraction/index.md +0 -0
  48. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/describe/index.md +0 -0
  49. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/document-qa/index.md +0 -0
  50. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/element-selection/index.md +0 -0
  51. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/extracting-clean-text/index.md +0 -0
  52. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/finetuning/index.md +0 -0
  53. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/index.md +0 -0
  54. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_1.csv +0 -0
  55. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_2.csv +0 -0
  56. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/fix-messy-tables/table_3.csv +0 -0
  57. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/index.md +0 -0
  58. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/installation/index.md +0 -0
  59. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/interactive-widget/index.md +0 -0
  60. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/layout-analysis/index.md +0 -0
  61. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/loops-and-groups/index.md +0 -0
  62. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/ocr/index.md +0 -0
  63. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/pdf-navigation/index.md +0 -0
  64. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  65. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/process-forms-and-invoices/index.md +0 -0
  66. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/quick-reference/index.md +0 -0
  67. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/reflowing-pages/index.md +0 -0
  68. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/regions/index.md +0 -0
  69. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tables/index.md +0 -0
  70. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/text-analysis/index.md +0 -0
  71. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/01-loading-and-extraction.md +0 -0
  72. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/02-finding-elements.md +0 -0
  73. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/03-extracting-blocks.md +0 -0
  74. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/04-table-extraction.md +0 -0
  75. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/05-excluding-content.md +0 -0
  76. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/06-document-qa.md +0 -0
  77. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/07-layout-analysis.md +0 -0
  78. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/07-working-with-regions.md +0 -0
  79. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/08-spatial-navigation.md +0 -0
  80. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/09-section-extraction.md +0 -0
  81. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/10-form-field-extraction.md +0 -0
  82. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  83. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/12-ocr-integration.md +0 -0
  84. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/13-semantic-search.md +0 -0
  85. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/tutorials/14-categorizing-documents.md +0 -0
  86. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/visual-debugging/index.md +0 -0
  87. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/docs/visual-debugging/region.png +0 -0
  88. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/mkdocs.yml +0 -0
  89. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/__init__.py +0 -0
  90. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/__init__.py +0 -0
  91. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/guides.py +0 -0
  92. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/__init__.py +0 -0
  93. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/base.py +0 -0
  94. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/docling.py +0 -0
  95. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/gemini.py +0 -0
  96. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  97. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  98. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  99. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/paddle.py +0 -0
  100. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  101. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/surya.py +0 -0
  102. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  103. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/tatr.py +0 -0
  104. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/layout/yolo.py +0 -0
  105. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  106. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_options.py +0 -0
  107. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/text_structure.py +0 -0
  108. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/analyzers/utils.py +0 -0
  109. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/manager.py +0 -0
  110. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/mixin.py +0 -0
  111. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/classification/results.py +0 -0
  112. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/cli.py +0 -0
  113. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/collections/mixins.py +0 -0
  114. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/__init__.py +0 -0
  115. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/element_manager.py +0 -0
  116. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/highlighting_service.py +0 -0
  117. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page_collection.py +0 -0
  118. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/core/page_groupby.py +0 -0
  119. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/__init__.py +0 -0
  120. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/base.py +0 -0
  121. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/elements.py +0 -0
  122. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/mixin.py +0 -0
  123. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/describe/summary.py +0 -0
  124. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/__init__.py +0 -0
  125. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/element_collection.py +0 -0
  126. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/image.py +0 -0
  127. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/line.py +0 -0
  128. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/rect.py +0 -0
  129. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/elements/text.py +0 -0
  130. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/export/mixin.py +0 -0
  131. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/__init__.py +0 -0
  132. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/base.py +0 -0
  133. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/__init__.py +0 -0
  134. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/pdf.ttf +0 -0
  135. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/data/sRGB.icc +0 -0
  136. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr.py +0 -0
  137. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/hocr_font.py +0 -0
  138. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/original_pdf.py +0 -0
  139. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/paddleocr.py +0 -0
  140. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/exporters/searchable_pdf.py +0 -0
  141. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/manager.py +0 -0
  142. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/mixin.py +0 -0
  143. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/extraction/result.py +0 -0
  144. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/__init__.py +0 -0
  145. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/collections.py +0 -0
  146. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/element.py +0 -0
  147. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/flow.py +0 -0
  148. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/flows/region.py +0 -0
  149. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/__init__.py +0 -0
  150. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine.py +0 -0
  151. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_doctr.py +0 -0
  152. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_easyocr.py +0 -0
  153. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_paddle.py +0 -0
  154. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/engine_surya.py +0 -0
  155. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_factory.py +0 -0
  156. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_manager.py +0 -0
  157. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/ocr_options.py +0 -0
  158. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/ocr/utils.py +0 -0
  159. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/__init__.py +0 -0
  160. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/document_qa.py +0 -0
  161. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/qa/qa_result.py +0 -0
  162. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/__init__.py +0 -0
  163. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/lancedb_search_service.py +0 -0
  164. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/numpy_search_service.py +0 -0
  165. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/search_options.py +0 -0
  166. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/search_service_protocol.py +0 -0
  167. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/search/searchable_mixin.py +0 -0
  168. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/selectors/__init__.py +0 -0
  169. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/selectors/parser.py +0 -0
  170. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/tables/__init__.py +0 -0
  171. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/tables/result.py +0 -0
  172. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/__init__.py +0 -0
  173. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  174. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/css/style.css +0 -0
  175. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/index.html +0 -0
  176. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/js/app.js +0 -0
  177. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/templates/spa/words.txt +0 -0
  178. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/text_mixin.py +0 -0
  179. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/__init__.py +0 -0
  180. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/bidi_mirror.py +0 -0
  181. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/debug.py +0 -0
  182. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/highlighting.py +0 -0
  183. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/identifiers.py +0 -0
  184. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/layout.py +0 -0
  185. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/locks.py +0 -0
  186. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/packaging.py +0 -0
  187. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/reading_order.py +0 -0
  188. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/text_extraction.py +0 -0
  189. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/utils/visualization.py +0 -0
  190. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/widgets/__init__.py +0 -0
  191. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf/widgets/viewer.py +0 -0
  192. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/dependency_links.txt +0 -0
  193. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/entry_points.txt +0 -0
  194. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/natural_pdf.egg-info/requires.txt +0 -0
  195. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/noxfile.py +0 -0
  196. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/memory_comparison.py +0 -0
  197. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/pdf_analyzer.py +0 -0
  198. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_analysis.py +0 -0
  199. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  200. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  201. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  202. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  203. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/test_cleanup_methods.py +0 -0
  204. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/optimization/test_memory_fix.py +0 -0
  205. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/publish.sh +0 -0
  206. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/pyproject.toml +0 -0
  207. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/sample-screen.png +0 -0
  208. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/setup.cfg +0 -0
  209. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/test_install.sh +0 -0
  210. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/conftest.py +0 -0
  211. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/exporters/test_paddleocr_exporter.py +0 -0
  212. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_annotate.py +0 -0
  213. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_arabic_performance.py +0 -0
  214. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_arabic_real_world.py +0 -0
  215. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_color_conversion.py +0 -0
  216. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_containment_geometry.py +0 -0
  217. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_elements.py +0 -0
  218. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_loading.py +0 -0
  219. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_spatial.py +0 -0
  220. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_text_extraction.py +0 -0
  221. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_core/test_text_layer.py +0 -0
  222. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_directional_defaults.py +0 -0
  223. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_document_qa.py +0 -0
  224. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_element_collection_slicing.py +0 -0
  225. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_exclusions.py +0 -0
  226. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_expand.py +0 -0
  227. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_error.py +0 -0
  228. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_mixin_fix.py +0 -0
  229. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_text_and_vision.py +0 -0
  230. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_extraction_working.py +0 -0
  231. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_first_last_selectors.py +0 -0
  232. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_flow_region_directional.py +0 -0
  233. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_groupby.py +0 -0
  234. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides.py +0 -0
  235. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions.py +0 -0
  236. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_apply_exclusions_simple.py +0 -0
  237. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_extract_table.py +0 -0
  238. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_extract_table_real.py +0 -0
  239. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_guides_integration.py +0 -0
  240. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_detection.py +0 -0
  241. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_protocol.py +0 -0
  242. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_protocol_simple.py +0 -0
  243. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_highlight_regions.py +0 -0
  244. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_loading_original.py +0 -0
  245. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_multi_page_table_discovery.py +0 -0
  246. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_optional_deps.py +0 -0
  247. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_page_exclusion_lists.py +0 -0
  248. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  249. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_region_viewer.py +0 -0
  250. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_sections_end_only.py +0 -0
  251. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_column_layout.py +0 -0
  252. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_edge_cases.py +0 -0
  253. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_exclusions.py +0 -0
  254. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_exclusions_feature.py +0 -0
  255. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_show_limit.py +0 -0
  256. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_skip_repeating_headers_multipage.py +0 -0
  257. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_strikethrough_detection.py +0 -0
  258. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_table_result_header_mismatch.py +0 -0
  259. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_table_result_keep_blank.py +0 -0
  260. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tiny_text_tables.py +0 -0
  261. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tiny_text_tables_table.py +0 -0
  262. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_tutorials.py +0 -0
  263. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_underline_detection.py +0 -0
  264. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tests/test_update_text.py +0 -0
  265. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/todo/bad_pdf_analysis.md +0 -0
  266. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/todo/evaluation.md +0 -0
  267. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  268. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  269. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  270. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/README.md +0 -0
  271. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/__init__.py +0 -0
  272. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/analyser.py +0 -0
  273. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  274. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  275. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/eval_suite.py +0 -0
  276. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  277. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  278. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  279. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  280. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  281. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/reporter.py +0 -0
  282. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/tools/bad_pdf_eval/utils.py +0 -0
  283. {natural_pdf-0.2.3 → natural_pdf-0.2.4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -78,6 +78,7 @@ from natural_pdf.utils.locks import pdf_render_lock # Import the lock
78
78
 
79
79
  # # Import new utils
80
80
  from natural_pdf.utils.text_extraction import filter_chars_spatially, generate_text_layout
81
+ from natural_pdf.vision.mixin import VisualSearchMixin
81
82
  from natural_pdf.widgets.viewer import _IPYWIDGETS_AVAILABLE, InteractiveViewerWidget
82
83
 
83
84
  # --- End Classification Imports --- #
@@ -101,6 +102,7 @@ class Page(
101
102
  ExtractionMixin,
102
103
  ShapeDetectionMixin,
103
104
  DescribeMixin,
105
+ VisualSearchMixin,
104
106
  Visualizable,
105
107
  ):
106
108
  """Enhanced Page wrapper built on top of pdfplumber.Page.
@@ -42,6 +42,7 @@ from natural_pdf.ocr import OCRManager, OCROptions
42
42
  from natural_pdf.selectors.parser import parse_selector
43
43
  from natural_pdf.text_mixin import TextMixin
44
44
  from natural_pdf.utils.locks import pdf_render_lock
45
+ from natural_pdf.vision.mixin import VisualSearchMixin
45
46
 
46
47
  if TYPE_CHECKING:
47
48
  from natural_pdf.elements.element_collection import ElementCollection
@@ -252,7 +253,9 @@ class _LazyPageList(Sequence):
252
253
  # --- End Lazy Page List Helper --- #
253
254
 
254
255
 
255
- class PDF(TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, Visualizable):
256
+ class PDF(
257
+ TextMixin, ExtractionMixin, ExportMixin, ClassificationMixin, VisualSearchMixin, Visualizable
258
+ ):
256
259
  """Enhanced PDF wrapper built on top of pdfplumber.
257
260
 
258
261
  This class provides a fluent interface for working with PDF documents,
@@ -40,6 +40,7 @@ logger = logging.getLogger(__name__)
40
40
  from natural_pdf.core.pdf import PDF
41
41
  from natural_pdf.elements.region import Region
42
42
  from natural_pdf.export.mixin import ExportMixin
43
+ from natural_pdf.vision.mixin import VisualSearchMixin
43
44
 
44
45
  # --- Search Imports ---
45
46
  try:
@@ -69,8 +70,8 @@ from natural_pdf.search.searchable_mixin import SearchableMixin # Import the ne
69
70
 
70
71
 
71
72
  class PDFCollection(
72
- SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin
73
- ): # Add ExportMixin and ShapeDetectionMixin
73
+ SearchableMixin, ApplyMixin, ExportMixin, ShapeDetectionMixin, VisualSearchMixin
74
+ ):
74
75
  def __init__(
75
76
  self,
76
77
  source: Union[str, Iterable[Union[str, "PDF"]]],
@@ -258,8 +259,6 @@ class PDFCollection(
258
259
  return iter(self._pdfs)
259
260
 
260
261
  def __repr__(self) -> str:
261
- # Removed search status
262
- return f"<PDFCollection(count={len(self._pdfs)})>"
263
262
  return f"<PDFCollection(count={len(self._pdfs)})>"
264
263
 
265
264
  @property
@@ -267,6 +266,134 @@ class PDFCollection(
267
266
  """Returns the list of PDF objects held by the collection."""
268
267
  return self._pdfs
269
268
 
269
+ def show(self, limit: Optional[int] = 30, per_pdf_limit: Optional[int] = 10, **kwargs):
270
+ """
271
+ Display all PDFs in the collection with labels.
272
+
273
+ Each PDF is shown with its pages in a grid layout (6 columns by default),
274
+ and all PDFs are stacked vertically with labels.
275
+
276
+ Args:
277
+ limit: Maximum total pages to show across all PDFs (default: 30)
278
+ per_pdf_limit: Maximum pages to show per PDF (default: 10)
279
+ **kwargs: Additional arguments passed to each PDF's show() method
280
+ (e.g., columns, exclusions, resolution, etc.)
281
+
282
+ Returns:
283
+ Displayed image in Jupyter or None
284
+ """
285
+ if not self._pdfs:
286
+ print("Empty collection")
287
+ return None
288
+
289
+ # Import here to avoid circular imports
290
+ import numpy as np
291
+ from PIL import Image, ImageDraw, ImageFont
292
+
293
+ # Calculate pages per PDF if total limit is set
294
+ if limit and not per_pdf_limit:
295
+ per_pdf_limit = max(1, limit // len(self._pdfs))
296
+
297
+ # Collect images from each PDF
298
+ all_images = []
299
+ total_pages_shown = 0
300
+
301
+ for pdf in self._pdfs:
302
+ if limit and total_pages_shown >= limit:
303
+ break
304
+
305
+ # Calculate limit for this PDF
306
+ pdf_limit = per_pdf_limit
307
+ if limit:
308
+ remaining = limit - total_pages_shown
309
+ pdf_limit = min(per_pdf_limit or remaining, remaining)
310
+
311
+ # Get PDF identifier
312
+ pdf_name = getattr(pdf, "filename", None) or getattr(pdf, "path", "Unknown")
313
+ if isinstance(pdf_name, Path):
314
+ pdf_name = pdf_name.name
315
+ elif "/" in str(pdf_name):
316
+ pdf_name = str(pdf_name).split("/")[-1]
317
+
318
+ # Render this PDF
319
+ try:
320
+ # Get render specs from the PDF
321
+ render_specs = pdf._get_render_specs(mode="show", max_pages=pdf_limit, **kwargs)
322
+
323
+ if not render_specs:
324
+ continue
325
+
326
+ # Get the highlighter and render without displaying
327
+ highlighter = pdf._get_highlighter()
328
+ pdf_image = highlighter.unified_render(
329
+ specs=render_specs,
330
+ layout="grid" if len(render_specs) > 1 else "single",
331
+ columns=6,
332
+ **kwargs,
333
+ )
334
+
335
+ if pdf_image:
336
+ # Add label above the PDF image
337
+ label_height = 40
338
+ label_bg_color = (240, 240, 240)
339
+ label_text_color = (0, 0, 0)
340
+
341
+ # Create new image with space for label
342
+ width, height = pdf_image.size
343
+ labeled_image = Image.new("RGB", (width, height + label_height), "white")
344
+
345
+ # Draw label background
346
+ draw = ImageDraw.Draw(labeled_image)
347
+ draw.rectangle([0, 0, width, label_height], fill=label_bg_color)
348
+
349
+ # Draw label text
350
+ try:
351
+ # Try to use a nice font if available
352
+ font = ImageFont.truetype("Arial", 20)
353
+ except:
354
+ # Fallback to default font
355
+ font = ImageFont.load_default()
356
+
357
+ label_text = f"{pdf_name} ({len(pdf.pages)} pages)"
358
+ draw.text((10, 10), label_text, fill=label_text_color, font=font)
359
+
360
+ # Paste PDF image below label
361
+ labeled_image.paste(pdf_image, (0, label_height))
362
+
363
+ all_images.append(labeled_image)
364
+ total_pages_shown += min(pdf_limit, len(pdf.pages))
365
+
366
+ except Exception as e:
367
+ logger.warning(f"Failed to render PDF {pdf_name}: {e}")
368
+ continue
369
+
370
+ if not all_images:
371
+ print("No PDFs could be rendered")
372
+ return None
373
+
374
+ # Combine all images vertically
375
+ if len(all_images) == 1:
376
+ combined = all_images[0]
377
+ else:
378
+ # Add spacing between PDFs
379
+ spacing = 20
380
+ total_height = sum(img.height for img in all_images) + spacing * (len(all_images) - 1)
381
+ max_width = max(img.width for img in all_images)
382
+
383
+ combined = Image.new("RGB", (max_width, total_height), "white")
384
+
385
+ y_offset = 0
386
+ for i, img in enumerate(all_images):
387
+ # Center images if they're narrower than max width
388
+ x_offset = (max_width - img.width) // 2
389
+ combined.paste(img, (x_offset, y_offset))
390
+ y_offset += img.height
391
+ if i < len(all_images) - 1:
392
+ y_offset += spacing
393
+
394
+ # Return the combined image (Jupyter will display it automatically)
395
+ return combined
396
+
270
397
  @overload
271
398
  def find_all(
272
399
  self,
@@ -186,7 +186,7 @@ class Visualizable:
186
186
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
187
187
  labels: bool = True,
188
188
  label_format: Optional[str] = None,
189
- highlights: Optional[List[Dict[str, Any]]] = None,
189
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
190
190
  legend_position: str = "right",
191
191
  annotate: Optional[Union[str, List[str]]] = None,
192
192
  # Layout options for multi-page/region
@@ -211,7 +211,7 @@ class Visualizable:
211
211
  color: Default highlight color
212
212
  labels: Whether to show labels for highlights
213
213
  label_format: Format string for labels (e.g., "Element {index}")
214
- highlights: Additional highlight groups to show
214
+ highlights: Additional highlight groups to show, or False to disable all highlights
215
215
  legend_position: Position of legend/colorbar ('right', 'left', 'top', 'bottom')
216
216
  annotate: Attribute name(s) to display on highlights (string or list)
217
217
  layout: How to arrange multiple pages/regions (defaults to 'grid' for multi-page, 'single' for single page)
@@ -1192,7 +1192,7 @@ class Element(
1192
1192
  self,
1193
1193
  mode: Literal["show", "render"] = "show",
1194
1194
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
1195
- highlights: Optional[List[Dict[str, Any]]] = None,
1195
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
1196
1196
  crop: Union[bool, Literal["content"]] = False,
1197
1197
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
1198
1198
  label: Optional[str] = None,
@@ -1203,7 +1203,7 @@ class Element(
1203
1203
  Args:
1204
1204
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
1205
1205
  color: Color for highlighting this element in show mode
1206
- highlights: Additional highlight groups to show
1206
+ highlights: Additional highlight groups to show, or False to disable all highlights
1207
1207
  crop: Whether to crop to element bounds
1208
1208
  crop_bbox: Explicit crop bounds
1209
1209
  label: Optional label for this element
@@ -1225,19 +1225,23 @@ class Element(
1225
1225
  if hasattr(self, "bbox") and self.bbox:
1226
1226
  spec.crop_bbox = self.bbox
1227
1227
 
1228
- # Add highlight in show mode
1229
- if mode == "show":
1230
- # Use provided label or generate one
1231
- element_label = label if label is not None else self.__class__.__name__
1232
-
1233
- spec.add_highlight(
1234
- element=self,
1235
- color=color or "red", # Default red for single element
1236
- label=element_label,
1237
- )
1228
+ # Add highlight in show mode (unless explicitly disabled with highlights=False)
1229
+ if mode == "show" and highlights is not False:
1230
+ # Only highlight this element if:
1231
+ # 1. We're not cropping, OR
1232
+ # 2. We're cropping but color was explicitly specified
1233
+ if not crop or color is not None:
1234
+ # Use provided label or generate one
1235
+ element_label = label if label is not None else self.__class__.__name__
1236
+
1237
+ spec.add_highlight(
1238
+ element=self,
1239
+ color=color or "red", # Default red for single element
1240
+ label=element_label,
1241
+ )
1238
1242
 
1239
- # Add additional highlight groups if provided
1240
- if highlights:
1243
+ # Add additional highlight groups if provided (and highlights is a list)
1244
+ if highlights and isinstance(highlights, list):
1241
1245
  for group in highlights:
1242
1246
  group_elements = group.get("elements", [])
1243
1247
  group_color = group.get("color", color)
@@ -221,7 +221,7 @@ class Region(
221
221
  self,
222
222
  mode: Literal["show", "render"] = "show",
223
223
  color: Optional[Union[str, Tuple[int, int, int]]] = None,
224
- highlights: Optional[List[Dict[str, Any]]] = None,
224
+ highlights: Optional[Union[List[Dict[str, Any]], bool]] = None,
225
225
  crop: Union[bool, Literal["content"]] = True, # Default to True for regions
226
226
  crop_bbox: Optional[Tuple[float, float, float, float]] = None,
227
227
  **kwargs,
@@ -231,7 +231,7 @@ class Region(
231
231
  Args:
232
232
  mode: Rendering mode - 'show' includes highlights, 'render' is clean
233
233
  color: Color for highlighting this region in show mode
234
- highlights: Additional highlight groups to show
234
+ highlights: Additional highlight groups to show, or False to disable all highlights
235
235
  crop: Whether to crop to this region
236
236
  crop_bbox: Explicit crop bounds (overrides region bounds)
237
237
  **kwargs: Additional parameters
@@ -250,10 +250,12 @@ class Region(
250
250
  # Crop to this region's bounds
251
251
  spec.crop_bbox = self.bbox
252
252
 
253
- # Add highlights in show mode
254
- if mode == "show":
255
- # Highlight this region
256
- if color or mode == "show": # Always highlight in show mode
253
+ # Add highlights in show mode (unless explicitly disabled with highlights=False)
254
+ if mode == "show" and highlights is not False:
255
+ # Only highlight this region if:
256
+ # 1. We're not cropping, OR
257
+ # 2. We're cropping but color was explicitly specified
258
+ if not crop or color is not None:
257
259
  spec.add_highlight(
258
260
  bbox=self.bbox,
259
261
  polygon=self.polygon if self.has_polygon else None,
@@ -261,8 +263,8 @@ class Region(
261
263
  label=self.label or self.name or "Region",
262
264
  )
263
265
 
264
- # Add additional highlight groups if provided
265
- if highlights:
266
+ # Add additional highlight groups if provided (and highlights is a list)
267
+ if highlights and isinstance(highlights, list):
266
268
  for group in highlights:
267
269
  elements = group.get("elements", [])
268
270
  group_color = group.get("color", color)
@@ -0,0 +1,7 @@
1
+ """Vision module for visual similarity and pattern matching"""
2
+
3
+ from .mixin import VisualSearchMixin
4
+ from .results import Match, MatchResults
5
+ from .similarity import VisualMatcher, compute_phash
6
+
7
+ __all__ = ["VisualMatcher", "compute_phash", "Match", "MatchResults", "VisualSearchMixin"]
@@ -0,0 +1,209 @@
1
+ """Mixin to add visual similarity search to Page/PDF/PDFCollection"""
2
+
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import numpy as np
6
+ from PIL import Image
7
+ from tqdm.auto import tqdm
8
+
9
+ from .results import Match, MatchResults
10
+ from .similarity import VisualMatcher, compute_phash
11
+
12
+
13
class VisualSearchMixin:
    """Add a ``find_similar`` method to classes that include this mixin.

    The host object must be one of: a ``PDFCollection`` (iterated to reach
    each PDF's ``pages``), an object exposing ``pages``, or a single page
    exposing ``number``.
    """

    def find_similar(
        self,
        examples: Union["Element", "Region", List[Union["Element", "Region"]]],
        using: str = "vision",
        confidence: float = 0.6,
        sizes: Optional[Union[float, Tuple, List]] = (0.8, 1.2),
        resolution: int = 72,
        hash_size: int = 20,
        step_factor: float = 0.1,
        max_per_page: Optional[int] = None,
        show_progress: bool = True,
        **kwargs,
    ) -> "MatchResults":
        """
        Find regions visually similar to the given example(s).

        Args:
            examples: Single element/region or list/tuple of examples to search for
            using: Search method - currently only 'vision' is supported
            confidence: Minimum similarity score (0-1)
            sizes: Size variations to search (interpreted by VisualMatcher). Can be:
                - float: ±percentage (e.g., 0.2 = 80%-120%)
                - tuple(min, max): search range with smart logarithmic steps
                  (default: (0.8, 1.2))
                - tuple(min, max, step): explicit step size
                - list: exact sizes to try (e.g., [0.8, 1.0, 1.2])
            resolution: Resolution for image comparison (DPI) (default: 72)
            hash_size: Size of perceptual hash grid (default: 20)
            step_factor: Step size as fraction of template size (default: 0.1)
            max_per_page: Maximum matches to return per page (None or 0 = no limit)
            show_progress: Show progress bar for multi-page searches (default: True)
            **kwargs: Additional options forwarded to
                ``VisualMatcher.find_matches_in_image``

        Returns:
            MatchResults collection

        Raises:
            NotImplementedError: If ``using`` is anything other than 'vision'.
            TypeError: If this object is not a PDFCollection, has no ``pages``
                and is not a single page.
        """
        if using != "vision":
            raise NotImplementedError(f"using='{using}' not yet supported")

        # Resolve the pages first so an unsupported host fails fast, before
        # any example has been rendered.
        pages_to_search = self._visual_search_pages()

        # Ensure examples is a list (a tuple of examples is accepted as well)
        if not isinstance(examples, (list, tuple)):
            examples = [examples]

        # Initialize matcher with specified hash size
        matcher = VisualMatcher(hash_size=hash_size)

        # Prepare templates: render each example once and hash it
        templates = []
        for example in examples:
            example_image = example.render(resolution=resolution, crop=True)
            template_hash = compute_phash(example_image, hash_size=hash_size)
            templates.append({"image": example_image, "hash": template_hash, "source": example})

        # Nothing to look for: skip rendering every page for no result
        if not templates:
            return MatchResults([])

        # Calculate total operations for progress bar
        total_operations = 0
        if show_progress:
            total_operations = self._count_visual_search_windows(
                pages_to_search,
                templates,
                matcher._get_search_scales(sizes),
                resolution,
                step_factor,
            )

        update_frequency = max(1, total_operations // 1000)  # Update at most 1000 times

        # Create single progress bar for all operations
        progress_bar = None
        if show_progress and total_operations > 0:
            progress_bar = tqdm(
                total=total_operations,
                desc="Searching",
                unit="window",
                miniters=update_frequency,  # Minimum iterations between updates
                mininterval=0.1,  # Minimum time between updates (seconds)
            )

        operations_done = 0
        last_update = 0
        description = None  # Rebound per page/template inside the loops below

        def update_progress():
            # Only invoked when progress_bar exists (see progress_callback below)
            nonlocal operations_done, last_update
            operations_done += 1

            # Only update progress bar every N operations to avoid overwhelming output
            if (
                operations_done - last_update >= update_frequency
                or operations_done == total_operations
            ):
                progress_bar.update(operations_done - last_update)
                last_update = operations_done
                if description is not None:
                    progress_bar.set_description(description)

        page_count = len(pages_to_search)
        template_count = len(templates)

        # Convert page coordinates to image coordinates
        scale = resolution / 72.0  # PDF is 72 DPI

        all_matches = []
        try:
            for page_idx, page in enumerate(pages_to_search):
                # Render the full page once
                page_image = page.render(resolution=resolution)

                page_matches = []

                # Search for each template
                for template_idx, template_data in enumerate(templates):
                    # Label by position in the search list: page.number is the
                    # page's own number and does not line up with page_count
                    # when searching a collection or a subset of pages.
                    if page_count > 1:
                        description = f"Page {page_idx + 1}/{page_count}"
                    elif template_count > 1:
                        description = f"Template {template_idx + 1}/{template_count}"

                    # Find matches in this page - never show internal progress
                    candidates = matcher.find_matches_in_image(
                        template_data["image"],
                        page_image,
                        template_hash=template_data["hash"],
                        confidence_threshold=confidence,
                        sizes=sizes,
                        step_factor=step_factor,
                        show_progress=False,  # We handle progress ourselves
                        progress_callback=update_progress if progress_bar is not None else None,
                        **kwargs,
                    )

                    # Convert image pixels back to PDF points; no vertical flip
                    # is applied, only the uniform DPI scale.
                    for candidate in candidates:
                        img_x0, img_y0, img_x1, img_y1 = candidate.bbox
                        page_matches.append(
                            Match(
                                page=page,
                                bbox=(
                                    img_x0 / scale,
                                    img_y0 / scale,
                                    img_x1 / scale,
                                    img_y1 / scale,
                                ),
                                confidence=candidate.confidence,
                                source_example=template_data["source"],
                            )
                        )

                # Apply max_per_page limit if specified
                if max_per_page and len(page_matches) > max_per_page:
                    # Sort by confidence and take top N
                    page_matches.sort(key=lambda m: m.confidence, reverse=True)
                    page_matches = page_matches[:max_per_page]

                all_matches.extend(page_matches)
        finally:
            # Always release the bar, even if rendering or matching raised
            if progress_bar is not None:
                # The total is only an estimate, so flush whatever the
                # throttled updates have not reported yet.
                pending = operations_done - last_update
                if pending > 0:
                    progress_bar.update(pending)
                progress_bar.close()

        return MatchResults(all_matches)

    def _visual_search_pages(self):
        """Return the pages ``find_similar`` should scan for this host object."""
        if type(self).__name__ == "PDFCollection":
            # PDFCollection needs to iterate through all PDFs
            pages = []
            for pdf in self:
                pages.extend(pdf.pages)
            return pages
        if hasattr(self, "pages"):  # PDF
            return self.pages
        if hasattr(self, "number"):  # Single page
            return [self]
        raise TypeError(f"Cannot search in {type(self)}")

    @staticmethod
    def _count_visual_search_windows(pages, templates, scales, resolution, step_factor):
        """Estimate how many sliding windows a search will evaluate.

        Used only to size the progress bar. NOTE(review): presumably mirrors
        the window grid walked by ``VisualMatcher.find_matches_in_image`` -
        confirm against that implementation.
        """
        total = 0
        for page in pages:
            # Estimate page image size
            page_w = int(page.width * resolution / 72.0)
            page_h = int(page.height * resolution / 72.0)

            for template_data in templates:
                template_w, template_h = template_data["image"].size

                for scale in scales:
                    scaled_w = int(template_w * scale)
                    scaled_h = int(template_h * scale)

                    # A template larger than the page yields no windows
                    if scaled_w > page_w or scaled_h > page_h:
                        continue

                    step_x = max(1, int(scaled_w * step_factor))
                    step_y = max(1, int(scaled_h * step_factor))

                    x_windows = len(range(0, page_w - scaled_w + 1, step_x))
                    y_windows = len(range(0, page_h - scaled_h + 1, step_y))
                    total += x_windows * y_windows
        return total