natural-pdf 0.2.17__tar.gz → 0.2.19__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (419)
  1. natural_pdf-0.2.19/CHECKBOX_DETECTION.md +172 -0
  2. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/CLAUDE.md +1 -0
  3. {natural_pdf-0.2.17/natural_pdf.egg-info → natural_pdf-0.2.19}/PKG-INFO +1 -1
  4. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/element-selection/index.md +45 -0
  5. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/layout-analysis/index.md +59 -0
  6. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/quick-reference/index.md +2 -0
  7. natural_pdf-0.2.19/example_checkbox_usage.py +55 -0
  8. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/__init__.py +8 -0
  9. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/__init__.py +6 -0
  10. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/base.py +265 -0
  11. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/checkbox_analyzer.py +329 -0
  12. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/checkbox_manager.py +166 -0
  13. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/checkbox_options.py +60 -0
  14. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/mixin.py +95 -0
  15. natural_pdf-0.2.19/natural_pdf/analyzers/checkbox/rtdetr.py +201 -0
  16. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/collections/mixins.py +14 -5
  17. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/element_manager.py +5 -1
  18. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/page.py +103 -9
  19. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/page_collection.py +41 -1
  20. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/pdf.py +24 -1
  21. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/describe/base.py +20 -0
  22. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/base.py +152 -10
  23. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/element_collection.py +41 -2
  24. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/region.py +115 -2
  25. natural_pdf-0.2.19/natural_pdf/judge.py +1509 -0
  26. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/selectors/parser.py +42 -1
  27. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/spatial.py +42 -39
  28. {natural_pdf-0.2.17 → natural_pdf-0.2.19/natural_pdf.egg-info}/PKG-INFO +1 -1
  29. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf.egg-info/SOURCES.txt +43 -0
  30. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf.egg-info/top_level.txt +2 -0
  31. natural_pdf-0.2.19/temp/check_model.py +49 -0
  32. natural_pdf-0.2.19/temp/check_pdf_content.py +9 -0
  33. natural_pdf-0.2.19/temp/checkbox_checks.py +590 -0
  34. natural_pdf-0.2.19/temp/checkbox_simple.py +117 -0
  35. natural_pdf-0.2.19/temp/checkbox_ux_ideas.py +400 -0
  36. natural_pdf-0.2.19/temp/context_manager_prototype.py +177 -0
  37. natural_pdf-0.2.19/temp/convert_to_hf.py +60 -0
  38. natural_pdf-0.2.19/temp/demo_text_closest.py +66 -0
  39. natural_pdf-0.2.19/temp/inspect_model.py +43 -0
  40. natural_pdf-0.2.19/temp/rtdetr_dinov2_test.py +49 -0
  41. natural_pdf-0.2.19/temp/test_closest_debug.py +26 -0
  42. natural_pdf-0.2.19/temp/test_closest_debug2.py +22 -0
  43. natural_pdf-0.2.19/temp/test_context_exploration.py +85 -0
  44. natural_pdf-0.2.19/temp/test_durham.py +30 -0
  45. natural_pdf-0.2.19/temp/test_empty_string.py +16 -0
  46. natural_pdf-0.2.19/temp/test_similarity.py +15 -0
  47. natural_pdf-0.2.19/tests/demo_multipage.py +56 -0
  48. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_aggregate_selectors.py +2 -2
  49. natural_pdf-0.2.19/tests/test_closest_substring_sorting.py +136 -0
  50. natural_pdf-0.2.19/tests/test_closest_until.py +119 -0
  51. natural_pdf-0.2.19/tests/test_closest_until_comparison.py +106 -0
  52. natural_pdf-0.2.19/tests/test_closest_until_debug.py +81 -0
  53. natural_pdf-0.2.19/tests/test_closest_until_fix.py +112 -0
  54. natural_pdf-0.2.19/tests/test_closest_until_ordering.py +117 -0
  55. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_exclusions.py +18 -0
  56. natural_pdf-0.2.19/tests/test_exclusion_recursion_fix.py +46 -0
  57. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_expand_enhanced.py +3 -3
  58. natural_pdf-0.2.19/tests/test_extract_text_words.py +116 -0
  59. natural_pdf-0.2.19/tests/test_from_parameter.py +154 -0
  60. natural_pdf-0.2.19/tests/test_from_parameter_example.py +69 -0
  61. natural_pdf-0.2.19/tests/test_from_self_exclusion.py +60 -0
  62. natural_pdf-0.2.19/tests/test_from_simple.py +56 -0
  63. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_pdf_exclusions_in_find_methods.py +13 -5
  64. natural_pdf-0.2.19/tests/test_text_closest_selector.py +179 -0
  65. natural_pdf-0.2.19/tests/test_within_constraint.py +214 -0
  66. natural_pdf-0.2.19/tests/test_words_vs_find_all_text.py +97 -0
  67. natural_pdf-0.2.19/tests/test_words_vs_find_all_text_summary.md +54 -0
  68. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/analysis_framework.mdc +0 -0
  69. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/coding-style.mdc +0 -0
  70. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  71. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/minimal-comments.mdc +0 -0
  72. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  73. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  74. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.github/workflows/ci.yml +0 -0
  75. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.github/workflows/docs.yml +0 -0
  76. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.github/workflows/nightly-tutorials.yml +0 -0
  77. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.gitignore +0 -0
  78. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/.pre-commit-config.yaml +0 -0
  79. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/01-execute_notebooks.py +0 -0
  80. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/02-run_all_tutorials.sh +0 -0
  81. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/LICENSE +0 -0
  82. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/MANIFEST.in +0 -0
  83. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/README.md +0 -0
  84. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/audit_packaging.py +0 -0
  85. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/check_run_md.sh +0 -0
  86. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/api/index.md +0 -0
  87. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/favicon.png +0 -0
  88. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/favicon.svg +0 -0
  89. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/javascripts/custom.js +0 -0
  90. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/logo.svg +0 -0
  91. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/sample-screen.png +0 -0
  92. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/social-preview.png +0 -0
  93. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/social-preview.svg +0 -0
  94. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/assets/stylesheets/custom.css +0 -0
  95. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/categorizing-documents/index.md +0 -0
  96. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/data-extraction/index.md +0 -0
  97. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/describe/index.md +0 -0
  98. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/document-qa/index.md +0 -0
  99. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/extracting-clean-text/index.md +0 -0
  100. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/finetuning/index.md +0 -0
  101. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/fix-messy-tables/index.md +0 -0
  102. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/fix-messy-tables/table_1.csv +0 -0
  103. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/fix-messy-tables/table_2.csv +0 -0
  104. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/fix-messy-tables/table_3.csv +0 -0
  105. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/guide_adjustment_stream.md +0 -0
  106. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/guides_boundary_columns.md +0 -0
  107. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/index.md +0 -0
  108. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/installation/index.md +0 -0
  109. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/interactive-widget/index.md +0 -0
  110. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/loops-and-groups/index.md +0 -0
  111. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/ocr/index.md +0 -0
  112. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/pdf-navigation/index.md +0 -0
  113. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  114. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/process-forms-and-invoices/index.md +0 -0
  115. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/reflowing-pages/index.md +0 -0
  116. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/regions/index.md +0 -0
  117. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tables/index.md +0 -0
  118. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/text-analysis/index.md +0 -0
  119. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/01-loading-and-extraction.md +0 -0
  120. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/02-finding-elements.md +0 -0
  121. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/03-extracting-blocks.md +0 -0
  122. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/04-table-extraction.md +0 -0
  123. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/05-excluding-content.md +0 -0
  124. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/06-document-qa.md +0 -0
  125. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/07-layout-analysis.md +0 -0
  126. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/07-working-with-regions.md +0 -0
  127. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/08-spatial-navigation.md +0 -0
  128. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/09-section-extraction.md +0 -0
  129. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/10-form-field-extraction.md +0 -0
  130. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  131. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/12-ocr-integration.md +0 -0
  132. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/13-semantic-search.md +0 -0
  133. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/tutorials/14-categorizing-documents.md +0 -0
  134. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/visual-debugging/index.md +0 -0
  135. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/docs/visual-debugging/region.png +0 -0
  136. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/mkdocs.yml +0 -0
  137. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/__init__.py +0 -0
  138. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/guides.py +0 -0
  139. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/__init__.py +0 -0
  140. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/base.py +0 -0
  141. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/docling.py +0 -0
  142. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/gemini.py +0 -0
  143. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  144. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  145. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  146. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/paddle.py +0 -0
  147. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  148. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/surya.py +0 -0
  149. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  150. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/tatr.py +0 -0
  151. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/layout/yolo.py +0 -0
  152. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  153. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/text_options.py +0 -0
  154. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/text_structure.py +0 -0
  155. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/analyzers/utils.py +0 -0
  156. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/classification/manager.py +0 -0
  157. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/classification/mixin.py +0 -0
  158. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/classification/results.py +0 -0
  159. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/cli.py +0 -0
  160. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/__init__.py +0 -0
  161. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/highlighting_service.py +0 -0
  162. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/page_groupby.py +0 -0
  163. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/pdf_collection.py +0 -0
  164. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/core/render_spec.py +0 -0
  165. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/describe/__init__.py +0 -0
  166. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/describe/elements.py +0 -0
  167. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/describe/mixin.py +0 -0
  168. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/describe/summary.py +0 -0
  169. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/__init__.py +0 -0
  170. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/image.py +0 -0
  171. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/line.py +0 -0
  172. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/rect.py +0 -0
  173. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/elements/text.py +0 -0
  174. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/export/mixin.py +0 -0
  175. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/__init__.py +0 -0
  176. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/base.py +0 -0
  177. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/data/__init__.py +0 -0
  178. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/data/pdf.ttf +0 -0
  179. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/data/sRGB.icc +0 -0
  180. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/hocr.py +0 -0
  181. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/hocr_font.py +0 -0
  182. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/original_pdf.py +0 -0
  183. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/paddleocr.py +0 -0
  184. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/exporters/searchable_pdf.py +0 -0
  185. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/extraction/manager.py +0 -0
  186. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/extraction/mixin.py +0 -0
  187. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/extraction/result.py +0 -0
  188. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/flows/__init__.py +0 -0
  189. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/flows/collections.py +0 -0
  190. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/flows/element.py +0 -0
  191. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/flows/flow.py +0 -0
  192. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/flows/region.py +0 -0
  193. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/__init__.py +0 -0
  194. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/engine.py +0 -0
  195. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/engine_doctr.py +0 -0
  196. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/engine_easyocr.py +0 -0
  197. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/engine_paddle.py +0 -0
  198. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/engine_surya.py +0 -0
  199. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/ocr_factory.py +0 -0
  200. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/ocr_manager.py +0 -0
  201. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/ocr_options.py +0 -0
  202. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/ocr/utils.py +0 -0
  203. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/qa/__init__.py +0 -0
  204. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/qa/document_qa.py +0 -0
  205. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/qa/qa_result.py +0 -0
  206. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/__init__.py +0 -0
  207. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/lancedb_search_service.py +0 -0
  208. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/numpy_search_service.py +0 -0
  209. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/search_options.py +0 -0
  210. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/search_service_protocol.py +0 -0
  211. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/search/searchable_mixin.py +0 -0
  212. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/selectors/__init__.py +0 -0
  213. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/tables/__init__.py +0 -0
  214. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/tables/result.py +0 -0
  215. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/__init__.py +0 -0
  216. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  217. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/spa/css/style.css +0 -0
  218. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/spa/index.html +0 -0
  219. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/spa/js/app.js +0 -0
  220. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/templates/spa/words.txt +0 -0
  221. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/text_mixin.py +0 -0
  222. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/__init__.py +0 -0
  223. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/bidi_mirror.py +0 -0
  224. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/color_utils.py +0 -0
  225. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/debug.py +0 -0
  226. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/highlighting.py +0 -0
  227. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/identifiers.py +0 -0
  228. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/layout.py +0 -0
  229. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/locks.py +0 -0
  230. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/packaging.py +0 -0
  231. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/pdfminer_patches.py +0 -0
  232. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/reading_order.py +0 -0
  233. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/sections.py +0 -0
  234. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/text_extraction.py +0 -0
  235. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/utils/visualization.py +0 -0
  236. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/vision/__init__.py +0 -0
  237. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/vision/mixin.py +0 -0
  238. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/vision/results.py +0 -0
  239. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/vision/similarity.py +0 -0
  240. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/vision/template_matching.py +0 -0
  241. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/widgets/__init__.py +0 -0
  242. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf/widgets/viewer.py +0 -0
  243. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf.egg-info/dependency_links.txt +0 -0
  244. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf.egg-info/entry_points.txt +0 -0
  245. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/natural_pdf.egg-info/requires.txt +0 -0
  246. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/noxfile.py +0 -0
  247. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/memory_comparison.py +0 -0
  248. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/pdf_analyzer.py +0 -0
  249. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/performance_analysis.py +0 -0
  250. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  251. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  252. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  253. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  254. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/test_cleanup_methods.py +0 -0
  255. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/optimization/test_memory_fix.py +0 -0
  256. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/publish.sh +0 -0
  257. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/pyproject.toml +0 -0
  258. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/sample-screen.png +0 -0
  259. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/setup.cfg +0 -0
  260. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/fix_page_exclusions.py +0 -0
  261. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_draw_guides.py +0 -0
  262. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_draw_guides_interactive.py +0 -0
  263. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_exclusion_with_debug.py +0 -0
  264. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_find_exclusions_fix.py +0 -0
  265. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_find_exclusions_fix_no_recursion.py +0 -0
  266. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_fix_real_pdf.py +0 -0
  267. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_fix_working.py +0 -0
  268. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_fixed_pdf_exclusions.py +0 -0
  269. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_guide_draw_notebook.py +0 -0
  270. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_horizontal_top_bottom.py +0 -0
  271. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_inline_js.py +0 -0
  272. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_marker_order.py +0 -0
  273. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_original_exclusions_now_work.py +0 -0
  274. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_pdf_exclusions_with_guides.py +0 -0
  275. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_region_exclusions_detailed.py +0 -0
  276. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_stripes_real_pdf.py +0 -0
  277. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_vertical_stripes.py +0 -0
  278. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_widget_functionality.py +0 -0
  279. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/temp/test_widget_simple.py +0 -0
  280. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/conftest.py +0 -0
  281. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/exporters/test_paddleocr_exporter.py +0 -0
  282. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_annotate.py +0 -0
  283. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_arabic_performance.py +0 -0
  284. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_arabic_real_world.py +0 -0
  285. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_auto_multipage_option.py +0 -0
  286. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_color_conversion.py +0 -0
  287. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_color_hex_display.py +0 -0
  288. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_containment_geometry.py +0 -0
  289. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_elements.py +0 -0
  290. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_loading.py +0 -0
  291. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_spatial.py +0 -0
  292. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_text_extraction.py +0 -0
  293. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_core/test_text_layer.py +0 -0
  294. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_crop_enhancements.py +0 -0
  295. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_crop_region_highlights.py +0 -0
  296. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_directional_defaults.py +0 -0
  297. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve.py +0 -0
  298. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve_cross_page_bug.py +0 -0
  299. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve_debug_issue.py +0 -0
  300. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve_real_world_issue.py +0 -0
  301. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve_single_elements.py +0 -0
  302. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  303. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_document_qa.py +0 -0
  304. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_addition.py +0 -0
  305. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_collection_guides.py +0 -0
  306. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_collection_show_cols.py +0 -0
  307. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_collection_slicing.py +0 -0
  308. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_element_show_crop_highlights.py +0 -0
  309. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_empty_pseudo_class.py +0 -0
  310. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_exclude_multi_page.py +0 -0
  311. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_exclude_real_pdf.py +0 -0
  312. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_exclusions.py +0 -0
  313. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_expand.py +0 -0
  314. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_extraction_error.py +0 -0
  315. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_extraction_mixin_fix.py +0 -0
  316. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_extraction_text_and_vision.py +0 -0
  317. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_extraction_working.py +0 -0
  318. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_find_similar.py +0 -0
  319. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_first_last_selectors.py +0 -0
  320. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_fix_get_sections_zero_height.py +0 -0
  321. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_flow_region_directional.py +0 -0
  322. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_get_sections_fix_comprehensive.py +0 -0
  323. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_get_sections_zero_height.py +0 -0
  324. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_groupby.py +0 -0
  325. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guide_adjustment_stream.py +0 -0
  326. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides.py +0 -0
  327. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_apply_exclusions.py +0 -0
  328. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_apply_exclusions_simple.py +0 -0
  329. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_boundaries.py +0 -0
  330. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_extract_table.py +0 -0
  331. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_extract_table_collections.py +0 -0
  332. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_extract_table_exclusions.py +0 -0
  333. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_extract_table_real.py +0 -0
  334. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_from_headers.py +0 -0
  335. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_from_stripes.py +0 -0
  336. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_integration.py +0 -0
  337. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_marker_sorting.py +0 -0
  338. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_guides_partial.py +0 -0
  339. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_color_falsy.py +0 -0
  340. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_detection.py +0 -0
  341. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_detection_comprehensive.py +0 -0
  342. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_offset.py +0 -0
  343. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_protocol.py +0 -0
  344. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_protocol_simple.py +0 -0
  345. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_highlight_regions.py +0 -0
  346. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_horizontal_guides_alignment.py +0 -0
  347. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_comprehensive.py +0 -0
  348. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_final.py +0 -0
  349. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_final_verification.py +0 -0
  350. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_fix.py +0 -0
  351. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_mock.py +0 -0
  352. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_simple.py +0 -0
  353. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_types_pdf.py +0 -0
  354. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_verification.py +0 -0
  355. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_include_boundaries_with_real_text.py +0 -0
  356. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_loading_original.py +0 -0
  357. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_match_results_sorting.py +0 -0
  358. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_merge_connected.py +0 -0
  359. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_merge_connected_real_world.py +0 -0
  360. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_merge_method.py +0 -0
  361. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_merged_flowregion_specs.py +0 -0
  362. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_mixed_collection_rendering.py +0 -0
  363. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_multi_page_table_discovery.py +0 -0
  364. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_multipage_directional.py +0 -0
  365. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_negative_bounds_pdf.py +0 -0
  366. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_optional_deps.py +0 -0
  367. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_page_exclusion_lists.py +0 -0
  368. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  369. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_pdfminer_bug_status.py +0 -0
  370. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_pdfminer_color_bug.py +0 -0
  371. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_pdfminer_color_stack_bug.py +0 -0
  372. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_phash_masking.py +0 -0
  373. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_region_find_similar.py +0 -0
  374. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_region_show_crop_highlights.py +0 -0
  375. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_region_viewer.py +0 -0
  376. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_sections_end_only.py +0 -0
  377. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_sections_with_start_and_end.py +0 -0
  378. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_show_column_layout.py +0 -0
  379. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_show_edge_cases.py +0 -0
  380. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_show_exclusions.py +0 -0
  381. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_show_exclusions_feature.py +0 -0
  382. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_show_limit.py +0 -0
  383. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_skip_repeating_headers_multipage.py +0 -0
  384. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_slice_cache_reuse.py +0 -0
  385. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_slice_exclusion_fix.py +0 -0
  386. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_slice_exclusion_issue.py +0 -0
  387. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_slice_exclusion_mock.py +0 -0
  388. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_sliced_collection_exclusions.py +0 -0
  389. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_smart_exclusion.py +0 -0
  390. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_spatial_offset.py +0 -0
  391. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_strikethrough_detection.py +0 -0
  392. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_table_result_header_mismatch.py +0 -0
  393. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_table_result_keep_blank.py +0 -0
  394. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_template_matching.py +0 -0
  395. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_template_white_masking.py +0 -0
  396. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_tiny_text_tables.py +0 -0
  397. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_tiny_text_tables_table.py +0 -0
  398. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_tutorials.py +0 -0
  399. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_underline_detection.py +0 -0
  400. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tests/test_update_text.py +0 -0
  401. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/todo/bad_pdf_analysis.md +0 -0
  402. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/todo/evaluation.md +0 -0
  403. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  404. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  405. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  406. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/README.md +0 -0
  407. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/__init__.py +0 -0
  408. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/analyser.py +0 -0
  409. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  410. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  411. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/eval_suite.py +0 -0
  412. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  413. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  414. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  415. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  416. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  417. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/reporter.py +0 -0
  418. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/tools/bad_pdf_eval/utils.py +0 -0
  419. {natural_pdf-0.2.17 → natural_pdf-0.2.19}/uv.lock +0 -0
@@ -0,0 +1,172 @@
1
+ # Checkbox Detection in Natural PDF
2
+
3
+ Natural PDF now includes built-in checkbox detection using computer vision models. This feature can automatically detect checkboxes in PDF documents and determine whether they are checked or unchecked.
4
+
5
+ ## Quick Start
6
+
7
+ ```python
8
+ import natural_pdf as npdf
9
+
10
+ # Load PDF and detect checkboxes
11
+ pdf = npdf.PDF("form.pdf")
12
+ checkboxes = pdf[0].detect_checkboxes()
13
+
14
+ # Check results
15
+ for cb in checkboxes:
16
+ print(f"Checkbox at {cb.bbox}: {'✓' if cb.is_checked else '✗'}")
17
+ ```
18
+
19
+ ## Features
20
+
21
+ ### 1. Basic Detection
22
+ ```python
23
+ # Detect all checkboxes on a page
24
+ checkboxes = page.detect_checkboxes()
25
+
26
+ # Access checkbox properties
27
+ checkbox = checkboxes[0]
28
+ print(checkbox.is_checked) # True/False
29
+ print(checkbox.checkbox_state) # "checked"/"unchecked"
30
+ print(checkbox.confidence) # Detection confidence (0-1)
31
+ ```
32
+
33
+ ### 2. Using Selectors
34
+ ```python
35
+ # Find checked/unchecked boxes
36
+ checked = page.find_all('checkbox:checked')
37
+ unchecked = page.find_all('checkbox:unchecked')
38
+
39
+ # All checkboxes
40
+ all_checkboxes = page.find_all('checkbox')
41
+
42
+ # By attributes (note: use is_checked, not checked)
43
+ checked = page.find_all('checkbox[is_checked=true]')
44
+ ```
45
+
46
+ ### 3. Limited Detection
47
+ When you know the expected number of checkboxes:
48
+ ```python
49
+ # Get top 10 checkboxes by confidence
50
+ checkboxes = page.detect_checkboxes(limit=10)
51
+ ```
52
+
53
+ ### 4. Multi-level Detection
54
+ ```python
55
+ # Entire PDF
56
+ all_checkboxes = pdf.detect_checkboxes()
57
+
58
+ # Page collection
59
+ pages = pdf[0:5]
60
+ checkboxes = pages.detect_checkboxes()
61
+
62
+ # Within a region
63
+ region = page.find('text:contains("Options")').below()
64
+ checkboxes = region.detect_checkboxes()
65
+ ```
66
+
67
+ ### 5. Visualization
68
+ ```python
69
+ # Show detected checkboxes
70
+ checkboxes.show()
71
+
72
+ # Checkboxes display their state in repr
73
+ print(checkboxes[0])
74
+ # <Region type='checkbox' [checked] bbox=(100, 200, 120, 220)>
75
+ ```
76
+
77
+ ## Advanced Configuration
78
+
79
+ ### Custom Detection Options
80
+ ```python
81
+ from natural_pdf.analyzers.checkbox import CheckboxOptions
82
+
83
+ # Higher confidence threshold (default is 0.02)
84
+ options = CheckboxOptions(confidence=0.5)
85
+ checkboxes = page.detect_checkboxes(options=options)
86
+
87
+ # Different resolution (default is 150 DPI)
88
+ checkboxes = page.detect_checkboxes(resolution=300)
89
+
90
+ # GPU acceleration
91
+ checkboxes = page.detect_checkboxes(device='cuda')
92
+ ```
93
+
94
+ ### Custom Models
95
+ ```python
96
+ # Use a different checkbox detection model
97
+ options = CheckboxOptions(
98
+ model_repo="your-org/your-checkbox-model",
99
+ label_mapping={
100
+ "empty_box": "unchecked",
101
+ "ticked_box": "checked",
102
+ }
103
+ )
104
+ checkboxes = page.detect_checkboxes(options=options)
105
+ ```
106
+
107
+ ### Disable Text Filtering
108
+ ```python
109
+ # If your checkboxes contain text for some reason
110
+ checkboxes = page.detect_checkboxes(reject_with_text=False)
111
+
112
+ # Or with options
113
+ options = CheckboxOptions(reject_with_text=False)
114
+ checkboxes = page.detect_checkboxes(options=options)
115
+ ```
116
+
117
+ ## Implementation Details
118
+
119
+ - **Default Model**: Uses `wendys-llc/rtdetr-v2-r50-chkbx` RT-DETR model
120
+ - **Low Confidence**: Default confidence is 0.02 (very low to catch all checkboxes)
121
+ - **Resolution**: Renders at 150 DPI by default for efficiency
122
+ - **No Overlaps**: Aggressive NMS rejects ANY overlapping detections, keeping only the highest-confidence box of each overlapping group
123
+ - **Text Filtering**: Automatically rejects detections containing text (real checkboxes should be empty)
124
+ - **Architecture**: Follows the same pattern as layout detection for consistency
125
+
126
+ ## Common Use Cases
127
+
128
+ ### Form Processing
129
+ ```python
130
+ # Extract form checkbox states
131
+ form_data = {}
132
+ for cb in page.detect_checkboxes():
133
+ # Find nearby text label
134
+ label = cb.left('text').extract_text() or cb.above('text').extract_text()
135
+ form_data[label] = cb.is_checked
136
+ ```
137
+
138
+ ### Validation
139
+ ```python
140
+ # Ensure all required checkboxes are checked
141
+ required = ["Terms", "Privacy", "Age"]
142
+ checkboxes = page.detect_checkboxes()
143
+
144
+ for req in required:
145
+ cb = page.find(f'text:contains("{req}")').right('checkbox:first')
146
+ if not cb or not cb.is_checked:
147
+ print(f"Warning: {req} not checked!")
148
+ ```
149
+
150
+ ### Batch Processing
151
+ ```python
152
+ # Process multiple forms
153
+ for pdf_path in pdf_files:
154
+ pdf = npdf.PDF(pdf_path)
155
+ results = []
156
+
157
+ for page in pdf.pages:
158
+ checkboxes = page.detect_checkboxes(limit=20)
159
+ checked_count = len([cb for cb in checkboxes if cb.is_checked])
160
+ results.append({
161
+ 'page': page.number,
162
+ 'total': len(checkboxes),
163
+ 'checked': checked_count
164
+ })
165
+ ```
166
+
167
+ ## Troubleshooting
168
+
169
+ 1. **No checkboxes detected**: Try lowering the confidence threshold
170
+ 2. **Too many false positives**: Increase the confidence threshold
171
+ 3. **Missing transformers**: Install with `pip install transformers torch`
172
+ 4. **Selector syntax**: Use `:checked`/`:unchecked` or `[is_checked=true]`
@@ -96,3 +96,4 @@ Natural PDF is a Python library for intelligent PDF document processing that com
96
96
  ### Environment and Tooling
97
97
  - Always use the virtual environment in .venv
98
98
  - Use uv when possible for efficient package management
99
+ - Don't create new PDFs for testing; use `pdfs/01-practice.pdf` instead.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.17
3
+ Version: 0.2.19
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -117,6 +117,7 @@ These are powerful filters that let you find elements based on their content or
117
117
  | Pseudo-Class | Example | What It Finds |
118
118
  |-----------------------|-----------------------------------|---------------|
119
119
  | `:contains('text')` | `text:contains('Report')` | Elements containing specific text |
120
+ | `:closest('text')` | `text:closest('Invoice Date')` | Fuzzy text matching (great for OCR errors) |
120
121
  | `:bold` | `text:bold` | Bold text (detected automatically) |
121
122
  | `:italic` | `text:italic` | Italic text |
122
123
  | `:strike` | `text:strike` | Struck-through text |
@@ -187,6 +188,34 @@ page.find_all('text:contains("INS-\\w+")', regex=True)
187
188
  page.find_all('text:contains("jungle health")', regex=True, case=False)
188
189
  ```
189
190
 
191
+ ### Fuzzy Text Matching for OCR Errors
192
+
193
+ When working with OCR'd PDFs, text recognition isn't always perfect. The `:closest()` pseudo-class helps you find text even when it contains errors:
194
+
195
+ ```python
196
+ # Find "Invoice Date" even if OCR read it as "Invice Date" or "Invoice Dat"
197
+ page.find('text:closest("Invoice Date")')
198
+
199
+ # Specify a similarity threshold (0.0 to 1.0)
200
+ # 0.8 = 80% similar
201
+ page.find_all('text:closest("Date of Review@0.8")')
202
+
203
+ # Default threshold is 0.0 - returns all text sorted by similarity
204
+ # Exact substring matches always come first
205
+ all_sorted = page.find_all('text:closest("Durham")')
206
+ ```
207
+
208
+ The `:closest()` selector is particularly useful for:
209
+ - OCR errors like "rn" read as "m" (Durharn → Durham)
210
+ - Missing punctuation (Date: → Date)
211
+ - Character confusion (l/I, 0/O)
212
+ - Partial matches when you're not sure of the exact text
213
+
214
+ ```python
215
+ # Combine with other selectors for more precision
216
+ page.find('text:closest("Total Amount@0.7")[size>12]')
217
+ ```
218
+
190
219
  ## Working with Groups of Elements
191
220
 
192
221
  `find_all()` returns an `ElementCollection` - like a list, but with PDF-specific superpowers.
@@ -221,6 +250,22 @@ service_headings = headings.filter(lambda heading: 'Service' in heading.extract_
221
250
  headings.extract_text()
222
251
  ```
223
252
 
253
+ ### Applying Functions to Collections
254
+
255
+ The `.apply()` method lets you transform each element in a collection. It preserves the collection type even when results are empty:
256
+
257
+ ```python
258
+ # Apply a function to each element
259
+ uppercase_texts = texts.apply(lambda t: t.extract_text().upper())
260
+
261
+ # Navigate from each element - returns an ElementCollection
262
+ regions_below = headings.apply(lambda h: h.below())
263
+
264
+ # Even empty results maintain the collection type
265
+ empty_collection = page.find_all('nonexistent').apply(lambda x: x.expand(10))
266
+ # Returns ElementCollection([]) not []
267
+ ```
268
+
224
269
  *Note: `.highest()`, `.lowest()`, etc. will complain if your collection spans multiple pages.*
225
270
 
226
271
  ## Finding Elements with Statistical Properties
@@ -207,6 +207,64 @@ regions = page.analyze_layout(engine="gemini", options=options)
207
207
  - The client must be compatible with the OpenAI API (see the `openai` Python package).
208
208
  - This feature is intended for advanced users who need LLM-based layout analysis.
209
209
 
210
+ ## Using Judge for Visual Classification
211
+
212
+ Natural PDF includes a `Judge` class that can learn to classify visual elements like checkboxes. This is particularly useful after layout detection when you need to determine the state of detected elements.
213
+
214
+ ### Example: Checkbox Classification
215
+
216
+ ```python
217
+ from natural_pdf import Judge
218
+
219
+ # Create a judge for checkbox classification
220
+ judge = Judge("form_checkboxes", labels=["checked", "unchecked"])
221
+
222
+ # Train with examples
223
+ checked_region = page.find("text=Acceptable").left(width=20)
224
+ unchecked_region = page.find("text=Deficient").left(width=20)
225
+
226
+ judge.add(checked_region, "checked")
227
+ judge.add(unchecked_region, "unchecked")
228
+
229
+ # Classify new checkboxes
230
+ new_checkbox = page.find("text=At-Risk").left(width=20)
231
+ result = judge.decide(new_checkbox)
232
+ print(f"Checkbox is: {result.label} (confidence: {result.score:.2f})")
233
+
234
+ # Find which checkbox is selected among multiple options
235
+ checkboxes = [
236
+ page.find("text=Option A").left(width=20),
237
+ page.find("text=Option B").left(width=20),
238
+ page.find("text=Option C").left(width=20)
239
+ ]
240
+ selected = judge.pick("checked", checkboxes, labels=["Option A", "Option B", "Option C"])
241
+ print(f"Selected: {selected.label}")
242
+ ```
243
+
244
+ ### Key Features of Judge
245
+
246
+ 1. **Simple Training**: Requires minimal examples (even just one per class)
247
+ 2. **Robust to Imbalance**: Uses Youden's J weights and prior correction
248
+ 3. **Interactive Teaching**: Use `judge.teach()` in Jupyter for labeling
249
+ 4. **Visual Inspection**: Use `judge.inspect()` to see predictions on training data
250
+ 5. **Persistence**: Save/load trained judges with `judge.save()` and `Judge.load()`
251
+
252
+ ### Advanced Usage
253
+
254
+ ```python
255
+ # Adjust prior if you expect more checked boxes
256
+ judge = Judge("checkboxes", labels=["checked", "unchecked"], target_prior=0.7)
257
+
258
+ # Interactive teaching in Jupyter
259
+ judge.teach() # Use arrow keys to label examples
260
+
261
+ # Visual inspection with previews
262
+ judge.inspect(preview=True) # Shows HTML table with images
263
+
264
+ # Count checkboxes by type
265
+ checked_count = judge.count("checked", checkbox_regions)
266
+ ```
267
+
210
268
  ## Next Steps
211
269
 
212
270
  Layout analysis provides regions that you can use for:
@@ -214,3 +272,4 @@ Layout analysis provides regions that you can use for:
214
272
  - [Table Extraction](../tables/index.ipynb): Especially powerful with TATR regions.
215
273
  - [Text Extraction](../text-extraction/index.ipynb): Extract text only from specific region types (e.g., paragraphs).
216
274
  - [Document QA](../document-qa/index.ipynb): Focus question answering on specific detected regions.
275
+ - Visual Classification: Use Judge to classify detected elements (checkboxes, signatures, etc.)
@@ -40,6 +40,8 @@ data = table_region.extract_table()
40
40
  page.find('text:contains("Invoice")') # Contains text
41
41
  page.find('text:contains("total")', case=False) # Case insensitive
42
42
  page.find('text:contains("\\d+")', regex=True) # Regex pattern
43
+ page.find('text:closest("Invoice Date")') # Fuzzy match (OCR errors)
44
+ page.find('text:closest("Total@0.8")') # 80% similarity threshold
43
45
  ```
44
46
 
45
47
  ### Text Formatting
@@ -0,0 +1,55 @@
1
"""Example usage of checkbox detection in Natural PDF."""

import natural_pdf as npdf
from natural_pdf.analyzers.checkbox import CheckboxOptions


def main() -> None:
    """Walk through the checkbox detection API using pdfs/01-practice.pdf.

    Each section prints a heading followed by the result of one way of
    calling the detector: plain detection, selector-based filtering, limited
    detection, whole-PDF detection, visualization and custom options.
    """
    # Load a PDF
    pdf = npdf.PDF("pdfs/01-practice.pdf")
    page = pdf[0]

    # Basic checkbox detection
    print("=== Basic Checkbox Detection ===")
    checkboxes = page.detect_checkboxes()
    print(f"Found {len(checkboxes)} checkboxes")

    # Show what was found (first three only, to keep the output short)
    for i, cb in enumerate(checkboxes[:3]):
        print(f"\nCheckbox {i}:")
        print(f" State: {'Checked' if cb.is_checked else 'Unchecked'}")
        print(f" Confidence: {cb.confidence:.2f}")
        print(f" Position: {cb.bbox}")

    # Using selectors to filter checkboxes
    print("\n=== Using Selectors ===")
    checked = page.find_all("checkbox:checked")
    unchecked = page.find_all("checkbox:unchecked")
    print(f"Checked boxes: {len(checked)}")
    print(f"Unchecked boxes: {len(unchecked)}")

    # Limit detection when you know expected count
    print("\n=== Limited Detection ===")
    # If you know there should be 10 checkboxes on a form
    limited_checkboxes = page.detect_checkboxes(limit=10)
    print(f"Found top {len(limited_checkboxes)} checkboxes by confidence")

    # Multi-page detection
    print("\n=== Multi-page Detection ===")
    all_checkboxes = pdf.detect_checkboxes(show_progress=False)
    print(f"Total checkboxes in PDF: {len(all_checkboxes)}")

    # Visualize checkboxes
    print("\n=== Visualization ===")
    print("Showing detected checkboxes...")
    checkboxes.show()

    # Advanced: Using custom options
    print("\n=== Advanced Options ===")

    # Higher confidence threshold
    options = CheckboxOptions(confidence=0.5)
    high_conf_checkboxes = page.detect_checkboxes(options=options)
    print(f"High confidence checkboxes: {len(high_conf_checkboxes)}")

    # GPU acceleration if available. This is best-effort: on a machine
    # without a usable CUDA device the call is expected to fail, and the
    # example should report that instead of crashing.
    try:
        gpu_checkboxes = page.detect_checkboxes(device="cuda")
    except Exception as exc:  # example boundary: report and carry on
        print(f"GPU detection skipped: {exc}")
    else:
        print(f"GPU-detected checkboxes: {len(gpu_checkboxes)}")


if __name__ == "__main__":
    main()
@@ -66,6 +66,7 @@ class Options:
66
66
  self.layout = ConfigSection(
67
67
  directional_offset=0.01, # Offset in points when using directional methods
68
68
  auto_multipage=False, # Whether directional methods span pages by default
69
+ directional_within=None, # Region to constrain directional operations to
69
70
  )
70
71
 
71
72
 
@@ -126,6 +127,9 @@ from natural_pdf.elements.region import Region
126
127
  from natural_pdf.flows.flow import Flow
127
128
  from natural_pdf.flows.region import FlowRegion
128
129
 
130
+ # Judge for visual classification
131
+ from natural_pdf.judge import Decision, Judge, JudgeError, PickResult
132
+
129
133
  # Search options (if extras installed)
130
134
  try:
131
135
  from natural_pdf.search.search_options import (
@@ -165,6 +169,10 @@ __all__ = [
165
169
  "Flow",
166
170
  "FlowRegion",
167
171
  "Guides",
172
+ "Judge",
173
+ "Decision",
174
+ "PickResult",
175
+ "JudgeError",
168
176
  "TextSearchOptions",
169
177
  "MultiModalSearchOptions",
170
178
  "BaseSearchOptions",
@@ -0,0 +1,6 @@
1
+ """Checkbox detection analyzers for natural-pdf."""
2
+
3
+ from .checkbox_manager import CheckboxManager
4
+ from .checkbox_options import CheckboxOptions, RTDETRCheckboxOptions
5
+
6
+ __all__ = ["CheckboxManager", "CheckboxOptions", "RTDETRCheckboxOptions"]
@@ -0,0 +1,265 @@
1
+ """Base class for checkbox detection engines."""
2
+
3
+ import logging
4
+ from abc import ABC, abstractmethod
5
+ from typing import Any, Dict, List, Set
6
+
7
+ from PIL import Image
8
+
9
+ from .checkbox_options import CheckboxOptions
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class CheckboxDetector(ABC):
    """Abstract base class for checkbox detection engines.

    Defines the interface every checkbox detection engine implements: a
    detector receives an image of a page/region and reports the checkboxes
    found in it together with their state (checked/unchecked).

    Subclasses must implement:
    - detect(): Core checkbox detection for a single image
    - is_available(): Check if engine dependencies are installed
    - _load_model_from_options(): Load and configure the detection model

    Subclasses may override:
    - _get_cache_key(): The default key only covers the class name and the
      device, so engines whose model depends on other options should extend it.

    Attributes:
        logger: Logger instance for the specific detector.
        _model_cache: Dictionary cache for loaded model instances.
    """

    # Substrings that mark a label as "checked" when it has no explicit mapping.
    _CHECKED_TERMS = ("checked", "tick", "filled", "1")
    # Negated spellings. They must be tested first because most of them
    # contain one of the checked terms as a substring ("unchecked" contains
    # "checked", "unticked" contains "tick", "unfilled" contains "filled").
    _UNCHECKED_TERMS = (
        "uncheck",
        "untick",
        "unfilled",
        "not checked",
        "not_checked",
        "not-checked",
    )

    def __init__(self):
        """Initialize the base checkbox detector."""
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        self.logger.info(f"Initializing {self.__class__.__name__}")
        self._model_cache: Dict[str, Any] = {}  # Cache for initialized models

    @abstractmethod
    def detect(self, image: "Image.Image", options: "CheckboxOptions") -> List[Dict[str, Any]]:
        """
        Detect checkboxes in a given PIL Image.

        Args:
            image: PIL Image of the page/region to analyze.
            options: Instance of CheckboxOptions with configuration.

        Returns:
            List of detection dictionaries with:
            - 'bbox': Tuple[float, float, float, float] - (x0, y0, x1, y1) relative to image
            - 'class': str - Original class name from model (e.g., 'checkbox', 'checked_checkbox')
            - 'normalized_class': str - Always 'checkbox'
            - 'is_checked': bool - Whether checkbox is checked
            - 'checkbox_state': str - 'checked' or 'unchecked'
            - 'confidence': float - Confidence score (0.0-1.0)
            - 'model': str - Name of the model used
            - 'source': str - Always 'checkbox'
        """
        raise NotImplementedError("Subclasses must implement this method")

    @classmethod
    @abstractmethod
    def is_available(cls) -> bool:
        """
        Check if the detector's dependencies are installed and usable.

        Returns:
            True if the detector is available, False otherwise.
        """
        raise NotImplementedError("Subclasses must implement this method")

    def _get_cache_key(self, options: "CheckboxOptions") -> str:
        """
        Generate a cache key for model loading based on relevant options.

        Args:
            options: The options dataclass instance.

        Returns:
            A string cache key of the form "<ClassName>_<device>".
        """
        # Base key includes device, subclasses should add model specifics
        device_key = str(options.device).lower()
        return f"{self.__class__.__name__}_{device_key}"

    def _get_model(self, options: "CheckboxOptions") -> Any:
        """
        Get or initialize the underlying model based on options, using caching.

        Args:
            options: The options dataclass instance.

        Returns:
            The model object produced by _load_model_from_options().

        Raises:
            RuntimeError: If is_available() reports missing dependencies.
            Exception: Whatever _load_model_from_options() raises is logged
                and re-raised unchanged.
        """
        cache_key = self._get_cache_key(options)
        if cache_key in self._model_cache:
            self.logger.debug(f"Using cached model for key: {cache_key}")
            return self._model_cache[cache_key]

        self.logger.info(f"Loading model for cache key: {cache_key}")
        try:
            # Ensure dependencies are met before loading
            if not self.is_available():
                raise RuntimeError(f"{self.__class__.__name__} dependencies are not met.")
            model = self._load_model_from_options(options)
        except Exception as e:
            self.logger.error(f"Failed to load model for key {cache_key}: {e}", exc_info=True)
            # Make sure a failed load never leaves an entry behind
            self._model_cache.pop(cache_key, None)
            raise

        self._model_cache[cache_key] = model
        self.logger.info(f"Model loaded successfully for key: {cache_key}")
        return model

    @abstractmethod
    def _load_model_from_options(self, options: "CheckboxOptions") -> Any:
        """
        Load and configure the detection model based on provided options.

        Args:
            options: The options dataclass instance.

        Returns:
            The loaded model object(s).
        """
        raise NotImplementedError("Subclasses must implement _load_model_from_options")

    def _map_label_to_state(self, label: str, options: "CheckboxOptions") -> tuple[bool, str]:
        """
        Map model output label to checkbox state.

        The label is lower-cased and stripped, then looked up in
        ``options.label_mapping`` (mapping keys are compared case-insensitively).
        Labels without a mapping fall back to a substring heuristic in which
        negated spellings such as 'unchecked' win over 'checked'.

        Args:
            label: Raw label from model (e.g., 'checked_checkbox', '1')
            options: Options containing label mapping

        Returns:
            Tuple of (is_checked: bool, state: str)
        """
        # Normalize label
        normalized_label = str(label).lower().strip()
        mapping = options.label_mapping or {}

        # Exact (already normalized) key
        if normalized_label in mapping:
            state = mapping[normalized_label]
            return state == "checked", state

        # Keys supplied with different casing/whitespace
        for key, state in mapping.items():
            if str(key).lower().strip() == normalized_label:
                return state == "checked", state

        # Default heuristic if not in mapping: rule out negated labels first,
        # otherwise 'unchecked' would match the 'checked' substring.
        if any(term in normalized_label for term in self._UNCHECKED_TERMS):
            return False, "unchecked"
        if any(term in normalized_label for term in self._CHECKED_TERMS):
            return True, "checked"
        return False, "unchecked"

    def _apply_nms(
        self, detections: List[Dict[str, Any]], iou_threshold: float
    ) -> List[Dict[str, Any]]:
        """
        Apply non-maximum suppression to remove overlapping detections.
        For checkboxes, any box that overlaps an already accepted box is rejected.

        Args:
            detections: List of detection dictionaries
            iou_threshold: IoU threshold for suppression (ignored for checkboxes - we use stricter rules)

        Returns:
            Filtered list of detections, ordered by descending confidence
        """
        if not detections:
            return detections

        # Sort by confidence (descending), then by area (ascending) to prefer smaller boxes
        def sort_key(det):
            bbox = det["bbox"]
            area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
            return (-det["confidence"], area)

        keep: List[Dict[str, Any]] = []
        for det in sorted(detections, key=sort_key):
            det_bbox = det["bbox"]
            # First already-kept box this candidate collides with, if any
            blocker = next(
                (kept["bbox"] for kept in keep if self._boxes_overlap(det_bbox, kept["bbox"])),
                None,
            )
            if blocker is not None:
                self.logger.debug("Rejecting box %s due to overlap with %s", det_bbox, blocker)
                continue

            keep.append(det)
            self.logger.debug("Keeping box %s with confidence %s", det_bbox, det["confidence"])

        self.logger.info(f"NMS: Reduced {len(detections)} detections to {len(keep)}")
        return keep

    def _boxes_overlap(self, box1: tuple, box2: tuple) -> bool:
        """Check if two boxes have any overlap at all (shared edges do not count)."""
        x1_min, y1_min, x1_max, y1_max = box1
        x2_min, y2_min, x2_max, y2_max = box2

        separated_horizontally = x1_max <= x2_min or x2_max <= x1_min
        separated_vertically = y1_max <= y2_min or y2_max <= y1_min
        return not (separated_horizontally or separated_vertically)

    @staticmethod
    def _intersection_area(box1: tuple, box2: tuple) -> float:
        """Return the area shared by two (x0, y0, x1, y1) boxes, 0.0 if disjoint."""
        width = min(box1[2], box2[2]) - max(box1[0], box2[0])
        height = min(box1[3], box2[3]) - max(box1[1], box2[1])
        if width < 0 or height < 0:
            return 0.0
        return width * height

    def _compute_intersection_ratio(self, box1: tuple, box2: tuple) -> float:
        """
        Compute intersection ratio relative to the smaller box.
        This is more aggressive than IoU for checkbox detection.
        """
        inter_area = self._intersection_area(box1, box2)
        if not inter_area:
            return 0.0

        # Areas of both boxes
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

        # Ratio relative to smaller box
        smaller_area = min(area1, area2)
        if smaller_area == 0:
            return 0.0

        return inter_area / smaller_area

    def _compute_iou(self, box1: tuple, box2: tuple) -> float:
        """Compute IoU (intersection area divided by union area) between two boxes."""
        inter_area = self._intersection_area(box1, box2)
        if not inter_area:
            return 0.0

        # Union
        area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
        area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = area1 + area2 - inter_area

        if union_area == 0:
            return 0.0

        return inter_area / union_area

    def __del__(self):
        """Cleanup resources."""
        # __del__ also runs for instances whose __init__ never completed (or
        # was never called), so neither attribute is guaranteed to exist.
        log = getattr(self, "logger", None)
        if log is not None:
            log.info(f"Cleaning up {self.__class__.__name__} resources.")
        cache = getattr(self, "_model_cache", None)
        if cache is not None:
            cache.clear()