natural-pdf 0.2.8__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (320)
  1. {natural_pdf-0.2.8/natural_pdf.egg-info → natural_pdf-0.2.9}/PKG-INFO +1 -1
  2. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/element_collection.py +61 -33
  3. {natural_pdf-0.2.8 → natural_pdf-0.2.9/natural_pdf.egg-info}/PKG-INFO +1 -1
  4. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/analysis_framework.mdc +0 -0
  5. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/coding-style.mdc +0 -0
  6. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  7. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/minimal-comments.mdc +0 -0
  8. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  9. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  10. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.github/workflows/ci.yml +0 -0
  11. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.github/workflows/docs.yml +0 -0
  12. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.github/workflows/nightly-tutorials.yml +0 -0
  13. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.gitignore +0 -0
  14. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/.pre-commit-config.yaml +0 -0
  15. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/01-execute_notebooks.py +0 -0
  16. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/02-run_all_tutorials.sh +0 -0
  17. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/CLAUDE.md +0 -0
  18. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/LICENSE +0 -0
  19. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/MANIFEST.in +0 -0
  20. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/README.md +0 -0
  21. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/audit_packaging.py +0 -0
  22. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/check_run_md.sh +0 -0
  23. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/api/index.md +0 -0
  24. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/favicon.png +0 -0
  25. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/favicon.svg +0 -0
  26. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/javascripts/custom.js +0 -0
  27. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/logo.svg +0 -0
  28. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/sample-screen.png +0 -0
  29. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/social-preview.png +0 -0
  30. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/social-preview.svg +0 -0
  31. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/assets/stylesheets/custom.css +0 -0
  32. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/categorizing-documents/index.md +0 -0
  33. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/data-extraction/index.md +0 -0
  34. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/describe/index.md +0 -0
  35. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/document-qa/index.md +0 -0
  36. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/element-selection/index.md +0 -0
  37. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/extracting-clean-text/index.md +0 -0
  38. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/finetuning/index.md +0 -0
  39. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/fix-messy-tables/index.md +0 -0
  40. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_1.csv +0 -0
  41. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_2.csv +0 -0
  42. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/fix-messy-tables/table_3.csv +0 -0
  43. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/index.md +0 -0
  44. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/installation/index.md +0 -0
  45. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/interactive-widget/index.md +0 -0
  46. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/layout-analysis/index.md +0 -0
  47. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/loops-and-groups/index.md +0 -0
  48. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/ocr/index.md +0 -0
  49. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/pdf-navigation/index.md +0 -0
  50. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  51. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/process-forms-and-invoices/index.md +0 -0
  52. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/quick-reference/index.md +0 -0
  53. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/reflowing-pages/index.md +0 -0
  54. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/regions/index.md +0 -0
  55. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tables/index.md +0 -0
  56. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/text-analysis/index.md +0 -0
  57. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
  58. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/02-finding-elements.md +0 -0
  59. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/03-extracting-blocks.md +0 -0
  60. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/04-table-extraction.md +0 -0
  61. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/05-excluding-content.md +0 -0
  62. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/06-document-qa.md +0 -0
  63. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/07-layout-analysis.md +0 -0
  64. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/07-working-with-regions.md +0 -0
  65. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/08-spatial-navigation.md +0 -0
  66. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/09-section-extraction.md +0 -0
  67. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/10-form-field-extraction.md +0 -0
  68. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  69. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/12-ocr-integration.md +0 -0
  70. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/13-semantic-search.md +0 -0
  71. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/tutorials/14-categorizing-documents.md +0 -0
  72. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/visual-debugging/index.md +0 -0
  73. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/docs/visual-debugging/region.png +0 -0
  74. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/mkdocs.yml +0 -0
  75. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/__init__.py +0 -0
  76. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/__init__.py +0 -0
  77. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/guides.py +0 -0
  78. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
  79. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/base.py +0 -0
  80. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/docling.py +0 -0
  81. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/gemini.py +0 -0
  82. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  83. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  84. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  85. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
  86. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  87. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/surya.py +0 -0
  88. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  89. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/tatr.py +0 -0
  90. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
  91. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  92. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_options.py +0 -0
  93. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/text_structure.py +0 -0
  94. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/analyzers/utils.py +0 -0
  95. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/classification/manager.py +0 -0
  96. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/classification/mixin.py +0 -0
  97. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/classification/results.py +0 -0
  98. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/cli.py +0 -0
  99. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/collections/mixins.py +0 -0
  100. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/__init__.py +0 -0
  101. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/element_manager.py +0 -0
  102. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/highlighting_service.py +0 -0
  103. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/page.py +0 -0
  104. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/page_collection.py +0 -0
  105. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/page_groupby.py +0 -0
  106. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/pdf.py +0 -0
  107. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/pdf_collection.py +0 -0
  108. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/core/render_spec.py +0 -0
  109. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/describe/__init__.py +0 -0
  110. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/describe/base.py +0 -0
  111. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/describe/elements.py +0 -0
  112. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/describe/mixin.py +0 -0
  113. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/describe/summary.py +0 -0
  114. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/__init__.py +0 -0
  115. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/base.py +0 -0
  116. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/image.py +0 -0
  117. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/line.py +0 -0
  118. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/rect.py +0 -0
  119. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/region.py +0 -0
  120. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/elements/text.py +0 -0
  121. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/export/mixin.py +0 -0
  122. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/__init__.py +0 -0
  123. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/base.py +0 -0
  124. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/data/__init__.py +0 -0
  125. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/data/pdf.ttf +0 -0
  126. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/data/sRGB.icc +0 -0
  127. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr.py +0 -0
  128. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/hocr_font.py +0 -0
  129. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/original_pdf.py +0 -0
  130. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/paddleocr.py +0 -0
  131. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
  132. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/extraction/manager.py +0 -0
  133. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/extraction/mixin.py +0 -0
  134. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/extraction/result.py +0 -0
  135. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/flows/__init__.py +0 -0
  136. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/flows/collections.py +0 -0
  137. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/flows/element.py +0 -0
  138. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/flows/flow.py +0 -0
  139. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/flows/region.py +0 -0
  140. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/__init__.py +0 -0
  141. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/engine.py +0 -0
  142. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_doctr.py +0 -0
  143. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_easyocr.py +0 -0
  144. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_paddle.py +0 -0
  145. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/engine_surya.py +0 -0
  146. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_factory.py +0 -0
  147. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_manager.py +0 -0
  148. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/ocr_options.py +0 -0
  149. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/ocr/utils.py +0 -0
  150. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/qa/__init__.py +0 -0
  151. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/qa/document_qa.py +0 -0
  152. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/qa/qa_result.py +0 -0
  153. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/__init__.py +0 -0
  154. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/lancedb_search_service.py +0 -0
  155. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/numpy_search_service.py +0 -0
  156. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/search_options.py +0 -0
  157. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/search_service_protocol.py +0 -0
  158. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/search/searchable_mixin.py +0 -0
  159. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/selectors/__init__.py +0 -0
  160. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/selectors/parser.py +0 -0
  161. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/tables/__init__.py +0 -0
  162. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/tables/result.py +0 -0
  163. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/__init__.py +0 -0
  164. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  165. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/spa/css/style.css +0 -0
  166. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/spa/index.html +0 -0
  167. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/spa/js/app.js +0 -0
  168. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/templates/spa/words.txt +0 -0
  169. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/text_mixin.py +0 -0
  170. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/__init__.py +0 -0
  171. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/bidi_mirror.py +0 -0
  172. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/color_utils.py +0 -0
  173. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/debug.py +0 -0
  174. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/highlighting.py +0 -0
  175. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/identifiers.py +0 -0
  176. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/layout.py +0 -0
  177. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/locks.py +0 -0
  178. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/packaging.py +0 -0
  179. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/reading_order.py +0 -0
  180. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/text_extraction.py +0 -0
  181. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/utils/visualization.py +0 -0
  182. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/vision/__init__.py +0 -0
  183. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/vision/mixin.py +0 -0
  184. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/vision/results.py +0 -0
  185. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/vision/similarity.py +0 -0
  186. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/widgets/__init__.py +0 -0
  187. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf/widgets/viewer.py +0 -0
  188. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf.egg-info/SOURCES.txt +0 -0
  189. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
  190. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf.egg-info/entry_points.txt +0 -0
  191. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf.egg-info/requires.txt +0 -0
  192. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/natural_pdf.egg-info/top_level.txt +0 -0
  193. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/noxfile.py +0 -0
  194. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/memory_comparison.py +0 -0
  195. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/pdf_analyzer.py +0 -0
  196. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/performance_analysis.py +0 -0
  197. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.csv +0 -0
  198. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/performance_results/image_heavy_snapshots.json +0 -0
  199. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.csv +0 -0
  200. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/performance_results/text_heavy_snapshots.json +0 -0
  201. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/test_cleanup_methods.py +0 -0
  202. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/optimization/test_memory_fix.py +0 -0
  203. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/publish.sh +0 -0
  204. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/pyproject.toml +0 -0
  205. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/sample-screen.png +0 -0
  206. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/setup.cfg +0 -0
  207. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/conftest.py +0 -0
  208. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/exporters/test_paddleocr_exporter.py +0 -0
  209. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_annotate.py +0 -0
  210. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_arabic_performance.py +0 -0
  211. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_arabic_real_world.py +0 -0
  212. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_color_conversion.py +0 -0
  213. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_color_hex_display.py +0 -0
  214. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_containment_geometry.py +0 -0
  215. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_elements.py +0 -0
  216. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_loading.py +0 -0
  217. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_spatial.py +0 -0
  218. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_text_extraction.py +0 -0
  219. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_core/test_text_layer.py +0 -0
  220. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_crop_enhancements.py +0 -0
  221. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_crop_region_highlights.py +0 -0
  222. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_directional_defaults.py +0 -0
  223. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve.py +0 -0
  224. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve_cross_page_bug.py +0 -0
  225. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve_debug_issue.py +0 -0
  226. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve_real_world_issue.py +0 -0
  227. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve_single_elements.py +0 -0
  228. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_dissolve_vertical_offset_issue.py +0 -0
  229. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_document_qa.py +0 -0
  230. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_element_addition.py +0 -0
  231. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_element_collection_show_cols.py +0 -0
  232. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_element_collection_slicing.py +0 -0
  233. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_element_show_crop_highlights.py +0 -0
  234. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_empty_pseudo_class.py +0 -0
  235. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_exclusions.py +0 -0
  236. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_expand.py +0 -0
  237. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_extraction_error.py +0 -0
  238. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_extraction_mixin_fix.py +0 -0
  239. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_extraction_text_and_vision.py +0 -0
  240. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_extraction_working.py +0 -0
  241. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_find_similar.py +0 -0
  242. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_first_last_selectors.py +0 -0
  243. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_fix_get_sections_zero_height.py +0 -0
  244. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_flow_region_directional.py +0 -0
  245. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_get_sections_fix_comprehensive.py +0 -0
  246. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_get_sections_zero_height.py +0 -0
  247. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_groupby.py +0 -0
  248. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides.py +0 -0
  249. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions.py +0 -0
  250. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_apply_exclusions_simple.py +0 -0
  251. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_extract_table.py +0 -0
  252. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_extract_table_collections.py +0 -0
  253. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_extract_table_exclusions.py +0 -0
  254. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_extract_table_real.py +0 -0
  255. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_guides_integration.py +0 -0
  256. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_highlight_detection.py +0 -0
  257. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_highlight_detection_comprehensive.py +0 -0
  258. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_highlight_protocol.py +0 -0
  259. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_highlight_protocol_simple.py +0 -0
  260. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_highlight_regions.py +0 -0
  261. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_comprehensive.py +0 -0
  262. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_debug.py +0 -0
  263. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_final.py +0 -0
  264. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_final_verification.py +0 -0
  265. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_fix.py +0 -0
  266. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_mock.py +0 -0
  267. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_simple.py +0 -0
  268. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_types_pdf.py +0 -0
  269. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_verification.py +0 -0
  270. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_include_boundaries_with_real_text.py +0 -0
  271. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_loading_original.py +0 -0
  272. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_merge_connected.py +0 -0
  273. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_merge_connected_real_world.py +0 -0
  274. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_merge_method.py +0 -0
  275. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_multi_page_table_discovery.py +0 -0
  276. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_optional_deps.py +0 -0
  277. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_page_exclusion_lists.py +0 -0
  278. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_pdf_add_exclusion_elementcollection.py +0 -0
  279. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_region_show_crop_highlights.py +0 -0
  280. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_region_viewer.py +0 -0
  281. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_sections_end_only.py +0 -0
  282. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_sections_with_start_and_end.py +0 -0
  283. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_show_column_layout.py +0 -0
  284. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_show_edge_cases.py +0 -0
  285. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_show_exclusions.py +0 -0
  286. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_show_exclusions_feature.py +0 -0
  287. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_show_limit.py +0 -0
  288. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_skip_repeating_headers_multipage.py +0 -0
  289. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_slice_cache_reuse.py +0 -0
  290. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_slice_exclusion_fix.py +0 -0
  291. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_slice_exclusion_issue.py +0 -0
  292. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_slice_exclusion_mock.py +0 -0
  293. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_sliced_collection_exclusions.py +0 -0
  294. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_strikethrough_detection.py +0 -0
  295. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_table_result_header_mismatch.py +0 -0
  296. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_table_result_keep_blank.py +0 -0
  297. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_tiny_text_tables.py +0 -0
  298. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_tiny_text_tables_table.py +0 -0
  299. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_tutorials.py +0 -0
  300. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_underline_detection.py +0 -0
  301. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tests/test_update_text.py +0 -0
  302. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/todo/bad_pdf_analysis.md +0 -0
  303. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/todo/evaluation.md +0 -0
  304. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/IMPROVEMENTS_SUMMARY.md +0 -0
  305. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_CheatSheet.md +0 -0
  306. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/LLM_NaturalPDF_Workflows.md +0 -0
  307. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/README.md +0 -0
  308. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/__init__.py +0 -0
  309. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/analyser.py +0 -0
  310. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/collate_summaries.py +0 -0
  311. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/compile_attempts_markdown.py +0 -0
  312. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/eval_suite.py +0 -0
  313. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/evaluate_quality.py +0 -0
  314. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/export_enrichment_csv.py +0 -0
  315. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/extraction_decision_tree.md +0 -0
  316. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich.py +0 -0
  317. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/llm_enrich_with_retry.py +0 -0
  318. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/reporter.py +0 -0
  319. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/tools/bad_pdf_eval/utils.py +0 -0
  320. {natural_pdf-0.2.8 → natural_pdf-0.2.9}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -633,9 +633,7 @@ class ElementCollection(
633
633
  pdfplumber's layout engine if layout=True is specified.
634
634
 
635
635
  Args:
636
- separator: String to insert between text from different elements when
637
- using simple joining (layout=False). Default is a single space.
638
- Ignored when layout=True as the layout engine handles spacing.
636
+ separator: String to join text from elements. Default is a single space.
639
637
  preserve_whitespace: Deprecated. Use layout=False for simple joining.
640
638
  use_exclusions: Deprecated. Exclusions should be applied *before* creating
641
639
  the collection or by filtering the collection itself.
@@ -652,15 +650,49 @@ class ElementCollection(
652
650
  Returns:
653
651
  Combined text from elements, potentially with layout-based spacing.
654
652
  """
655
- # Filter to just TextElements that likely have _char_dicts
656
- text_elements = [
653
+ # Check if we have any elements at all
654
+ if not self._elements:
655
+ return ""
656
+
657
+ # Check if all elements are TextElements with character data
658
+ text_elements_with_chars = [
657
659
  el
658
660
  for el in self._elements
659
- if isinstance(el, TextElement) and hasattr(el, "_char_dicts")
661
+ if isinstance(el, TextElement) and hasattr(el, "_char_dicts") and el._char_dicts
660
662
  ]
661
663
 
662
- if not text_elements:
663
- return ""
664
+ # If we have a mixed collection (Regions, TextElements without chars, etc),
665
+ # use a simpler approach: call extract_text on each element
666
+ if len(text_elements_with_chars) < len(self._elements):
667
+ # Mixed collection - extract text from each element
668
+ element_texts = []
669
+
670
+ # Sort elements by position first
671
+ sorted_elements = sorted(
672
+ self._elements,
673
+ key=lambda el: (
674
+ el.page.index if hasattr(el, "page") else 0,
675
+ el.top if hasattr(el, "top") else 0,
676
+ el.x0 if hasattr(el, "x0") else 0,
677
+ ),
678
+ )
679
+
680
+ for el in sorted_elements:
681
+ if hasattr(el, "extract_text"):
682
+ # Call extract_text on the element (works for TextElement, Region, etc)
683
+ text = el.extract_text(**kwargs)
684
+ if text:
685
+ element_texts.append(text)
686
+ elif hasattr(el, "text"):
687
+ # Fallback to text property if available
688
+ text = getattr(el, "text", "")
689
+ if text:
690
+ element_texts.append(text)
691
+
692
+ return separator.join(element_texts)
693
+
694
+ # All elements are TextElements with char data - use the original approach
695
+ text_elements = text_elements_with_chars
664
696
 
665
697
  # Collect all character dictionaries
666
698
  all_char_dicts = []
@@ -669,11 +701,20 @@ class ElementCollection(
669
701
 
670
702
  if not all_char_dicts:
671
703
  # Handle case where elements exist but have no char dicts
672
- logger.warning(
704
+ logger.debug(
673
705
  "ElementCollection.extract_text: No character dictionaries found in TextElements."
674
706
  )
707
+ # Sort elements by position before joining
708
+ sorted_text_elements = sorted(
709
+ text_elements,
710
+ key=lambda el: (
711
+ el.page.index if hasattr(el, "page") else 0,
712
+ el.top if hasattr(el, "top") else 0,
713
+ el.x0 if hasattr(el, "x0") else 0,
714
+ ),
715
+ )
675
716
  return separator.join(
676
- getattr(el, "text", "") for el in text_elements
717
+ getattr(el, "text", "") for el in sorted_text_elements
677
718
  ) # Fallback to simple join of word text
678
719
 
679
720
  # Apply content filtering if provided
@@ -737,33 +778,20 @@ class ElementCollection(
737
778
  all_char_dicts.sort(
738
779
  key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
739
780
  )
740
- result = separator.join(c.get("text", "") for c in all_char_dicts)
781
+ result = " ".join(c.get("text", "") for c in all_char_dicts)
741
782
 
742
783
  else:
784
+ print("JOIN WITHOUT LAYOUT")
743
785
  # Default: Simple join without layout
744
786
  logger.debug("ElementCollection.extract_text: Using simple join (layout=False).")
745
-
746
- # Instead of joining all characters individually, we need to:
747
- # 1. Extract text from each element
748
- # 2. Join the element texts with the separator
749
-
750
- # Sort elements by document order (page, top, x0)
751
- sorted_elements = sorted(
752
- text_elements,
753
- key=lambda el: (
754
- el.page.index if hasattr(el, "page") else 0,
755
- el.top if hasattr(el, "top") else 0,
756
- el.x0 if hasattr(el, "x0") else 0,
757
- ),
758
- )
759
-
760
- # Extract text from each element
761
- element_texts = []
762
- for el in sorted_elements:
763
- if hasattr(el, "text") and el.text:
764
- element_texts.append(el.text)
765
-
766
- result = separator.join(element_texts)
787
+ result = separator.join(el.extract_text() for el in text_elements)
788
+
789
+ # # Sort chars by document order (page, top, x0)
790
+ # all_char_dicts.sort(
791
+ # key=lambda c: (c.get("page_number", 0), c.get("top", 0), c.get("x0", 0))
792
+ # )
793
+ # # Simple join of character text
794
+ # result = "".join(c.get("text", "") for c in all_char_dicts)
767
795
 
768
796
  # Determine final strip flag – same rule as global helper unless caller overrides
769
797
  strip_text = strip if strip is not None else (not use_layout)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.2.8
3
+ Version: 0.2.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes