natural-pdf 0.1.24__tar.gz → 0.1.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267)
  1. {natural_pdf-0.1.24/natural_pdf.egg-info → natural_pdf-0.1.27}/PKG-INFO +1 -1
  2. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/ocr/index.md +18 -29
  3. {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/css/style.css +4 -0
  4. natural_pdf-0.1.27/docs/ocr-tool/index.html +31 -0
  5. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/FileSaver.min.js +3 -0
  6. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/babel.min.js +2 -0
  7. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/hooks.umd.js +2 -0
  8. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/htm-preact.umd.min.js +1 -0
  9. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/htm.umd.min.js +1 -0
  10. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/jszip.min.js +13 -0
  11. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/preact.umd.min.js +1 -0
  12. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/react-dom.development.js +29924 -0
  13. natural_pdf-0.1.27/docs/ocr-tool/js/vendor/react.development.js +3343 -0
  14. natural_pdf-0.1.27/docs/reflowing-pages/index.ipynb +358 -0
  15. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/reflowing-pages/index.md +4 -3
  16. natural_pdf-0.1.27/docs/tutorials/01-loading-and-extraction.ipynb +312 -0
  17. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/02-finding-elements.ipynb +42 -42
  18. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/03-extracting-blocks.ipynb +17 -17
  19. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/04-table-extraction.ipynb +30 -30
  20. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/05-excluding-content.ipynb +29 -29
  21. natural_pdf-0.1.27/docs/tutorials/06-document-qa.ipynb +445 -0
  22. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-layout-analysis.ipynb +41 -41
  23. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-working-with-regions.ipynb +58 -58
  24. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/08-spatial-navigation.ipynb +71 -71
  25. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/09-section-extraction.ipynb +109 -109
  26. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/10-form-field-extraction.ipynb +57 -57
  27. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/11-enhanced-table-processing.ipynb +141 -141
  28. natural_pdf-0.1.27/docs/tutorials/12-ocr-integration.ipynb +4771 -0
  29. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/13-semantic-search.ipynb +112 -112
  30. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/page.py +66 -7
  31. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/summary.py +2 -2
  32. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/line.py +9 -4
  33. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/region.py +48 -12
  34. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/text.py +50 -1
  35. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/document_qa.py +62 -8
  36. natural_pdf-0.1.27/natural_pdf/templates/spa/css/style.css +338 -0
  37. natural_pdf-0.1.27/natural_pdf/templates/spa/js/app.js +472 -0
  38. natural_pdf-0.1.27/natural_pdf/templates/spa/words.txt +235976 -0
  39. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/packaging.py +23 -9
  40. {natural_pdf-0.1.24 → natural_pdf-0.1.27/natural_pdf.egg-info}/PKG-INFO +1 -1
  41. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/SOURCES.txt +13 -0
  42. natural_pdf-0.1.27/pdfs/needs-ocr.pdf +0 -0
  43. natural_pdf-0.1.24/docs/reflowing-pages/index.ipynb +0 -360
  44. natural_pdf-0.1.24/docs/tutorials/01-loading-and-extraction.ipynb +0 -312
  45. natural_pdf-0.1.24/docs/tutorials/06-document-qa.ipynb +0 -445
  46. natural_pdf-0.1.24/docs/tutorials/12-ocr-integration.ipynb +0 -4733
  47. natural_pdf-0.1.24/pdfs/needs-ocr.pdf +0 -0
  48. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/analysis_framework.mdc +0 -0
  49. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/coding-style.mdc +0 -0
  50. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  51. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/minimal-comments.mdc +0 -0
  52. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  53. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  54. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.github/workflows/docs.yml +0 -0
  55. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.gitignore +0 -0
  56. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/.pre-commit-config.yaml +0 -0
  57. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/01-execute_notebooks.py +0 -0
  58. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/02-run_all_tutorials.sh +0 -0
  59. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/CLAUDE.md +0 -0
  60. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/LICENSE +0 -0
  61. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/MANIFEST.in +0 -0
  62. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/README.md +0 -0
  63. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/audit_packaging.py +0 -0
  64. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/check_run_md.sh +0 -0
  65. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/api/index.md +0 -0
  66. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/favicon.png +0 -0
  67. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/favicon.svg +0 -0
  68. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/javascripts/custom.js +0 -0
  69. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/logo.svg +0 -0
  70. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/sample-screen.png +0 -0
  71. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/social-preview.png +0 -0
  72. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/social-preview.svg +0 -0
  73. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/assets/stylesheets/custom.css +0 -0
  74. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/categorizing-documents/index.md +0 -0
  75. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/data-extraction/index.md +0 -0
  76. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/describe/index.ipynb +0 -0
  77. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/describe/index.md +0 -0
  78. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/document-qa/index.ipynb +0 -0
  79. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/document-qa/index.md +0 -0
  80. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/element-selection/index.ipynb +0 -0
  81. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/element-selection/index.md +0 -0
  82. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/extracting-clean-text/index.ipynb +0 -0
  83. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/extracting-clean-text/index.md +0 -0
  84. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/finetuning/index.md +0 -0
  85. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/index.ipynb +0 -0
  86. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/index.md +0 -0
  87. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_1.csv +0 -0
  88. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_2.csv +0 -0
  89. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/fix-messy-tables/table_3.csv +0 -0
  90. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/index.md +0 -0
  91. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/installation/index.md +0 -0
  92. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/interactive-widget/index.ipynb +0 -0
  93. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/interactive-widget/index.md +0 -0
  94. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/layout-analysis/index.ipynb +0 -0
  95. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/layout-analysis/index.md +0 -0
  96. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/loops-and-groups/index.ipynb +0 -0
  97. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/loops-and-groups/index.md +0 -0
  98. {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/js/app.js +0 -0
  99. {natural_pdf-0.1.24/natural_pdf/templates/spa → natural_pdf-0.1.27/docs/ocr-tool}/words.txt +0 -0
  100. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/pdf-navigation/index.ipynb +0 -0
  101. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/pdf-navigation/index.md +0 -0
  102. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/extracted_form_data.csv +0 -0
  103. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/index.ipynb +0 -0
  104. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/process-forms-and-invoices/index.md +0 -0
  105. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/quick-reference/index.ipynb +0 -0
  106. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/quick-reference/index.md +0 -0
  107. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/regions/index.ipynb +0 -0
  108. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/regions/index.md +0 -0
  109. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tables/index.ipynb +0 -0
  110. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tables/index.md +0 -0
  111. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-analysis/index.ipynb +0 -0
  112. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-analysis/index.md +0 -0
  113. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/text-extraction/index.ipynb +0 -0
  114. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/01-loading-and-extraction.md +0 -0
  115. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/02-finding-elements.md +0 -0
  116. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/03-extracting-blocks.md +0 -0
  117. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/04-table-extraction.md +0 -0
  118. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/05-excluding-content.md +0 -0
  119. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/06-document-qa.md +0 -0
  120. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-layout-analysis.md +0 -0
  121. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/07-working-with-regions.md +0 -0
  122. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/08-spatial-navigation.md +0 -0
  123. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/09-section-extraction.md +0 -0
  124. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/10-form-field-extraction.md +0 -0
  125. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  126. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/12-ocr-integration.md +0 -0
  127. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/13-semantic-search.md +0 -0
  128. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/14-categorizing-documents.ipynb +0 -0
  129. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/tutorials/14-categorizing-documents.md +0 -0
  130. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/index.ipynb +0 -0
  131. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/index.md +0 -0
  132. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/docs/visual-debugging/region.png +0 -0
  133. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/mkdocs.yml +0 -0
  134. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/__init__.py +0 -0
  135. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/__init__.py +0 -0
  136. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/__init__.py +0 -0
  137. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/base.py +0 -0
  138. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/docling.py +0 -0
  139. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/gemini.py +0 -0
  140. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  141. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  142. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  143. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/paddle.py +0 -0
  144. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  145. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/surya.py +0 -0
  146. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/table_structure_utils.py +0 -0
  147. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/tatr.py +0 -0
  148. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/layout/yolo.py +0 -0
  149. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/shape_detection_mixin.py +0 -0
  150. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/text_options.py +0 -0
  151. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/text_structure.py +0 -0
  152. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/analyzers/utils.py +0 -0
  153. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/manager.py +0 -0
  154. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/mixin.py +0 -0
  155. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/classification/results.py +0 -0
  156. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/cli.py +0 -0
  157. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/collections/mixins.py +0 -0
  158. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/collections/pdf_collection.py +0 -0
  159. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/__init__.py +0 -0
  160. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/element_manager.py +0 -0
  161. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/highlighting_service.py +0 -0
  162. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/core/pdf.py +0 -0
  163. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/__init__.py +0 -0
  164. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/base.py +0 -0
  165. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/elements.py +0 -0
  166. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/describe/mixin.py +0 -0
  167. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/__init__.py +0 -0
  168. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/base.py +0 -0
  169. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/collections.py +0 -0
  170. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/elements/rect.py +0 -0
  171. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/export/mixin.py +0 -0
  172. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/__init__.py +0 -0
  173. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/base.py +0 -0
  174. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/__init__.py +0 -0
  175. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/pdf.ttf +0 -0
  176. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/data/sRGB.icc +0 -0
  177. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/hocr.py +0 -0
  178. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/hocr_font.py +0 -0
  179. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/original_pdf.py +0 -0
  180. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/paddleocr.py +0 -0
  181. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/exporters/searchable_pdf.py +0 -0
  182. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/manager.py +0 -0
  183. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/mixin.py +0 -0
  184. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/extraction/result.py +0 -0
  185. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/__init__.py +0 -0
  186. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/collections.py +0 -0
  187. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/element.py +0 -0
  188. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/flow.py +0 -0
  189. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/flows/region.py +0 -0
  190. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/__init__.py +0 -0
  191. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine.py +0 -0
  192. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_doctr.py +0 -0
  193. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_easyocr.py +0 -0
  194. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_paddle.py +0 -0
  195. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/engine_surya.py +0 -0
  196. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_factory.py +0 -0
  197. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_manager.py +0 -0
  198. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/ocr_options.py +0 -0
  199. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/ocr/utils.py +0 -0
  200. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/__init__.py +0 -0
  201. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/qa/qa_result.py +0 -0
  202. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/__init__.py +0 -0
  203. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/lancedb_search_service.py +0 -0
  204. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/numpy_search_service.py +0 -0
  205. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/search_options.py +0 -0
  206. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/search_service_protocol.py +0 -0
  207. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/search/searchable_mixin.py +0 -0
  208. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/selectors/__init__.py +0 -0
  209. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/selectors/parser.py +0 -0
  210. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/__init__.py +0 -0
  211. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  212. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/templates/spa/index.html +0 -0
  213. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/__init__.py +0 -0
  214. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/debug.py +0 -0
  215. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/highlighting.py +0 -0
  216. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/identifiers.py +0 -0
  217. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/locks.py +0 -0
  218. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/reading_order.py +0 -0
  219. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/text_extraction.py +0 -0
  220. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/utils/visualization.py +0 -0
  221. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/widgets/__init__.py +0 -0
  222. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf/widgets/viewer.py +0 -0
  223. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/dependency_links.txt +0 -0
  224. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/entry_points.txt +0 -0
  225. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/requires.txt +0 -0
  226. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/natural_pdf.egg-info/top_level.txt +0 -0
  227. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/noxfile.py +0 -0
  228. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/.gitkeep +0 -0
  229. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/01-practice.pdf +0 -0
  230. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/0500000US42001.pdf +0 -0
  231. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/0500000US42007.pdf +0 -0
  232. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/1107231007033739008.pdf +0 -0
  233. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/2014 Statistics.pdf +0 -0
  234. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/2019 Statistics.pdf +0 -0
  235. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/30.pdf +0 -0
  236. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  237. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/anexo_edital_6604_1743480-table.pdf +0 -0
  238. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/appendix_fy2026.pdf +0 -0
  239. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/cia-doc.pdf +0 -0
  240. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/geometry.pdf +0 -0
  241. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/image.png +0 -0
  242. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/image.png.pdf +0 -0
  243. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/multicolumn.pdf +0 -0
  244. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/red.pdf +0 -0
  245. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-2.pdf +0 -0
  246. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-3.pdf +0 -0
  247. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-small.jpg +0 -0
  248. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr-wide.jpg +0 -0
  249. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny-ocr.pdf +0 -0
  250. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/tiny.pdf +0 -0
  251. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pdfs/word-counter.pdf +0 -0
  252. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/publish.sh +0 -0
  253. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/pyproject.toml +0 -0
  254. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/sample-screen.png +0 -0
  255. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/setup.cfg +0 -0
  256. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/test_install.sh +0 -0
  257. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/conftest.py +0 -0
  258. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/exporters/test_paddleocr_exporter.py +0 -0
  259. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_containment_geometry.py +0 -0
  260. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_elements.py +0 -0
  261. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_loading.py +0 -0
  262. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_spatial.py +0 -0
  263. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_core/test_text_extraction.py +0 -0
  264. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_loading_original.py +0 -0
  265. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_optional_deps.py +0 -0
  266. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/tests/test_tutorials.py +0 -0
  267. {natural_pdf-0.1.24 → natural_pdf-0.1.27}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.24
3
+ Version: 0.1.27
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -6,12 +6,12 @@ Got a PDF that's actually just a bunch of scanned images? Or maybe a PDF where t
6
6
 
7
7
  Natural PDF supports multiple OCR engines, each with different strengths:
8
8
 
9
- | Engine | Best For | Speed | Memory | Notes |
10
- |---------------------|----------|-------|--------|-------|
11
- | **EasyOCR** | General documents, handwritten text | Moderate | Higher | Good all-around choice |
12
- | **PaddleOCR** | Asian languages, when speed matters | Fast | Efficient | Great for Chinese, Japanese, Korean |
13
- | **Surya OCR** | Highest accuracy needed | Moderate | Higher (GPU helps) | Best quality results |
14
- | **Gemini** | Complex layouts (via API) | Depends on API | N/A | Requires API key |
9
+ - [EasyOCR](https://github.com/JaidedAI/EasyOCR)
10
+ - [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
11
+ - [Surya](https://github.com/datalab-to/surya)
12
+ - [DocTR](https://github.com/mindee/doctr)
13
+
14
+ What are those strengths? Honestly, it doesn't matter much: *it's so easy to try each of them that you can just see what works best for you*.
15
15
 
16
16
  If you try to use an engine that isn't installed, Natural PDF will tell you exactly what to install.
17
17
 
@@ -27,7 +27,7 @@ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-o
27
27
  page = pdf.pages[0]
28
28
 
29
29
  # Apply OCR using the default engine
30
- ocr_elements = page.apply_ocr(languages=['en'])
30
+ ocr_elements = page.apply_ocr()
31
31
 
32
32
  # Extract the text (uses OCR results automatically)
33
33
  text = page.extract_text()
@@ -68,29 +68,16 @@ easy_opts = EasyOCROptions(
68
68
  batch_size=8 # Process multiple regions at once
69
69
  )
70
70
  ocr_elements = page.apply_ocr(engine='easyocr', options=easy_opts)
71
-
72
- # Configure Surya for high-accuracy line detection
73
- surya_opts = SuryaOCROptions(
74
- languages=['en', 'de'],
75
- min_confidence=0.4 # Minimum confidence for results
76
- )
77
- ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
78
71
  ```
79
72
 
80
- ## How OCR Actually Works
73
+ ## OCRing regions
81
74
 
82
- When you run `page.apply_ocr()`, here's what happens:
75
+ Don't want to apply OCR to an entire page? You don't need to!
83
76
 
84
77
  ```python
85
- # Apply OCR to a page - this adds text elements to the page
86
- ocr_elements = page.apply_ocr(engine='easyocr')
87
- print(f"Found {len(ocr_elements)} text elements via OCR")
88
-
89
- # You can also OCR just a specific region
90
- title = page.find('text:contains("Title")')
91
- if title:
92
- content_region = title.below(height=300)
93
- region_ocr_elements = content_region.apply_ocr(engine='paddle', languages=['en'])
78
+ # Grab the top half of the page
79
+ region = page.region(0, 0, height=page.height/2, width=page.width)
80
+ region.apply_ocr(engine='paddle')
94
81
  ```
95
82
 
96
83
  *Note: Running OCR again on the same area will replace the previous OCR results.*
@@ -195,7 +182,10 @@ Natural PDF includes a web app for reviewing and correcting OCR results:
195
182
  create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
196
183
  ```
197
184
 
198
- 2. **Start the web app:**
185
+ 2. **Visit [the live OCR tool](https://jsoma.github.io/natural-pdf/ocr-tool)** and upload your zip file.
186
+
187
+ Alternatively, if you would rather run the tool locally, you can do it like this:
188
+
199
189
  ```bash
200
190
  # Find where Natural PDF is installed
201
191
  NATURAL_PDF_PATH=$(python -c "import site; print(site.getsitepackages()[0])")/natural_pdf
@@ -203,10 +193,9 @@ Natural PDF includes a web app for reviewing and correcting OCR results:
203
193
  # Start the web server
204
194
  cd $NATURAL_PDF_PATH/templates/spa
205
195
  python -m http.server 8000
206
- ```
207
196
 
208
- 3. **Use the app:**
209
- Open `http://localhost:8000` in your browser and drag in your `correction_package.zip` file to review and edit the OCR results.
197
+ # Open http://localhost:8000 in your browser
198
+ ```
210
199
 
211
200
  ## Next Steps
212
201
 
@@ -195,6 +195,10 @@ button.secondary:hover {
195
195
  /* display: inline-block; /* Remove this if using flex parent */
196
196
  }
197
197
 
198
+ .image-clip-canvas {
199
+ min-width: 70%;
200
+ }
201
+
198
202
  .editing-content {
199
203
  font-size: 18px;
200
204
  text-align: center;
@@ -0,0 +1,31 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>OCR Correction Tool</title>
7
+ <link rel="stylesheet" href="css/style.css">
8
+ </head>
9
+ <body>
10
+ <header>
11
+ <h1>OCR Correction Tool</h1>
12
+ </header>
13
+
14
+ <main id="app">
15
+ <p>Loading application...</p>
16
+ </main>
17
+
18
+ <footer>
19
+ <p>Generated by natural-pdf</p>
20
+ </footer>
21
+
22
+ <script src="js/vendor/react.development.js"></script>
23
+ <script src="js/vendor/react-dom.development.js"></script>
24
+ <script src="js/vendor/babel.min.js"></script>
25
+ <script src="js/vendor/jszip.min.js"></script>
26
+ <script src="js/vendor/FileSaver.min.js"></script>
27
+
28
+ <script type="text/babel" src="js/app.js"></script>
29
+
30
+ </body>
31
+ </html>
@@ -0,0 +1,3 @@
1
+ (function(a,b){if("function"==typeof define&&define.amd)define([],b);else if("undefined"!=typeof exports)b();else{b(),a.FileSaver={exports:{}}.exports}})(this,function(){"use strict";function b(a,b){return"undefined"==typeof b?b={autoBom:!1}:"object"!=typeof b&&(console.warn("Deprecated: Expected third argument to be a object"),b={autoBom:!b}),b.autoBom&&/^\s*(?:text\/\S*|application\/xml|\S*\/\S*\+xml)\s*;.*charset\s*=\s*utf-8/i.test(a.type)?new Blob(["\uFEFF",a],{type:a.type}):a}function c(a,b,c){var d=new XMLHttpRequest;d.open("GET",a),d.responseType="blob",d.onload=function(){g(d.response,b,c)},d.onerror=function(){console.error("could not download file")},d.send()}function d(a){var b=new XMLHttpRequest;b.open("HEAD",a,!1);try{b.send()}catch(a){}return 200<=b.status&&299>=b.status}function e(a){try{a.dispatchEvent(new MouseEvent("click"))}catch(c){var b=document.createEvent("MouseEvents");b.initMouseEvent("click",!0,!0,window,0,0,0,80,20,!1,!1,!1,!1,0,null),a.dispatchEvent(b)}}var f="object"==typeof window&&window.window===window?window:"object"==typeof self&&self.self===self?self:"object"==typeof global&&global.global===global?global:void 0,a=f.navigator&&/Macintosh/.test(navigator.userAgent)&&/AppleWebKit/.test(navigator.userAgent)&&!/Safari/.test(navigator.userAgent),g=f.saveAs||("object"!=typeof window||window!==f?function(){}:"download"in HTMLAnchorElement.prototype&&!a?function(b,g,h){var i=f.URL||f.webkitURL,j=document.createElement("a");g=g||b.name||"download",j.download=g,j.rel="noopener","string"==typeof b?(j.href=b,j.origin===location.origin?e(j):d(j.href)?c(b,g,h):e(j,j.target="_blank")):(j.href=i.createObjectURL(b),setTimeout(function(){i.revokeObjectURL(j.href)},4E4),setTimeout(function(){e(j)},0))}:"msSaveOrOpenBlob"in navigator?function(f,g,h){if(g=g||f.name||"download","string"!=typeof f)navigator.msSaveOrOpenBlob(b(f,h),g);else if(d(f))c(f,g,h);else{var 
i=document.createElement("a");i.href=f,i.target="_blank",setTimeout(function(){e(i)})}}:function(b,d,e,g){if(g=g||open("","_blank"),g&&(g.document.title=g.document.body.innerText="downloading..."),"string"==typeof b)return c(b,d,e);var h="application/octet-stream"===b.type,i=/constructor/i.test(f.HTMLElement)||f.safari,j=/CriOS\/[\d]+/.test(navigator.userAgent);if((j||h&&i||a)&&"undefined"!=typeof FileReader){var k=new FileReader;k.onloadend=function(){var a=k.result;a=j?a:a.replace(/^data:[^;]*;/,"data:attachment/file;"),g?g.location.href=a:location=a,g=null},k.readAsDataURL(b)}else{var l=f.URL||f.webkitURL,m=l.createObjectURL(b);g?g.location=m:location.href=m,g=null,setTimeout(function(){l.revokeObjectURL(m)},4E4)}});f.saveAs=g.saveAs=g,"undefined"!=typeof module&&(module.exports=g)});
2
+
3
+ //# sourceMappingURL=FileSaver.min.js.map