natural-pdf 0.1.8__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200) hide show
  1. natural_pdf-0.1.9/MANIFEST.in +48 -0
  2. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/PKG-INFO +12 -3
  3. natural_pdf-0.1.9/audit_packaging.py +56 -0
  4. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/data-extraction/index.md +41 -19
  5. natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
  6. natural_pdf-0.1.9/docs/tutorials/02-finding-elements.ipynb +374 -0
  7. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/02-finding-elements.md +3 -3
  8. natural_pdf-0.1.9/docs/tutorials/03-extracting-blocks.ipynb +152 -0
  9. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.ipynb +12 -12
  10. natural_pdf-0.1.9/docs/tutorials/05-excluding-content.ipynb +275 -0
  11. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.ipynb +28 -28
  12. natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +269 -0
  13. natural_pdf-0.1.9/docs/tutorials/07-working-with-regions.ipynb +414 -0
  14. natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +513 -0
  15. natural_pdf-0.1.9/docs/tutorials/09-section-extraction.ipynb +2439 -0
  16. natural_pdf-0.1.9/docs/tutorials/10-form-field-extraction.ipynb +503 -0
  17. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  18. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/12-ocr-integration.ipynb +1007 -1007
  19. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.ipynb +335 -642
  20. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.md +8 -7
  21. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/__init__.py +1 -0
  22. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/base.py +1 -5
  23. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/gemini.py +61 -51
  24. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  25. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_manager.py +26 -84
  26. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_options.py +7 -0
  27. natural_pdf-0.1.9/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  28. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/surya.py +46 -123
  29. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/tatr.py +51 -4
  30. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_structure.py +3 -5
  31. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/utils.py +3 -3
  32. natural_pdf-0.1.9/natural_pdf/classification/manager.py +422 -0
  33. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/classification/mixin.py +49 -35
  34. natural_pdf-0.1.9/natural_pdf/classification/results.py +80 -0
  35. natural_pdf-0.1.9/natural_pdf/collections/mixins.py +111 -0
  36. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/collections/pdf_collection.py +177 -64
  37. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/element_manager.py +30 -14
  38. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/highlighting_service.py +13 -22
  39. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/page.py +423 -101
  40. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/pdf.py +633 -190
  41. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/base.py +134 -40
  42. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/collections.py +503 -131
  43. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/region.py +659 -90
  44. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/text.py +1 -1
  45. natural_pdf-0.1.9/natural_pdf/export/mixin.py +137 -0
  46. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/base.py +3 -3
  47. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/paddleocr.py +4 -3
  48. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/extraction/manager.py +50 -49
  49. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/extraction/mixin.py +90 -57
  50. natural_pdf-0.1.9/natural_pdf/extraction/result.py +23 -0
  51. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/__init__.py +5 -5
  52. natural_pdf-0.1.9/natural_pdf/ocr/engine_doctr.py +346 -0
  53. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_factory.py +24 -4
  54. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_manager.py +61 -25
  55. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_options.py +70 -10
  56. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/utils.py +6 -4
  57. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/__init__.py +20 -34
  58. natural_pdf-0.1.9/natural_pdf/search/haystack_search_service.py +687 -0
  59. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/haystack_utils.py +99 -75
  60. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/search_service_protocol.py +11 -12
  61. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/selectors/parser.py +219 -143
  62. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/debug.py +3 -3
  63. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/locks.py +1 -1
  64. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/packaging.py +8 -6
  65. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/text_extraction.py +24 -16
  66. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/tqdm_utils.py +18 -10
  67. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/visualization.py +18 -0
  68. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/viewer.py +4 -25
  69. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/PKG-INFO +12 -3
  70. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/SOURCES.txt +4 -1
  71. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/requires.txt +13 -2
  72. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/top_level.txt +0 -2
  73. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pyproject.toml +28 -12
  74. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/exporters/test_paddleocr_exporter.py +4 -3
  75. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/test_optional_deps.py +43 -17
  76. natural_pdf-0.1.8/MANIFEST.in +0 -8
  77. natural_pdf-0.1.8/docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  78. natural_pdf-0.1.8/docs/tutorials/02-finding-elements.ipynb +0 -417
  79. natural_pdf-0.1.8/docs/tutorials/03-extracting-blocks.ipynb +0 -152
  80. natural_pdf-0.1.8/docs/tutorials/05-excluding-content.ipynb +0 -275
  81. natural_pdf-0.1.8/docs/tutorials/07-layout-analysis.ipynb +0 -293
  82. natural_pdf-0.1.8/docs/tutorials/07-working-with-regions.ipynb +0 -414
  83. natural_pdf-0.1.8/docs/tutorials/08-spatial-navigation.ipynb +0 -513
  84. natural_pdf-0.1.8/docs/tutorials/09-section-extraction.ipynb +0 -2439
  85. natural_pdf-0.1.8/docs/tutorials/10-form-field-extraction.ipynb +0 -517
  86. natural_pdf-0.1.8/natural_pdf/classification/manager.py +0 -343
  87. natural_pdf-0.1.8/natural_pdf/classification/results.py +0 -62
  88. natural_pdf-0.1.8/natural_pdf/collections/mixins.py +0 -63
  89. natural_pdf-0.1.8/natural_pdf/extraction/result.py +0 -37
  90. natural_pdf-0.1.8/natural_pdf/search/haystack_search_service.py +0 -643
  91. natural_pdf-0.1.8/notebooks/Examples.ipynb +0 -1293
  92. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/analysis_framework.mdc +0 -0
  93. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/coding-style.mdc +0 -0
  94. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  95. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/minimal-comments.mdc +0 -0
  96. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  97. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  98. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.github/workflows/docs.yml +0 -0
  99. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/.gitignore +0 -0
  100. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/01-execute_notebooks.py +0 -0
  101. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/02-run_all_tutorials.sh +0 -0
  102. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/CLAUDE.md +0 -0
  103. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/LICENSE +0 -0
  104. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/README.md +0 -0
  105. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/check_run_md.sh +0 -0
  106. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/api/index.md +0 -0
  107. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/favicon.png +0 -0
  108. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/favicon.svg +0 -0
  109. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/javascripts/custom.js +0 -0
  110. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/logo.svg +0 -0
  111. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/sample-screen.png +0 -0
  112. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/social-preview.png +0 -0
  113. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/social-preview.svg +0 -0
  114. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/assets/stylesheets/custom.css +0 -0
  115. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/categorizing-documents/index.md +0 -0
  116. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/document-qa/index.ipynb +0 -0
  117. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/document-qa/index.md +0 -0
  118. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/element-selection/index.ipynb +0 -0
  119. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/element-selection/index.md +0 -0
  120. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/finetuning/index.md +0 -0
  121. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/index.md +0 -0
  122. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/installation/index.md +0 -0
  123. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/interactive-widget/index.ipynb +0 -0
  124. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/interactive-widget/index.md +0 -0
  125. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/layout-analysis/index.ipynb +0 -0
  126. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/layout-analysis/index.md +0 -0
  127. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/ocr/index.md +0 -0
  128. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/pdf-navigation/index.ipynb +0 -0
  129. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/pdf-navigation/index.md +0 -0
  130. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/regions/index.ipynb +0 -0
  131. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/regions/index.md +0 -0
  132. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tables/index.ipynb +0 -0
  133. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tables/index.md +0 -0
  134. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-analysis/index.ipynb +0 -0
  135. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-analysis/index.md +0 -0
  136. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-extraction/index.ipynb +0 -0
  137. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/text-extraction/index.md +0 -0
  138. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
  139. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/03-extracting-blocks.md +0 -0
  140. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.md +0 -0
  141. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/05-excluding-content.md +0 -0
  142. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.md +0 -0
  143. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/07-layout-analysis.md +0 -0
  144. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/07-working-with-regions.md +0 -0
  145. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/08-spatial-navigation.md +0 -0
  146. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/09-section-extraction.md +0 -0
  147. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/10-form-field-extraction.md +0 -0
  148. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  149. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/tutorials/12-ocr-integration.md +0 -0
  150. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/index.ipynb +0 -0
  151. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/index.md +0 -0
  152. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/docs/visual-debugging/region.png +0 -0
  153. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/mkdocs.yml +0 -0
  154. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/__init__.py +0 -0
  155. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
  156. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/docling.py +0 -0
  157. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
  158. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
  159. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_options.py +0 -0
  160. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/core/__init__.py +0 -0
  161. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/__init__.py +0 -0
  162. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/line.py +0 -0
  163. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/elements/rect.py +0 -0
  164. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/__init__.py +0 -0
  165. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
  166. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine.py +0 -0
  167. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_easyocr.py +0 -0
  168. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_paddle.py +0 -0
  169. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_surya.py +0 -0
  170. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/qa/__init__.py +0 -0
  171. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/qa/document_qa.py +0 -0
  172. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/search_options.py +0 -0
  173. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/search/searchable_mixin.py +0 -0
  174. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/selectors/__init__.py +0 -0
  175. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/__init__.py +0 -0
  176. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  177. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/css/style.css +0 -0
  178. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/index.html +0 -0
  179. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/js/app.js +0 -0
  180. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/templates/spa/words.txt +0 -0
  181. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/__init__.py +0 -0
  182. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/highlighting.py +0 -0
  183. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/identifiers.py +1 -1
  184. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/utils/reading_order.py +0 -0
  185. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/__init__.py +0 -0
  186. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf/widgets/frontend/viewer.js +0 -0
  187. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
  188. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/noxfile.py +0 -0
  189. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/.gitkeep +0 -0
  190. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/01-practice.pdf +0 -0
  191. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/0500000US42001.pdf +0 -0
  192. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/0500000US42007.pdf +0 -0
  193. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/2014 Statistics.pdf +0 -0
  194. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/2019 Statistics.pdf +0 -0
  195. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  196. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/pdfs/needs-ocr.pdf +0 -0
  197. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/publish.sh +0 -0
  198. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/sample-screen.png +0 -0
  199. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/setup.cfg +0 -0
  200. {natural_pdf-0.1.8 → natural_pdf-0.1.9}/tests/test_loading.py +0 -0
@@ -0,0 +1,48 @@
1
+ include README.md
2
+ include LICENSE
3
+
4
+ # HTML templates
5
+ recursive-include natural_pdf/templates *.html
6
+
7
+ # Documentation assets
8
+ recursive-include docs *.md *.png *.jpg *.gif
9
+
10
+ # Remove common build garbage
11
+ global-exclude __pycache__ *.py[cod] *.so .DS_Store
12
+ global-exclude *hidden*
13
+
14
+ # 💣 Critical: prevent recursion bugs
15
+ prune build
16
+ prune dist
17
+ prune .nox
18
+ prune .venv
19
+ prune env
20
+ prune venv
21
+
22
+ # General junk
23
+ exclude .notebook_cache.json
24
+ exclude Untitled.ipynb
25
+ exclude conversation.md
26
+ exclude transcript.md
27
+ exclude sample.py
28
+ exclude sample2.py
29
+ exclude requirements.lock
30
+ exclude install.sh
31
+
32
+ # Directories to exclude
33
+ prune .venv
34
+ prune output
35
+ prune results
36
+ prune natural_pdf_index
37
+ prune hidden
38
+ prune pdfs/hidden
39
+ prune my_paddleocr_finetune_data
40
+ prune notebooks
41
+ prune docs/tutorials/pdfs
42
+
43
+ # Individual files in nested directories
44
+ exclude docs/tutorials/needs-ocr-searchable.pdf
45
+ exclude notebooks/Examples.md
46
+
47
+ # File patterns
48
+ global-exclude *.hocr
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -17,11 +17,13 @@ Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
19
  Requires-Dist: tqdm
20
+ Requires-Dist: pydantic
20
21
  Provides-Extra: interactive
21
22
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
22
23
  Provides-Extra: haystack
23
24
  Requires-Dist: haystack-ai; extra == "haystack"
24
- Requires-Dist: chroma-haystack; extra == "haystack"
25
+ Requires-Dist: lancedb-haystack; extra == "haystack"
26
+ Requires-Dist: lancedb; extra == "haystack"
25
27
  Requires-Dist: sentence-transformers; extra == "haystack"
26
28
  Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
27
29
  Provides-Extra: easyocr
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
36
38
  Provides-Extra: surya
37
39
  Requires-Dist: surya-ocr; extra == "surya"
38
40
  Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
+ Provides-Extra: doctr
42
+ Requires-Dist: python-doctr[torch]; extra == "doctr"
43
+ Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
39
44
  Provides-Extra: qa
40
45
  Requires-Dist: natural-pdf[core-ml]; extra == "qa"
41
46
  Provides-Extra: docling
@@ -43,7 +48,6 @@ Requires-Dist: docling; extra == "docling"
43
48
  Requires-Dist: natural-pdf[core-ml]; extra == "docling"
44
49
  Provides-Extra: llm
45
50
  Requires-Dist: openai>=1.0; extra == "llm"
46
- Requires-Dist: pydantic; extra == "llm"
47
51
  Provides-Extra: classification
48
52
  Requires-Dist: sentence-transformers; extra == "classification"
49
53
  Requires-Dist: timm; extra == "classification"
@@ -63,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
63
67
  Requires-Dist: nbformat; extra == "dev"
64
68
  Requires-Dist: jupytext; extra == "dev"
65
69
  Requires-Dist: nbclient; extra == "dev"
70
+ Provides-Extra: deskew
71
+ Requires-Dist: deskew>=1.5; extra == "deskew"
72
+ Requires-Dist: img2pdf; extra == "deskew"
66
73
  Provides-Extra: all
67
74
  Requires-Dist: natural-pdf[interactive]; extra == "all"
68
75
  Requires-Dist: natural-pdf[haystack]; extra == "all"
@@ -70,11 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
70
77
  Requires-Dist: natural-pdf[paddle]; extra == "all"
71
78
  Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
72
79
  Requires-Dist: natural-pdf[surya]; extra == "all"
80
+ Requires-Dist: natural-pdf[doctr]; extra == "all"
73
81
  Requires-Dist: natural-pdf[qa]; extra == "all"
74
82
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
75
83
  Requires-Dist: natural-pdf[docling]; extra == "all"
76
84
  Requires-Dist: natural-pdf[llm]; extra == "all"
77
85
  Requires-Dist: natural-pdf[classification]; extra == "all"
86
+ Requires-Dist: natural-pdf[deskew]; extra == "all"
78
87
  Requires-Dist: natural-pdf[test]; extra == "all"
79
88
  Provides-Extra: core-ml
80
89
  Requires-Dist: torch; extra == "core-ml"
@@ -0,0 +1,56 @@
1
+ import subprocess
2
+ import tarfile
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ DIST_DIR = Path("dist")
7
+
8
+
9
+ def build_package():
10
+ subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
11
+
12
+
13
+ def get_sdist_files():
14
+ sdist_path = next(DIST_DIR.glob("*.tar.gz"))
15
+ with tarfile.open(sdist_path, "r:gz") as tar:
16
+ return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
17
+
18
+
19
+ def get_wheel_files():
20
+ wheel_path = next(DIST_DIR.glob("*.whl"))
21
+ with zipfile.ZipFile(wheel_path, "r") as zipf:
22
+ return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
23
+
24
+
25
+ def get_gitignored_files():
26
+ proc = subprocess.run(
27
+ ["git", "ls-files", "--others", "-i", "--exclude-standard"],
28
+ check=True,
29
+ capture_output=True,
30
+ text=True,
31
+ )
32
+ return sorted(proc.stdout.strip().splitlines())
33
+
34
+
35
+ def diff_lists(packaged, ignored):
36
+ return sorted(set(packaged) & set(ignored))
37
+
38
+
39
+ def main():
40
+ build_package()
41
+
42
+ sdist_files = get_sdist_files()
43
+ wheel_files = get_wheel_files()
44
+ ignored_files = get_gitignored_files()
45
+
46
+ print("\n🚫 Files in *sdist* that are also .gitignored:")
47
+ for f in diff_lists(sdist_files, ignored_files):
48
+ print(" •", f)
49
+
50
+ print("\n🚫 Files in *wheel* that are also .gitignored:")
51
+ for f in diff_lists(wheel_files, ignored_files):
52
+ print(" •", f)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
@@ -1,42 +1,56 @@
1
1
  # Structured Data Extraction
2
2
 
3
- Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with Large Language Models (LLMs) via Pydantic schemas to achieve this.
3
+ Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
4
+
5
+ You need to install more than just the tiny baby default `natural_pdf` for this:
6
+ ```
7
+ # Install just the LLM portions
8
+ pip install "natural_pdf[llm]"
9
+
10
+ # Install eeeeeverything
11
+ pip install "natural_pdf[all]"
12
+ ```
4
13
 
5
14
  ## Introduction
6
15
 
7
16
  This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
8
17
 
18
+ > Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
19
+
9
20
  ## Basic Extraction
10
21
 
11
22
  1. **Define a Schema:** Create a Pydantic model for your desired data.
12
- 2. **Extract:** Use the `.extract()` method on a `PDF`, `Page`, or `Region` object.
13
- 3. **Access:** Use the `.extracted()` method to retrieve the results.
23
+ 2. **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
24
+ 3. **Access:** Use `.extracted()` to retrieve the results.
14
25
 
15
26
  ```python
16
27
  from natural_pdf import PDF
17
28
  from pydantic import BaseModel, Field
18
- from openai import OpenAI # Example client
29
+ from openai import OpenAI
19
30
 
20
- # Example: Initialize your LLM client
21
- client = OpenAI()
31
+ # Initialize your LLM client
32
+ # Anything OpenAI-compatible works!
33
+ client = OpenAI(
34
+ api_key="ANTHROPIC_API_KEY", # Your Anthropic API key
35
+ base_url="https://api.anthropic.com/v1/" # Anthropic's API endpoint
36
+ )
22
37
 
23
38
  # Load the PDF
24
39
  pdf = PDF("path/to/your/document.pdf")
25
40
  page = pdf.pages[0]
26
41
 
27
- # 1. Define your schema
42
+ # Define your schema
28
43
  class InvoiceInfo(BaseModel):
29
44
  invoice_number: str = Field(description="The main invoice identifier")
30
45
  total_amount: float = Field(description="The final amount due")
31
46
  company_name: Optional[str] = Field(None, description="The name of the issuing company")
32
47
 
33
- # 2. Extract data (using default analysis_key="default-structured")
48
+ # Extract data
34
49
  page.extract(schema=InvoiceInfo, client=client)
35
50
 
36
- # 3. Access the results
37
51
  # Access the full result object
38
52
  full_data = page.extracted()
39
- print(full_data)
53
+ print(full_data)
40
54
 
41
55
  # Access a single field
42
56
  inv_num = page.extracted('invoice_number')
@@ -51,16 +65,23 @@ print(f"Invoice Number: {inv_num}")
51
65
 
52
66
  ```python
53
67
  # Extract using a specific key
54
- page.extract(InvoiceInfo, client, analysis_key="invoice_header")
68
+ page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
55
69
 
56
70
  # Access using the specific key
57
71
  header_data = page.extracted(analysis_key="invoice_header")
58
72
  company = page.extracted('company_name', analysis_key="invoice_header")
59
73
  ```
60
74
 
61
- ## Applying to Regions and Collections
75
+ ## Text vs vision
76
+
77
+ When sending a page (or a region, etc.) to an LLM, you can choose either `using='text'` (default) or `using='vision'`.
78
+
79
+ - `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
80
+ - `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
81
+
82
+ ## Batch and bulk extraction
62
83
 
63
- The `.extract()` and `.extracted()` methods work identically on `Region` objects, allowing you to target specific areas of a page for structured data extraction.
84
+ If you have a lot of pages, a lot of PDFs, or a lot of anything, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF (regions, pages, collections of PDFs, etc.), allowing a lot of flexibility in what you analyze.
64
85
 
65
86
  ```python
66
87
  # Assuming 'header_region' is a Region object you defined
@@ -73,15 +94,16 @@ Furthermore, you can apply extraction to collections of elements (like `pdf.page
73
94
  ```python
74
95
  # Example: Extract InvoiceInfo from the first 5 pages
75
96
  results = pdf.pages[:5].apply(
76
- 'extract',
77
- schema=InvoiceInfo,
78
- client=client,
79
- analysis_key="page_invoice_info", # Use a specific key for batch results
80
- overwrite=True # Allow overwriting if run multiple times
97
+ lambda page: page.extract(
98
+ client=client,
99
+ schema=InvoiceInfo,
100
+ client=client,
101
+ analysis_key="page_invoice_info",
102
+ )
81
103
  )
82
104
 
83
105
  # Access results for the first page in the collection
84
- first_page_company = results[0].extracted('company_name', analysis_key="page_invoice_info")
106
+ pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
85
107
  ```
86
108
 
87
109
  This provides a powerful way to turn unstructured PDF content into structured, usable data.