natural-pdf 0.1.7__tar.gz → 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. natural_pdf-0.1.9/.cursor/rules/analysis_framework.mdc +58 -0
  2. natural_pdf-0.1.9/.cursor/rules/coding-style.mdc +5 -0
  3. natural_pdf-0.1.9/.cursor/rules/edit-md-instead-of-ipynb.mdc +5 -0
  4. natural_pdf-0.1.9/.cursor/rules/minimal-comments.mdc +5 -0
  5. natural_pdf-0.1.9/.cursor/rules/natural-pdf-overview.mdc +5 -0
  6. natural_pdf-0.1.9/.cursor/rules/user-friendly-library-code.mdc +5 -0
  7. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/.gitignore +1 -0
  8. natural_pdf-0.1.7/execute_notebooks.py → natural_pdf-0.1.9/01-execute_notebooks.py +2 -0
  9. natural_pdf-0.1.9/MANIFEST.in +48 -0
  10. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/PKG-INFO +17 -3
  11. natural_pdf-0.1.9/audit_packaging.py +56 -0
  12. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/check_run_md.sh +15 -1
  13. natural_pdf-0.1.9/docs/categorizing-documents/index.md +168 -0
  14. natural_pdf-0.1.9/docs/data-extraction/index.md +109 -0
  15. natural_pdf-0.1.9/docs/element-selection/index.ipynb +969 -0
  16. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/element-selection/index.md +20 -0
  17. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/index.md +19 -0
  18. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/ocr/index.md +63 -16
  19. natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
  20. natural_pdf-0.1.9/docs/tutorials/02-finding-elements.ipynb +374 -0
  21. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/02-finding-elements.md +3 -3
  22. natural_pdf-0.1.9/docs/tutorials/03-extracting-blocks.ipynb +152 -0
  23. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.ipynb +17 -12
  24. natural_pdf-0.1.9/docs/tutorials/05-excluding-content.ipynb +275 -0
  25. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.ipynb +36 -31
  26. natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +269 -0
  27. natural_pdf-0.1.9/docs/tutorials/07-working-with-regions.ipynb +414 -0
  28. natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +513 -0
  29. natural_pdf-0.1.9/docs/tutorials/09-section-extraction.ipynb +2439 -0
  30. natural_pdf-0.1.9/docs/tutorials/10-form-field-extraction.ipynb +503 -0
  31. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  32. natural_pdf-0.1.9/docs/tutorials/12-ocr-integration.ipynb +3712 -0
  33. natural_pdf-0.1.9/docs/tutorials/12-ocr-integration.md +137 -0
  34. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.ipynb +629 -546
  35. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/13-semantic-search.md +8 -7
  36. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/mkdocs.yml +3 -1
  37. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/__init__.py +3 -0
  38. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/base.py +1 -5
  39. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/gemini.py +61 -51
  40. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  41. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_manager.py +26 -84
  42. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/layout_options.py +7 -0
  43. natural_pdf-0.1.9/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  44. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/surya.py +46 -123
  45. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/tatr.py +51 -4
  46. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_structure.py +3 -5
  47. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/utils.py +3 -3
  48. natural_pdf-0.1.9/natural_pdf/classification/manager.py +422 -0
  49. natural_pdf-0.1.9/natural_pdf/classification/mixin.py +163 -0
  50. natural_pdf-0.1.9/natural_pdf/classification/results.py +80 -0
  51. natural_pdf-0.1.9/natural_pdf/collections/mixins.py +111 -0
  52. natural_pdf-0.1.9/natural_pdf/collections/pdf_collection.py +730 -0
  53. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/element_manager.py +83 -0
  54. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/highlighting_service.py +13 -22
  55. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/page.py +578 -93
  56. natural_pdf-0.1.9/natural_pdf/core/pdf.py +1539 -0
  57. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/base.py +134 -40
  58. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/collections.py +712 -109
  59. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/region.py +722 -69
  60. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/text.py +4 -1
  61. natural_pdf-0.1.9/natural_pdf/export/mixin.py +137 -0
  62. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/base.py +3 -3
  63. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/paddleocr.py +5 -4
  64. natural_pdf-0.1.9/natural_pdf/extraction/manager.py +135 -0
  65. natural_pdf-0.1.9/natural_pdf/extraction/mixin.py +279 -0
  66. natural_pdf-0.1.9/natural_pdf/extraction/result.py +23 -0
  67. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/__init__.py +5 -5
  68. natural_pdf-0.1.9/natural_pdf/ocr/engine_doctr.py +346 -0
  69. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_easyocr.py +6 -3
  70. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_factory.py +24 -4
  71. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/ocr_manager.py +122 -26
  72. natural_pdf-0.1.9/natural_pdf/ocr/ocr_options.py +198 -0
  73. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/utils.py +19 -6
  74. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/qa/document_qa.py +0 -4
  75. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/__init__.py +20 -34
  76. natural_pdf-0.1.9/natural_pdf/search/haystack_search_service.py +687 -0
  77. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/haystack_utils.py +99 -75
  78. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/search_service_protocol.py +11 -12
  79. natural_pdf-0.1.9/natural_pdf/selectors/parser.py +612 -0
  80. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  81. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/debug.py +3 -3
  82. natural_pdf-0.1.9/natural_pdf/utils/locks.py +8 -0
  83. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/packaging.py +8 -6
  84. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/text_extraction.py +60 -1
  85. natural_pdf-0.1.9/natural_pdf/utils/tqdm_utils.py +51 -0
  86. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/visualization.py +18 -0
  87. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/viewer.py +4 -25
  88. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/PKG-INFO +17 -3
  89. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/SOURCES.txt +23 -3
  90. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/requires.txt +19 -2
  91. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/top_level.txt +1 -3
  92. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pyproject.toml +35 -12
  93. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/exporters/test_paddleocr_exporter.py +4 -3
  94. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/test_optional_deps.py +43 -17
  95. natural_pdf-0.1.7/MANIFEST.in +0 -8
  96. natural_pdf-0.1.7/docs/element-selection/index.ipynb +0 -915
  97. natural_pdf-0.1.7/docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  98. natural_pdf-0.1.7/docs/tutorials/02-finding-elements.ipynb +0 -340
  99. natural_pdf-0.1.7/docs/tutorials/03-extracting-blocks.ipynb +0 -147
  100. natural_pdf-0.1.7/docs/tutorials/05-excluding-content.ipynb +0 -270
  101. natural_pdf-0.1.7/docs/tutorials/07-layout-analysis.ipynb +0 -288
  102. natural_pdf-0.1.7/docs/tutorials/07-working-with-regions.ipynb +0 -413
  103. natural_pdf-0.1.7/docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. natural_pdf-0.1.7/docs/tutorials/09-section-extraction.ipynb +0 -2434
  105. natural_pdf-0.1.7/docs/tutorials/10-form-field-extraction.ipynb +0 -512
  106. natural_pdf-0.1.7/docs/tutorials/12-ocr-integration.ipynb +0 -604
  107. natural_pdf-0.1.7/docs/tutorials/12-ocr-integration.md +0 -175
  108. natural_pdf-0.1.7/natural_pdf/collections/pdf_collection.py +0 -311
  109. natural_pdf-0.1.7/natural_pdf/core/pdf.py +0 -1087
  110. natural_pdf-0.1.7/natural_pdf/ocr/ocr_options.py +0 -115
  111. natural_pdf-0.1.7/natural_pdf/search/haystack_search_service.py +0 -643
  112. natural_pdf-0.1.7/natural_pdf/selectors/parser.py +0 -411
  113. natural_pdf-0.1.7/notebooks/Examples.ipynb +0 -1293
  114. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/.github/workflows/docs.yml +0 -0
  115. natural_pdf-0.1.7/run_all_tutorials.sh → natural_pdf-0.1.9/02-run_all_tutorials.sh +0 -0
  116. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/CLAUDE.md +0 -0
  117. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/LICENSE +0 -0
  118. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/README.md +0 -0
  119. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/api/index.md +0 -0
  120. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/favicon.png +0 -0
  121. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/favicon.svg +0 -0
  122. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/javascripts/custom.js +0 -0
  123. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/logo.svg +0 -0
  124. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/sample-screen.png +0 -0
  125. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/social-preview.png +0 -0
  126. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/social-preview.svg +0 -0
  127. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/assets/stylesheets/custom.css +0 -0
  128. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/document-qa/index.ipynb +0 -0
  129. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/document-qa/index.md +0 -0
  130. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/finetuning/index.md +0 -0
  131. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/installation/index.md +0 -0
  132. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/interactive-widget/index.ipynb +0 -0
  133. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/interactive-widget/index.md +0 -0
  134. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/layout-analysis/index.ipynb +0 -0
  135. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/layout-analysis/index.md +0 -0
  136. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/pdf-navigation/index.ipynb +0 -0
  137. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/pdf-navigation/index.md +0 -0
  138. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/regions/index.ipynb +0 -0
  139. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/regions/index.md +0 -0
  140. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tables/index.ipynb +0 -0
  141. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tables/index.md +0 -0
  142. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-analysis/index.ipynb +0 -0
  143. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-analysis/index.md +0 -0
  144. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-extraction/index.ipynb +0 -0
  145. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/text-extraction/index.md +0 -0
  146. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/01-loading-and-extraction.md +0 -0
  147. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/03-extracting-blocks.md +0 -0
  148. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/04-table-extraction.md +0 -0
  149. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/05-excluding-content.md +0 -0
  150. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/06-document-qa.md +0 -0
  151. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/07-layout-analysis.md +0 -0
  152. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/07-working-with-regions.md +0 -0
  153. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/08-spatial-navigation.md +0 -0
  154. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/09-section-extraction.md +0 -0
  155. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/10-form-field-extraction.md +0 -0
  156. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  157. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/index.ipynb +0 -0
  158. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/index.md +0 -0
  159. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/docs/visual-debugging/region.png +0 -0
  160. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/__init__.py +0 -0
  161. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/__init__.py +0 -0
  162. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/docling.py +0 -0
  163. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/paddle.py +0 -0
  164. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/layout/yolo.py +0 -0
  165. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/analyzers/text_options.py +0 -0
  166. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/core/__init__.py +0 -0
  167. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/__init__.py +0 -0
  168. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/line.py +0 -0
  169. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/elements/rect.py +0 -0
  170. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/__init__.py +0 -0
  171. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/exporters/searchable_pdf.py +0 -0
  172. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine.py +0 -0
  173. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_paddle.py +0 -0
  174. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/ocr/engine_surya.py +0 -0
  175. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/qa/__init__.py +0 -0
  176. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/search_options.py +0 -0
  177. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/search/searchable_mixin.py +0 -0
  178. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/selectors/__init__.py +0 -0
  179. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/__init__.py +0 -0
  180. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/css/style.css +0 -0
  181. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/index.html +0 -0
  182. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/js/app.js +0 -0
  183. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/templates/spa/words.txt +0 -0
  184. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/__init__.py +0 -0
  185. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/highlighting.py +0 -0
  186. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/identifiers.py +1 -1
  187. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/utils/reading_order.py +0 -0
  188. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/__init__.py +0 -0
  189. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf/widgets/frontend/viewer.js +0 -0
  190. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/natural_pdf.egg-info/dependency_links.txt +0 -0
  191. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/noxfile.py +0 -0
  192. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/.gitkeep +0 -0
  193. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/01-practice.pdf +0 -0
  194. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/0500000US42001.pdf +0 -0
  195. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/0500000US42007.pdf +0 -0
  196. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/2014 Statistics.pdf +0 -0
  197. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/2019 Statistics.pdf +0 -0
  198. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  199. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/pdfs/needs-ocr.pdf +0 -0
  200. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/publish.sh +0 -0
  201. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/sample-screen.png +0 -0
  202. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/setup.cfg +0 -0
  203. {natural_pdf-0.1.7 → natural_pdf-0.1.9}/tests/test_loading.py +0 -0
@@ -0,0 +1,58 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
6
+ \
7
+ # Analysis Framework Guide
8
+
9
+ This document outlines the agreed-upon structure for adding and managing machine learning analysis results (like classification, NER, summarization) on `Page` and `Region` objects within the `natural-pdf` library.
10
+
11
+ ## Summary of Framework (Implemented for Classification)
12
+
13
+ 1. **Central Registry (`element.analyses`)**:
14
+ * A dictionary attribute named `analyses` exists on [natural_pdf/core/page.py](mdc:natural_pdf/core/page.py) (`Page`) and [natural_pdf/elements/region.py](mdc:natural_pdf/elements/region.py) (`Region`) objects.
15
+ * It stores results from different analysis types, keyed by an `analysis_key` string.
16
+ * Example: `page.analyses = {'classification': <ClassificationResult...>, 'ner_run_1': <NERResult...>}`
17
+
18
+ 2. **Structured Result Objects**:
19
+ * Each analysis type should have a dedicated `Result` class (e.g., [natural_pdf/classification/results.py](mdc:natural_pdf/classification/results.py) contains `ClassificationResult`, `CategoryScore`).
20
+ * These objects store structured findings and metadata (model used, parameters, timestamp, `using` mode).
21
+ * Stored as values in the `analyses` dictionary.
22
+
23
+ 3. **Manager Registry (`PDF.get_manager`)**:
24
+ * The [natural_pdf/core/pdf.py](mdc:natural_pdf/core/pdf.py) `PDF` class has a `get_manager(manager_type)` method.
25
+ * This handles lazy initialization and retrieval of specific analysis managers (e.g., `ClassificationManager` from [natural_pdf/classification/manager.py](mdc:natural_pdf/classification/manager.py)).
26
+ * Managers encapsulate ML model interaction and result processing.
27
+
28
+ 4. **Invocation Methods (`element.classify`, etc.)**:
29
+ * Methods are added to `Page`/`Region` (often via Mixins like [natural_pdf/classification/mixin.py](mdc:natural_pdf/classification/mixin.py)).
30
+ * They accept an optional `analysis_key: str` parameter.
31
+ * **Default:** If omitted, uses a standard key (e.g., `'classification'`) and *overwrites* previous results under that key.
32
+ * **Custom:** If provided, stores the result under the custom key, allowing multiple results of the same type to coexist.
33
+ * They use the `PDF.get_manager` to get the appropriate manager, call it, and store the returned `Result` object in `element.analyses[analysis_key]`.
34
+
35
+ 5. **Parameter Renaming**:
36
+ * The parameter specifying text vs. vision analysis has been standardized to `using=` (e.g., `using='text'`, `using='vision'`).
37
+
38
+ 6. **Convenience Accessors**:
39
+ * Simple properties/methods (e.g., `element.category`, `element.category_confidence`) provide easy access to results.
40
+ * These *always* read from the **default** key in the `analyses` registry (e.g., `analyses['classification']`).
41
+
42
+ ## TODO List for New Analysis Features
43
+
44
+ * [ ] **NER**: Create `NERManager`, `NERResult`, `Entity`, `element.apply_ner()`, `element.entities` property. Implement optional `source_elements` mapping.
45
+ * [ ] **Summarization**: Create `SummarizationManager`, `SummarizationResult`, `element.summarize()`, `element.summary` property.
46
+ * [ ] **Translation**: Create `TranslationManager`, `TranslationResult`, `element.translate()`, `element.translated_text()` method.
47
+ * [ ] **Structured Data Extraction**: Create `StructuredDataManager`, `StructuredDataResult`, `element.extract_structured_data()`.
48
+ * [ ] **Ad-hoc Analysis**: Implement `element.run_custom_analysis()` or similar.
49
+ * [ ] **Documentation**: Update user docs for the framework.
50
+
51
+ ## Coding Conventions for New Analyses
52
+
53
+ 1. **Manager**: New `Manager` class in `natural_pdf/<task>/manager.py`. Handles ML logic.
54
+ 2. **Registration**: Update `PDF.get_manager` to initialize the new manager. Check `is_available()`.
55
+ 3. **Result Object**: New `Result` class(es) in `natural_pdf/<task>/results.py`. Stores params and findings.
56
+ 4. **Element Method**: Add a method to `Page`/`Region` (typically via a Mixin). It must take `analysis_key` (defaulting to the standard task name), call the manager, and store the result in `analyses[analysis_key]`.
57
+ 5. **Accessor**: Add convenience property/method accessing `analyses[DEFAULT_KEY]`.
58
+ 6. **Dependencies**: Guard optional imports with `try...except ImportError` and declare the packages as extras in `pyproject.toml`.
@@ -0,0 +1,5 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
@@ -0,0 +1,5 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
@@ -0,0 +1,5 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
@@ -0,0 +1,5 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
@@ -0,0 +1,5 @@
1
+ ---
2
+ description:
3
+ globs:
4
+ alwaysApply: false
5
+ ---
@@ -13,6 +13,7 @@ docs/tutorials/needs-ocr-searchable.pdf
13
13
  sample.py
14
14
  sample2.py
15
15
  requirements.lock
16
+ hidden
16
17
  pdfs/hidden
17
18
  *.hocr
18
19
  my_paddleocr_finetune_data/
@@ -28,6 +28,8 @@ EXCLUDE_PATTERNS = [
28
28
  "explanations",
29
29
  "api/index.md",
30
30
  "finetuning/index.md",
31
+ "categorizing-documents/index.md",
32
+ "data-extraction/index.md",
31
33
  ]
32
34
  MAX_WORKERS = os.cpu_count()
33
35
 
@@ -0,0 +1,48 @@
1
+ include README.md
2
+ include LICENSE
3
+
4
+ # HTML templates
5
+ recursive-include natural_pdf/templates *.html
6
+
7
+ # Documentation assets
8
+ recursive-include docs *.md *.png *.jpg *.gif
9
+
10
+ # Remove common build garbage
11
+ global-exclude __pycache__ *.py[cod] *.so .DS_Store
12
+ global-exclude *hidden*
13
+
14
+ # 💣 Critical: prevent recursion bugs
15
+ prune build
16
+ prune dist
17
+ prune .nox
18
+ prune .venv
19
+ prune env
20
+ prune venv
21
+
22
+ # General junk
23
+ exclude .notebook_cache.json
24
+ exclude Untitled.ipynb
25
+ exclude conversation.md
26
+ exclude transcript.md
27
+ exclude sample.py
28
+ exclude sample2.py
29
+ exclude requirements.lock
30
+ exclude install.sh
31
+
32
+ # Directories to exclude
33
+ prune .venv
34
+ prune output
35
+ prune results
36
+ prune natural_pdf_index
37
+ prune hidden
38
+ prune pdfs/hidden
39
+ prune my_paddleocr_finetune_data
40
+ prune notebooks
41
+ prune docs/tutorials/pdfs
42
+
43
+ # Individual files in nested directories
44
+ exclude docs/tutorials/needs-ocr-searchable.pdf
45
+ exclude notebooks/Examples.md
46
+
47
+ # File patterns
48
+ global-exclude *.hocr
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.7
3
+ Version: 0.1.9
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -17,11 +17,13 @@ Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
19
  Requires-Dist: tqdm
20
+ Requires-Dist: pydantic
20
21
  Provides-Extra: interactive
21
22
  Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
22
23
  Provides-Extra: haystack
23
24
  Requires-Dist: haystack-ai; extra == "haystack"
24
- Requires-Dist: chroma-haystack; extra == "haystack"
25
+ Requires-Dist: lancedb-haystack; extra == "haystack"
26
+ Requires-Dist: lancedb; extra == "haystack"
25
27
  Requires-Dist: sentence-transformers; extra == "haystack"
26
28
  Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
27
29
  Provides-Extra: easyocr
@@ -36,6 +38,9 @@ Requires-Dist: natural-pdf[core-ml]; extra == "layout-yolo"
36
38
  Provides-Extra: surya
37
39
  Requires-Dist: surya-ocr; extra == "surya"
38
40
  Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
+ Provides-Extra: doctr
42
+ Requires-Dist: python-doctr[torch]; extra == "doctr"
43
+ Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
39
44
  Provides-Extra: qa
40
45
  Requires-Dist: natural-pdf[core-ml]; extra == "qa"
41
46
  Provides-Extra: docling
@@ -43,7 +48,10 @@ Requires-Dist: docling; extra == "docling"
43
48
  Requires-Dist: natural-pdf[core-ml]; extra == "docling"
44
49
  Provides-Extra: llm
45
50
  Requires-Dist: openai>=1.0; extra == "llm"
46
- Requires-Dist: pydantic; extra == "llm"
51
+ Provides-Extra: classification
52
+ Requires-Dist: sentence-transformers; extra == "classification"
53
+ Requires-Dist: timm; extra == "classification"
54
+ Requires-Dist: natural-pdf[core-ml]; extra == "classification"
47
55
  Provides-Extra: test
48
56
  Requires-Dist: pytest; extra == "test"
49
57
  Provides-Extra: dev
@@ -59,6 +67,9 @@ Requires-Dist: pipdeptree; extra == "dev"
59
67
  Requires-Dist: nbformat; extra == "dev"
60
68
  Requires-Dist: jupytext; extra == "dev"
61
69
  Requires-Dist: nbclient; extra == "dev"
70
+ Provides-Extra: deskew
71
+ Requires-Dist: deskew>=1.5; extra == "deskew"
72
+ Requires-Dist: img2pdf; extra == "deskew"
62
73
  Provides-Extra: all
63
74
  Requires-Dist: natural-pdf[interactive]; extra == "all"
64
75
  Requires-Dist: natural-pdf[haystack]; extra == "all"
@@ -66,10 +77,13 @@ Requires-Dist: natural-pdf[easyocr]; extra == "all"
66
77
  Requires-Dist: natural-pdf[paddle]; extra == "all"
67
78
  Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
68
79
  Requires-Dist: natural-pdf[surya]; extra == "all"
80
+ Requires-Dist: natural-pdf[doctr]; extra == "all"
69
81
  Requires-Dist: natural-pdf[qa]; extra == "all"
70
82
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
71
83
  Requires-Dist: natural-pdf[docling]; extra == "all"
72
84
  Requires-Dist: natural-pdf[llm]; extra == "all"
85
+ Requires-Dist: natural-pdf[classification]; extra == "all"
86
+ Requires-Dist: natural-pdf[deskew]; extra == "all"
73
87
  Requires-Dist: natural-pdf[test]; extra == "all"
74
88
  Provides-Extra: core-ml
75
89
  Requires-Dist: torch; extra == "core-ml"
@@ -0,0 +1,56 @@
1
+ import subprocess
2
+ import tarfile
3
+ import zipfile
4
+ from pathlib import Path
5
+
6
+ DIST_DIR = Path("dist")
7
+
8
+
9
+ def build_package():
10
+ subprocess.run(["python", "-m", "build", "--sdist", "--wheel"], check=True)
11
+
12
+
13
+ def get_sdist_files():
14
+ sdist_path = next(DIST_DIR.glob("*.tar.gz"))
15
+ with tarfile.open(sdist_path, "r:gz") as tar:
16
+ return sorted(str(Path(m.name)) for m in tar.getmembers() if m.isfile())
17
+
18
+
19
+ def get_wheel_files():
20
+ wheel_path = next(DIST_DIR.glob("*.whl"))
21
+ with zipfile.ZipFile(wheel_path, "r") as zipf:
22
+ return sorted(str(f) for f in zipf.namelist() if not f.endswith("/"))
23
+
24
+
25
+ def get_gitignored_files():
26
+ proc = subprocess.run(
27
+ ["git", "ls-files", "--others", "-i", "--exclude-standard"],
28
+ check=True,
29
+ capture_output=True,
30
+ text=True,
31
+ )
32
+ return sorted(proc.stdout.strip().splitlines())
33
+
34
+
35
+ def diff_lists(packaged, ignored):
36
+ return sorted(set(packaged) & set(ignored))
37
+
38
+
39
+ def main():
40
+ build_package()
41
+
42
+ sdist_files = get_sdist_files()
43
+ wheel_files = get_wheel_files()
44
+ ignored_files = get_gitignored_files()
45
+
46
+ print("\n🚫 Files in *sdist* that are also .gitignored:")
47
+ for f in diff_lists(sdist_files, ignored_files):
48
+ print(" •", f)
49
+
50
+ print("\n🚫 Files in *wheel* that are also .gitignored:")
51
+ for f in diff_lists(wheel_files, ignored_files):
52
+ print(" •", f)
53
+
54
+
55
+ if __name__ == "__main__":
56
+ main()
@@ -9,12 +9,26 @@ fi
9
9
 
10
10
  MARKDOWN_FILE=$1
11
11
  NOTEBOOK_FILE="${MARKDOWN_FILE%.md}.ipynb"
12
+ KERNEL_NAME="natural-pdf"
12
13
 
13
14
  echo "Converting $MARKDOWN_FILE to notebook..."
14
15
  # Jupytext will now automatically add tags based on markdown metadata
15
16
  jupytext --to ipynb "$MARKDOWN_FILE" || { echo "Conversion failed"; exit 1; }
16
17
 
18
+ echo "Patching notebook $NOTEBOOK_FILE with kernel $KERNEL_NAME..."
19
+ python3 - <<EOF
20
+ import nbformat
21
+ nb = nbformat.read("$NOTEBOOK_FILE", as_version=4)
22
+ nb.metadata["kernelspec"] = {
23
+ "name": "$KERNEL_NAME",
24
+ "display_name": "Python ($KERNEL_NAME)",
25
+ "language": "python"
26
+ }
27
+ nbformat.write(nb, "$NOTEBOOK_FILE")
28
+ EOF
29
+
30
+
17
31
  echo "Executing notebook $NOTEBOOK_FILE..."
18
- jupyter execute "$NOTEBOOK_FILE" --inplace || { echo "Execution failed"; exit 1; }
32
+ jupyter execute "$NOTEBOOK_FILE" --inplace --ExecutePreprocessor.kernel_name=natural-pdf || { echo "Execution failed"; exit 1; }
19
33
 
20
34
  echo "Success! Notebook executed and results saved to $NOTEBOOK_FILE"
@@ -0,0 +1,168 @@
1
+ # Categorizing Pages and Regions
2
+
3
+ Natural PDF allows you to automatically categorize pages or specific regions within a page using machine learning models. This is incredibly useful for filtering large collections of documents or understanding the structure and content of individual PDFs.
4
+
5
+ ## Installation
6
+
7
+ To use the classification features, you need to install the optional dependencies:
8
+
9
+ ```bash
10
+ pip install "natural-pdf[classification]"
11
+ ```
12
+
13
+ This installs necessary libraries like `torch`, `transformers`, and others.
14
+
15
+ ## Core Concept: The `.classify()` Method
16
+
17
+ The primary way to perform categorization is using the `.classify()` method available on `Page` and `Region` objects.
18
+
19
+ ```python
20
+ from natural_pdf import PDF
21
+
22
+ # Example: Classify a Page
23
+ pdf = PDF("pdfs/01-practice.pdf")
24
+ page = pdf.pages[0]
25
+ categories = ["invoice", "letter", "report cover", "data table"]
26
+ results = page.classify(categories=categories, model="text")
27
+
28
+ # Access the top result
29
+ print(f"Top Category: {page.category}")
30
+ print(f"Confidence: {page.category_confidence:.3f}")
31
+
32
+ # Access all results
33
+ # print(page.classification_results)
34
+ ```
35
+
36
+ **Key Arguments:**
37
+
38
+ * `categories` (required): A list of strings representing the potential categories you want to classify the item into.
39
+ * `model` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
40
+ * `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
41
+ * `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
42
+ * Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether the model is text- or vision-based, but if that inference fails you may need to set `using` explicitly.
43
+ * `using` (optional): Explicitly set to `"text"` or `"vision"` if the automatic inference based on the `model` ID fails or is ambiguous.
44
+ * `min_confidence` (optional): A float between 0.0 and 1.0. Only categories with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
45
+
46
+ ## Text vs. Vision Classification
47
+
48
+ Choosing the right model type depends on your goal:
49
+
50
+ ### Text Classification (`model="text"`)
51
+
52
+ * **How it works:** Extracts the text from the page or region and analyzes the language content.
53
+ * **Best for:**
54
+ * **Topic Identification:** Determining what a page or section is *about* (e.g., "budget discussion," "environmental impact," "legal terms").
55
+ * **Content-Driven Document Types:** Identifying document types primarily defined by their text (e.g., emails, meeting minutes, news articles, reports).
56
+ * **Data Journalism Example:** You have thousands of pages of government reports. You can use text classification to find all pages discussing "public health funding" or classify paragraphs within environmental impact statements to find mentions of specific endangered species.
57
+
58
+ ```python
59
+ # Find pages related to finance
60
+ financial_categories = ["budget", "revenue", "expenditure", "forecast"]
61
+ pdf.classify_pages(categories=financial_categories, model="text")
62
+ budget_pages = [p for p in pdf.pages if p.category == "budget"]
63
+ ```
64
+
65
+ ### Vision Classification (`model="vision"`)
66
+
67
+ * **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
68
+ * **Best for:**
69
+ * **Layout-Driven Document Types:** Identifying documents recognizable by their structure (e.g., invoices, receipts, forms, presentation slides, title pages).
70
+ * **Identifying Visual Elements:** Distinguishing between pages dominated by text, tables, charts, or images.
71
+ * **Data Journalism Example:** You have a scanned archive of campaign finance filings containing various document types. You can use vision classification to quickly isolate all the pages that look like donation receipts or expenditure forms, even if the OCR quality is poor.
72
+
73
+ ```python
74
+ # Find pages that look like invoices or receipts
75
+ visual_categories = ["invoice", "receipt", "letter", "form"]
76
+ page.classify(categories=visual_categories, model="vision")
77
+ if page.category in ["invoice", "receipt"]:
78
+ print(f"Page {page.number} looks like an invoice or receipt.")
79
+ ```
80
+
81
+ ## Classifying Specific Objects
82
+
83
+ ### Pages (`page.classify(...)`)
84
+
85
+ Classifying a whole page is useful for sorting documents or identifying the overall purpose of a page within a larger document.
86
+
87
+ ```python
88
+ # Classify the first page
89
+ page = pdf.pages[0]
90
+ page_types = ["cover page", "table of contents", "chapter start", "appendix"]
91
+ page.classify(categories=page_types, model="vision") # Vision often good for page structure
92
+ print(f"Page 1 Type: {page.category}")
93
+ ```
94
+
95
+ ### Regions (`region.classify(...)`)
96
+
97
+ Classifying a specific region allows for more granular analysis within a page. You might first detect regions using Layout Analysis and then classify those regions.
98
+
99
+ ```python
100
+ # Assume layout analysis has run, find paragraphs
101
+ paragraphs = page.find_all("region[type=paragraph]")
102
+ if paragraphs:
103
+ # Classify the topic of the first paragraph
104
+ topic_categories = ["introduction", "methodology", "results", "conclusion"]
105
+ # Use text model for topic
106
+ paragraphs[0].classify(categories=topic_categories, model="text")
107
+ print(f"First paragraph category: {paragraphs[0].category}")
108
+ ```
109
+
110
+ ## Accessing Classification Results
111
+
112
+ After running `.classify()`, you can access the results:
113
+
114
+ * `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
115
+ * `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` otherwise.
116
+ * `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, categories provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
117
+
118
+ ```python
119
+ results = page.classify(categories=["invoice", "letter"], model="text", min_confidence=0.5)
120
+
121
+ if page.category == "invoice":
122
+ print(f"Found an invoice with confidence {page.category_confidence:.2f}")
123
+
124
+ # See all results above the threshold
125
+ # print(page.classification_results['scores'])
126
+ ```
127
+
128
+ ## Classifying Collections
129
+
130
+ For batch processing, use the `.classify_all()` method on `PDFCollection` or `ElementCollection` objects. This displays a progress bar tracking individual items (pages or elements).
131
+
132
+ ### PDFCollection (`collection.classify_all(...)`)
133
+
134
+ Classifies pages across all PDFs in the collection. Use `max_workers` for parallel processing across different PDF files.
135
+
136
+ ```python
137
+ import natural_pdf
+
+ collection = natural_pdf.PDFCollection.from_directory("./documents/")
138
+ categories = ["form", "datasheet", "image", "text document"]
139
+
140
+ # Classify all pages using vision model, processing 4 PDFs concurrently
141
+ collection.classify_all(categories=categories, model="vision", max_workers=4)
142
+
143
+ # Filter PDFs containing forms
144
+ form_pdfs = []
145
+ for pdf in collection:
146
+ if any(p.category == "form" for p in pdf.pages if p.category):
147
+ form_pdfs.append(pdf.path)
148
+ pdf.close() # Remember to close PDFs
149
+
150
+ print(f"Found forms in: {form_pdfs}")
151
+ ```
152
+
153
+ ### ElementCollection (`element_collection.classify_all(...)`)
154
+
155
+ Classifies all classifiable elements (currently `Page` and `Region`) within the collection.
156
+
157
+ ```python
158
+ # Assume 'pdf' is loaded and 'layout_regions' is an ElementCollection of Regions
159
+ layout_regions = pdf.find_all("region")
160
+ region_types = ["paragraph", "list", "table", "figure", "caption"]
161
+
162
+ # Classify all detected regions based on vision
163
+ layout_regions.classify_all(categories=region_types, model="vision")
164
+
165
+ # Count table regions
166
+ table_count = sum(1 for r in layout_regions if r.category == "table")
167
+ print(f"Found {table_count} regions classified as tables.")
168
+ ```
@@ -0,0 +1,109 @@
1
+ # Structured Data Extraction
2
+
3
+ Extracting specific, structured information (like invoice numbers, dates, or addresses) from documents often requires more than simple text extraction. Natural PDF integrates with LLMs to pull out [structured data](https://platform.openai.com/docs/guides/structured-outputs).
4
+
5
+ You need to install more than just the tiny baby default `natural_pdf` for this:
6
+ ```bash
7
+ # Install just the LLM portions
8
+ pip install "natural_pdf[llm]"
9
+
10
+ # Install eeeeeverything
11
+ pip install "natural_pdf[all]"
12
+ ```
13
+
14
+ ## Introduction
15
+
16
+ This feature allows you to define the exact data structure you want using a Pydantic model and then instruct an LLM to populate that structure based on the content of a PDF element (like a `Page` or `Region`).
17
+
18
+ > Not sure how to write a Pydantic schema? Just ask an LLM! "Write me a Pydantic schema to pull out an invoice number (an integer), a company name (string) and a date (string)." It'll go fine.
19
+
20
+ ## Basic Extraction
21
+
22
+ 1. **Define a Schema:** Create a Pydantic model for your desired data.
23
+ 2. **Extract:** Use `.extract()` on a `PDF`, `Page`, or `Region` object.
24
+ 3. **Access:** Use `.extracted()` to retrieve the results.
25
+
26
+ ```python
27
+ from typing import Optional
+
+ from natural_pdf import PDF
28
+ from pydantic import BaseModel, Field
29
+ from openai import OpenAI
30
+
31
+ # Initialize your LLM client
32
+ # Anything OpenAI-compatible works!
33
+ client = OpenAI(
34
+ api_key="ANTHROPIC_API_KEY", # Your Anthropic API key
35
+ base_url="https://api.anthropic.com/v1/" # Anthropic's API endpoint
36
+ )
37
+
38
+ # Load the PDF
39
+ pdf = PDF("path/to/your/document.pdf")
40
+ page = pdf.pages[0]
41
+
42
+ # Define your schema
43
+ class InvoiceInfo(BaseModel):
44
+ invoice_number: str = Field(description="The main invoice identifier")
45
+ total_amount: float = Field(description="The final amount due")
46
+ company_name: Optional[str] = Field(None, description="The name of the issuing company")
47
+
48
+ # Extract data
49
+ page.extract(schema=InvoiceInfo, client=client)
50
+
51
+ # Access the full result object
52
+ full_data = page.extracted()
53
+ print(full_data)
54
+
55
+ # Access a single field
56
+ inv_num = page.extracted('invoice_number')
57
+ print(f"Invoice Number: {inv_num}")
58
+ ```
59
+
60
+ ## Keys and Overwriting
61
+
62
+ - By default, results are stored under the key `"default-structured"` in the element's `.analyses` dictionary.
63
+ - Use the `analysis_key` parameter in `.extract()` to store results under a different name (e.g., `analysis_key="customer_details"`).
64
+ - Attempting to extract using an existing `analysis_key` will raise an error unless `overwrite=True` is specified.
65
+
66
+ ```python
67
+ # Extract using a specific key
68
+ page.extract(InvoiceInfo, client=client, analysis_key="invoice_header")
69
+
70
+ # Access using the specific key
71
+ header_data = page.extracted(analysis_key="invoice_header")
72
+ company = page.extracted('company_name', analysis_key="invoice_header")
73
+ ```
74
+
75
+ ## Text vs vision
76
+
77
+ When sending a page (or a region, etc.) to an LLM, you can choose either `using='text'` (default) or `using='vision'`.
78
+
79
+ - `text` sends the text, somewhat respecting layout using `.extract_text(layout=True)`
80
+ - `vision` sends an image of the page with `.to_image(resolution=72)` (no highlights or labels)
81
+
82
+ ## Batch and bulk extraction
83
+
84
+ If you have a lot of pages, a lot of PDFs, or a lot of anything, the `.extract()` and `.extracted()` methods work identically on most parts of a PDF (regions, pages, collections of PDFs, etc.), giving you a lot of flexibility in what you analyze.
85
+
86
+ ```python
87
+ # Assuming 'header_region' is a Region object you defined
88
+ header_region.extract(InvoiceInfo, client)
89
+ company = header_region.extracted('company_name')
90
+ ```
91
+
92
+ Furthermore, you can apply extraction to collections of elements (like `pdf.pages`, or the result of `pdf.find_all(...)`) using the `.apply()` method. This iterates through the collection and calls `.extract()` on each item.
93
+
94
+ ```python
95
+ # Example: Extract InvoiceInfo from the first 5 pages
96
+ results = pdf.pages[:5].apply(
97
+ lambda page: page.extract(
98
+ client=client,
99
+ schema=InvoiceInfo,
101
+ analysis_key="page_invoice_info",
102
+ )
103
+ )
104
+
105
+ # Access results for the first page in the collection
106
+ pdf.pages[0].extracted('company_name', analysis_key="page_invoice_info")
107
+ ```
108
+
109
+ This provides a powerful way to turn unstructured PDF content into structured, usable data.