natural-pdf 0.1.9__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190) hide show
  1. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/PKG-INFO +1 -1
  2. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/categorizing-documents/index.md +20 -23
  3. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/index.md +4 -4
  4. natural_pdf-0.1.10/docs/tutorials/01-loading-and-extraction.ipynb +1628 -0
  5. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/02-finding-elements.ipynb +46 -46
  6. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/03-extracting-blocks.ipynb +17 -17
  7. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.ipynb +12 -12
  8. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/05-excluding-content.ipynb +30 -30
  9. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.ipynb +28 -28
  10. natural_pdf-0.1.10/docs/tutorials/07-layout-analysis.ipynb +269 -0
  11. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-working-with-regions.ipynb +48 -48
  12. natural_pdf-0.1.10/docs/tutorials/08-spatial-navigation.ipynb +513 -0
  13. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/09-section-extraction.ipynb +111 -111
  14. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/10-form-field-extraction.ipynb +52 -52
  15. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.ipynb +6 -6
  16. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.ipynb +998 -998
  17. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.ipynb +331 -331
  18. natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.ipynb +2365 -0
  19. natural_pdf-0.1.10/docs/tutorials/14-categorizing-documents.md +99 -0
  20. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/mkdocs.yml +1 -0
  21. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/manager.py +26 -22
  22. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/mixin.py +7 -7
  23. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/classification/results.py +17 -9
  24. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/collections/mixins.py +17 -0
  25. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/collections/pdf_collection.py +78 -46
  26. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/pdf.py +62 -6
  27. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/collections.py +107 -3
  28. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/PKG-INFO +1 -1
  29. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/SOURCES.txt +3 -0
  30. natural_pdf-0.1.10/pdfs/cia-doc.pdf +0 -0
  31. natural_pdf-0.1.9/docs/tutorials/01-loading-and-extraction.ipynb +0 -1628
  32. natural_pdf-0.1.9/docs/tutorials/07-layout-analysis.ipynb +0 -269
  33. natural_pdf-0.1.9/docs/tutorials/08-spatial-navigation.ipynb +0 -513
  34. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/analysis_framework.mdc +0 -0
  35. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/coding-style.mdc +0 -0
  36. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/edit-md-instead-of-ipynb.mdc +0 -0
  37. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/minimal-comments.mdc +0 -0
  38. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/natural-pdf-overview.mdc +0 -0
  39. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.cursor/rules/user-friendly-library-code.mdc +0 -0
  40. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.github/workflows/docs.yml +0 -0
  41. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/.gitignore +0 -0
  42. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/01-execute_notebooks.py +0 -0
  43. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/02-run_all_tutorials.sh +0 -0
  44. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/CLAUDE.md +0 -0
  45. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/LICENSE +0 -0
  46. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/MANIFEST.in +0 -0
  47. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/README.md +0 -0
  48. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/audit_packaging.py +0 -0
  49. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/check_run_md.sh +0 -0
  50. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/api/index.md +0 -0
  51. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/favicon.png +0 -0
  52. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/favicon.svg +0 -0
  53. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/javascripts/custom.js +0 -0
  54. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/logo.svg +0 -0
  55. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/sample-screen.png +0 -0
  56. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/social-preview.png +0 -0
  57. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/social-preview.svg +0 -0
  58. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/assets/stylesheets/custom.css +0 -0
  59. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/data-extraction/index.md +0 -0
  60. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/document-qa/index.ipynb +0 -0
  61. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/document-qa/index.md +0 -0
  62. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/element-selection/index.ipynb +0 -0
  63. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/element-selection/index.md +0 -0
  64. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/finetuning/index.md +0 -0
  65. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/installation/index.md +0 -0
  66. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/interactive-widget/index.ipynb +0 -0
  67. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/interactive-widget/index.md +0 -0
  68. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/layout-analysis/index.ipynb +0 -0
  69. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/layout-analysis/index.md +0 -0
  70. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/ocr/index.md +0 -0
  71. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/pdf-navigation/index.ipynb +0 -0
  72. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/pdf-navigation/index.md +0 -0
  73. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/regions/index.ipynb +0 -0
  74. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/regions/index.md +0 -0
  75. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tables/index.ipynb +0 -0
  76. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tables/index.md +0 -0
  77. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-analysis/index.ipynb +0 -0
  78. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-analysis/index.md +0 -0
  79. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-extraction/index.ipynb +0 -0
  80. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/text-extraction/index.md +0 -0
  81. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/01-loading-and-extraction.md +0 -0
  82. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/02-finding-elements.md +0 -0
  83. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/03-extracting-blocks.md +0 -0
  84. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/04-table-extraction.md +0 -0
  85. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/05-excluding-content.md +0 -0
  86. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/06-document-qa.md +0 -0
  87. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-layout-analysis.md +0 -0
  88. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/07-working-with-regions.md +0 -0
  89. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/08-spatial-navigation.md +0 -0
  90. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/09-section-extraction.md +0 -0
  91. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/10-form-field-extraction.md +0 -0
  92. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/11-enhanced-table-processing.md +0 -0
  93. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/12-ocr-integration.md +0 -0
  94. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/tutorials/13-semantic-search.md +0 -0
  95. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/index.ipynb +0 -0
  96. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/index.md +0 -0
  97. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/docs/visual-debugging/region.png +0 -0
  98. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/__init__.py +0 -0
  99. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/__init__.py +0 -0
  100. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/__init__.py +0 -0
  101. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/base.py +0 -0
  102. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/docling.py +0 -0
  103. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/gemini.py +0 -0
  104. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_analyzer.py +0 -0
  105. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_manager.py +0 -0
  106. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/layout_options.py +0 -0
  107. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/paddle.py +0 -0
  108. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/pdfplumber_table_finder.py +0 -0
  109. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/surya.py +0 -0
  110. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/tatr.py +0 -0
  111. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/layout/yolo.py +0 -0
  112. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_options.py +0 -0
  113. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/text_structure.py +0 -0
  114. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/analyzers/utils.py +0 -0
  115. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/__init__.py +0 -0
  116. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/element_manager.py +0 -0
  117. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/highlighting_service.py +0 -0
  118. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/core/page.py +0 -0
  119. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/__init__.py +0 -0
  120. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/base.py +0 -0
  121. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/line.py +0 -0
  122. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/rect.py +0 -0
  123. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/region.py +0 -0
  124. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/elements/text.py +0 -0
  125. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/export/mixin.py +0 -0
  126. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/__init__.py +0 -0
  127. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/base.py +0 -0
  128. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/paddleocr.py +0 -0
  129. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/exporters/searchable_pdf.py +0 -0
  130. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/manager.py +0 -0
  131. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/mixin.py +0 -0
  132. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/extraction/result.py +0 -0
  133. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/__init__.py +0 -0
  134. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine.py +0 -0
  135. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_doctr.py +0 -0
  136. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_easyocr.py +0 -0
  137. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_paddle.py +0 -0
  138. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/engine_surya.py +0 -0
  139. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_factory.py +0 -0
  140. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_manager.py +0 -0
  141. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/ocr_options.py +0 -0
  142. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/ocr/utils.py +0 -0
  143. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/qa/__init__.py +0 -0
  144. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/qa/document_qa.py +0 -0
  145. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/__init__.py +0 -0
  146. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/haystack_search_service.py +0 -0
  147. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/haystack_utils.py +0 -0
  148. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/search_options.py +0 -0
  149. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/search_service_protocol.py +0 -0
  150. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/search/searchable_mixin.py +0 -0
  151. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/selectors/__init__.py +0 -0
  152. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/selectors/parser.py +0 -0
  153. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/__init__.py +0 -0
  154. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -0
  155. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/css/style.css +0 -0
  156. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/index.html +0 -0
  157. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/js/app.js +0 -0
  158. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/templates/spa/words.txt +0 -0
  159. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/__init__.py +0 -0
  160. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/debug.py +0 -0
  161. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/highlighting.py +0 -0
  162. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/identifiers.py +0 -0
  163. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/locks.py +0 -0
  164. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/packaging.py +0 -0
  165. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/reading_order.py +0 -0
  166. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/text_extraction.py +0 -0
  167. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/tqdm_utils.py +0 -0
  168. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/utils/visualization.py +0 -0
  169. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/__init__.py +0 -0
  170. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/frontend/viewer.js +0 -0
  171. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf/widgets/viewer.py +0 -0
  172. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/dependency_links.txt +0 -0
  173. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/requires.txt +0 -0
  174. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/natural_pdf.egg-info/top_level.txt +0 -0
  175. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/noxfile.py +0 -0
  176. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/.gitkeep +0 -0
  177. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/01-practice.pdf +0 -0
  178. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/0500000US42001.pdf +0 -0
  179. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/0500000US42007.pdf +0 -0
  180. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/2014 Statistics.pdf +0 -0
  181. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/2019 Statistics.pdf +0 -0
  182. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  183. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pdfs/needs-ocr.pdf +0 -0
  184. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/publish.sh +0 -0
  185. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/pyproject.toml +0 -0
  186. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/sample-screen.png +0 -0
  187. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/setup.cfg +0 -0
  188. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/exporters/test_paddleocr_exporter.py +0 -0
  189. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/test_loading.py +0 -0
  190. {natural_pdf-0.1.9 → natural_pdf-0.1.10}/tests/test_optional_deps.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -22,32 +22,29 @@ from natural_pdf import PDF
22
22
  # Example: Classify a Page
23
23
  pdf = PDF("pdfs/01-practice.pdf")
24
24
  page = pdf.pages[0]
25
- categories = ["invoice", "letter", "report cover", "data table"]
26
- results = page.classify(categories=categories, model="text")
25
+ labels = ["invoice", "letter", "report cover", "data table"]
26
+ page.classify(labels, using="text")
27
27
 
28
28
  # Access the top result
29
29
  print(f"Top Category: {page.category}")
30
30
  print(f"Confidence: {page.category_confidence:.3f}")
31
-
32
- # Access all results
33
- # print(page.classification_results)
34
31
  ```
35
32
 
36
33
  **Key Arguments:**
37
34
 
38
- * `categories` (required): A list of strings representing the potential categories you want to classify the item into.
39
- * `model` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
35
+ * `labels` (required): A list of strings representing the potential labels you want to classify the item into.
36
+ * `using` (optional): Specifies which classification model or strategy to use. Defaults to `"text"`.
40
37
  * `"text"`: Uses a text-based model (default: `facebook/bart-large-mnli`) suitable for classifying based on language content.
41
38
  * `"vision"`: Uses a vision-based model (default: `openai/clip-vit-base-patch32`) suitable for classifying based on visual layout and appearance.
42
39
  * Specific Model ID: You can provide a Hugging Face model ID (e.g., `"google/siglip-base-patch16-224"`, `"MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"`) compatible with zero-shot text or image classification. The library attempts to infer whether the model is text- or vision-based, but you may need to set `using` explicitly if that inference fails or is ambiguous.
43
- * `using` (optional): Explicitly set to `"text"` or `"vision"` if the automatic inference based on the `model` ID fails or is ambiguous.
44
- * `min_confidence` (optional): A float between 0.0 and 1.0. Only categories with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
40
+ * `model` (optional): Explicitly specify a model ID (Hugging Face repo name) to use.
41
+ * `min_confidence` (optional): A float between 0.0 and 1.0. Only labels with a confidence score greater than or equal to this threshold will be included in the results (default: 0.0).
45
42
 
46
43
  ## Text vs. Vision Classification
47
44
 
48
45
  Choosing the right model type depends on your goal:
49
46
 
50
- ### Text Classification (`model="text"`)
47
+ ### Text Classification (`using="text"`)
51
48
 
52
49
  * **How it works:** Extracts the text from the page or region and analyzes the language content.
53
50
  * **Best for:**
@@ -57,12 +54,12 @@ Choosing the right model type depends on your goal:
57
54
 
58
55
  ```python
59
56
  # Find pages related to finance
60
- financial_categories = ["budget", "revenue", "expenditure", "forecast"]
61
- pdf.classify_pages(categories=financial_categories, model="text")
57
+ financial_labels = ["budget", "revenue", "expenditure", "forecast"]
58
+ pdf.classify_pages(financial_labels, using="text")
62
59
  budget_pages = [p for p in pdf.pages if p.category == "budget"]
63
60
  ```
64
61
 
65
- ### Vision Classification (`model="vision"`)
62
+ ### Vision Classification (`using="vision"`)
66
63
 
67
64
  * **How it works:** Renders the page or region as an image and analyzes its visual layout, structure, and appearance.
68
65
  * **Best for:**
@@ -72,8 +69,8 @@ budget_pages = [p for p in pdf.pages if p.category == "budget"]
72
69
 
73
70
  ```python
74
71
  # Find pages that look like invoices or receipts
75
- visual_categories = ["invoice", "receipt", "letter", "form"]
76
- page.classify(categories=visual_categories, model="vision")
72
+ visual_labels = ["invoice", "receipt", "letter", "form"]
73
+ page.classify(visual_labels, using="vision")
77
74
  if page.category in ["invoice", "receipt"]:
78
75
      print(f"Page {page.number} looks like an invoice or receipt.")
79
76
  ```
@@ -88,7 +85,7 @@ Classifying a whole page is useful for sorting documents or identifying the over
88
85
  # Classify the first page
89
86
  page = pdf.pages[0]
90
87
  page_types = ["cover page", "table of contents", "chapter start", "appendix"]
91
- page.classify(categories=page_types, model="vision") # Vision often good for page structure
88
+ page.classify(page_types, using="vision") # Vision often good for page structure
92
89
  print(f"Page 1 Type: {page.category}")
93
90
  ```
94
91
 
@@ -101,9 +98,9 @@ Classifying a specific region allows for more granular analysis within a page. Y
101
98
  paragraphs = page.find_all("region[type=paragraph]")
102
99
  if paragraphs:
103
100
      # Classify the topic of the first paragraph
104
-     topic_categories = ["introduction", "methodology", "results", "conclusion"]
101
+     topic_labels = ["introduction", "methodology", "results", "conclusion"]
105
102
      # Use text model for topic
106
-     paragraphs[0].classify(categories=topic_categories, model="text")
103
+     paragraphs[0].classify(topic_labels, using="text")
107
104
      print(f"First paragraph category: {paragraphs[0].category}")
108
105
  ```
109
106
 
@@ -113,10 +110,10 @@ After running `.classify()`, you can access the results:
113
110
 
114
111
  * `page.category` or `region.category`: Returns the string label of the category with the highest confidence score from the *last* classification run. Returns `None` if no classification has been run or no category met the threshold.
115
112
  * `page.category_confidence` or `region.category_confidence`: Returns the float confidence score (0.0-1.0) for the top category. Returns `None` if no classification has been run or no category met the threshold.
116
- * `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, categories provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
113
+ * `page.classification_results` or `region.classification_results`: Returns the full result dictionary stored in the object's `.metadata['classification']`, containing the model used, engine type, labels provided, timestamp, and a list of all scores above the threshold sorted by confidence. Returns `None` if no classification has been run.
117
114
 
118
115
  ```python
119
- results = page.classify(categories=["invoice", "letter"], model="text", min_confidence=0.5)
116
+ results = page.classify(["invoice", "letter"], using="text", min_confidence=0.5)
120
117
 
121
118
  if page.category == "invoice":
122
119
      print(f"Found an invoice with confidence {page.category_confidence:.2f}")
@@ -135,10 +132,10 @@ Classifies pages across all PDFs in the collection. Use `max_workers` for parall
135
132
 
136
133
  ```python
137
134
  collection = natural_pdf.PDFCollection.from_directory("./documents/")
138
- categories = ["form", "datasheet", "image", "text document"]
135
+ labels = ["form", "datasheet", "image", "text document"]
139
136
 
140
137
  # Classify all pages using vision model, processing 4 PDFs concurrently
141
- collection.classify_all(categories=categories, model="vision", max_workers=4)
138
+ collection.classify_all(labels, using="vision", max_workers=4)
142
139
 
143
140
  # Filter PDFs containing forms
144
141
  form_pdfs = []
@@ -160,7 +157,7 @@ layout_regions = pdf.find_all("region")
160
157
  region_types = ["paragraph", "list", "table", "figure", "caption"]
161
158
 
162
159
  # Classify all detected regions based on vision
163
- layout_regions.classify_all(categories=region_types, model="vision")
160
+ layout_regions.classify_all(region_types, using="vision")
164
161
 
165
162
  # Count table regions
166
163
  table_count = sum(1 for r in layout_regions if r.category == "table")
@@ -140,14 +140,14 @@ Categorize pages or specific regions based on their content using text or vision
140
140
 
141
141
  ```python
142
142
  # Classify a page based on text
143
- categories = ["invoice", "scientific article", "presentation"]
144
- page.classify(categories=categories, model="text")
143
+ labels = ["invoice", "scientific article", "presentation"]
144
+ page.classify(labels, using="text")
145
145
  print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
146
146
 
147
147
 
148
148
  # Classify a page based on what it looks like
149
- categories = ["invoice", "scientific article", "presentation"]
150
- page.classify(categories=categories, model="vision")
149
+ labels = ["invoice", "scientific article", "presentation"]
150
+ page.classify(labels, using="vision")
151
151
  print(f"Page Category: {page.category} (Confidence: {page.category_confidence:.2f})")
152
152
  ```
153
153