natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,151 +0,0 @@
1
- # Working with Regions
2
-
3
- Regions are rectangular areas on a page that let you focus on specific parts of a document. They're perfect for extracting text from defined areas, finding elements within certain boundaries, and working with document sections.
4
-
5
- ```python
6
- #%pip install "natural-pdf[all]"
7
- ```
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- # Load a PDF
13
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
- page = pdf.pages[0]
15
-
16
- # Create a region in the top portion of the page
17
- top_region = page.create_region(
18
- 50, # x0 (left)
19
- 50, # y0 (top)
20
- page.width - 50, # x1 (right)
21
- 200 # y1 (bottom)
22
- )
23
-
24
- # Visualize the region
25
- top_region.show(color="blue", label="Top Region")
26
-
27
- # Extract text from this region
28
- top_region.extract_text()
29
- ```
30
-
31
- ## Creating Regions from Elements
32
-
33
- ```python
34
- # Find an element to create regions around
35
- title = page.find('text:contains("Jungle Health")')
36
-
37
- # Create regions relative to this element
38
- below_title = title.below(height=100)
39
- right_of_title = title.right(width=200)
40
- above_title = title.above(height=50)
41
-
42
- # Visualize these regions
43
- below_title.show(color="green", label="Below")
44
- right_of_title.show(color="red", label="Right")
45
- above_title.show(color="orange", label="Above")
46
-
47
- # Extract text from the region below the title
48
- below_title.extract_text()
49
- ```
50
-
51
- ## Finding Elements Within Regions
52
-
53
- ```python
54
- # Create a region for a specific document section
55
- form_region = page.create_region(50, 100, page.width - 50, 300)
56
-
57
- # Find elements only within this region
58
- labels = form_region.find_all('text:contains(":")')
59
-
60
- # Visualize the region and the elements found
61
- form_region.show(color=(0, 0, 1, 0.2), label="Form Region")
62
- labels.show(color="purple", label="Labels")
63
-
64
- # Count the elements found
65
- len(labels)
66
- ```
67
-
68
- ## Expanding and Adjusting Regions
69
-
70
- ```python
71
- # Find an element to work with
72
- element = page.find('text:contains("Summary:")')
73
-
74
- # Create a tight region around the element
75
- tight_region = page.create_region(
76
- element.x0, element.top,
77
- element.x1, element.bottom
78
- )
79
-
80
- # Expand it to include surrounding content
81
- expanded_region = tight_region.expand(
82
- left=10, # Expand 10 points to the left
83
- right=200, # Expand 200 points to the right
84
- top=5, # Expand 5 points above
85
- bottom=100 # Expand 100 points below
86
- )
87
-
88
- # Visualize both regions
89
- tight_region.show(color="red", label="Original")
90
- expanded_region.show(color="blue", label="Expanded")
91
-
92
- # Extract the content from the expanded region
93
- expanded_region.extract_text()
94
- ```
95
-
96
- ## Creating Bounded Regions
97
-
98
- ```python
99
- # Find two elements to serve as boundaries
100
- start_elem = page.find('text:contains("Summary:")')
101
- end_elem = page.find('text:contains("Statute")')
102
-
103
- # Create a region from start to end element
104
- bounded_region = start_elem.until(end_elem)
105
-
106
- # Visualize the bounded region
107
- bounded_region.show(color="green", label="Bounded Region")
108
-
109
- # Extract text from this bounded region
110
- bounded_region.extract_text()[:200] + "..." if len(bounded_region.extract_text()) > 200 else bounded_region.extract_text()
111
- ```
112
-
113
- ## Working with Multiple Regions
114
-
115
- ```python
116
- # Define multiple regions to extract different parts of the document
117
- header_region = page.create_region(0, 0, page.width, 100)
118
- main_region = page.create_region(100, 100, page.width - 100, page.height - 150)
119
- footer_region = page.create_region(0, page.height - 50, page.width, page.height)
120
-
121
- # Visualize all regions
122
- header_region.show(color="blue", label="Header")
123
- main_region.show(color="green", label="Main Content")
124
- footer_region.show(color="red", label="Footer")
125
-
126
- # Extract content from each region
127
- document_parts = {
128
- "header": header_region.extract_text(),
129
- "main": main_region.extract_text()[:100] + "...",
130
- "footer": footer_region.extract_text()
131
- }
132
-
133
- # Show what we extracted
134
- document_parts
135
- ```
136
-
137
- ## Creating an Image of a Region
138
-
139
- ```python
140
- # Find a region of interest
141
- table_header = page.find('text:contains("Statute")')
142
- table_region = table_header.below(height=100)
143
-
144
- # Visualize the region
145
- table_region.show(color="purple", label="Table Region")
146
-
147
- # Create an image of just this region
148
- table_region.to_image(resolution=150)
149
- ```
150
-
151
- Regions allow you to precisely target specific parts of a document for extraction and analysis. They're essential for handling complex document layouts and isolating the exact content you need.