natural-pdf 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docs/categorizing-documents/index.md +168 -0
  2. docs/data-extraction/index.md +87 -0
  3. docs/element-selection/index.ipynb +218 -164
  4. docs/element-selection/index.md +20 -0
  5. docs/index.md +19 -0
  6. docs/ocr/index.md +63 -16
  7. docs/tutorials/01-loading-and-extraction.ipynb +1713 -34
  8. docs/tutorials/02-finding-elements.ipynb +123 -46
  9. docs/tutorials/03-extracting-blocks.ipynb +24 -19
  10. docs/tutorials/04-table-extraction.ipynb +17 -12
  11. docs/tutorials/05-excluding-content.ipynb +37 -32
  12. docs/tutorials/06-document-qa.ipynb +36 -31
  13. docs/tutorials/07-layout-analysis.ipynb +45 -40
  14. docs/tutorials/07-working-with-regions.ipynb +61 -60
  15. docs/tutorials/08-spatial-navigation.ipynb +76 -71
  16. docs/tutorials/09-section-extraction.ipynb +160 -155
  17. docs/tutorials/10-form-field-extraction.ipynb +71 -66
  18. docs/tutorials/11-enhanced-table-processing.ipynb +11 -6
  19. docs/tutorials/12-ocr-integration.ipynb +3420 -312
  20. docs/tutorials/12-ocr-integration.md +68 -106
  21. docs/tutorials/13-semantic-search.ipynb +641 -251
  22. natural_pdf/__init__.py +2 -0
  23. natural_pdf/classification/manager.py +343 -0
  24. natural_pdf/classification/mixin.py +149 -0
  25. natural_pdf/classification/results.py +62 -0
  26. natural_pdf/collections/mixins.py +63 -0
  27. natural_pdf/collections/pdf_collection.py +321 -15
  28. natural_pdf/core/element_manager.py +67 -0
  29. natural_pdf/core/page.py +227 -64
  30. natural_pdf/core/pdf.py +387 -378
  31. natural_pdf/elements/collections.py +272 -41
  32. natural_pdf/elements/region.py +99 -15
  33. natural_pdf/elements/text.py +5 -2
  34. natural_pdf/exporters/paddleocr.py +1 -1
  35. natural_pdf/extraction/manager.py +134 -0
  36. natural_pdf/extraction/mixin.py +246 -0
  37. natural_pdf/extraction/result.py +37 -0
  38. natural_pdf/ocr/engine_easyocr.py +6 -3
  39. natural_pdf/ocr/ocr_manager.py +85 -25
  40. natural_pdf/ocr/ocr_options.py +33 -10
  41. natural_pdf/ocr/utils.py +14 -3
  42. natural_pdf/qa/document_qa.py +0 -4
  43. natural_pdf/selectors/parser.py +363 -238
  44. natural_pdf/templates/finetune/fine_tune_paddleocr.md +10 -5
  45. natural_pdf/utils/locks.py +8 -0
  46. natural_pdf/utils/text_extraction.py +52 -1
  47. natural_pdf/utils/tqdm_utils.py +43 -0
  48. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/METADATA +6 -1
  49. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/RECORD +52 -41
  50. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/WHEEL +1 -1
  51. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/licenses/LICENSE +0 -0
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.8.dist-info}/top_level.txt +0 -0
@@ -18,139 +18,97 @@ text_without_ocr = page.extract_text()
18
18
  f"Without OCR: {len(text_without_ocr)} characters extracted"
19
19
  ```
20
20
 
21
- ## Finding Text Elements with OCR
21
+ ## Applying OCR and Finding Elements
22
+
23
+ The core method is `page.apply_ocr()`. This runs the OCR process and adds `TextElement` objects to the page. You can specify the engine and languages.
24
+
25
+ **Note:** Re-applying OCR to the same page or region will automatically remove any previously generated OCR elements for that area before adding the new ones.
22
26
 
23
27
  ```python
24
- # Convert text-as-image to text elements
25
- page.apply_ocr()
28
+ # Apply OCR using the default engine (EasyOCR) for English
29
+ page.apply_ocr(languages=['en'])
26
30
 
27
- # Select all text pieces on the page
28
- text_elements = page.find_all('text')
29
- f"Found {len(text_elements)} text elements"
31
+ # Select all text pieces found by OCR
32
+ text_elements = page.find_all('text[source=ocr]')
33
+ print(f"Found {len(text_elements)} text elements using default OCR")
30
34
 
31
35
  # Visualize the elements
32
36
  text_elements.highlight()
33
- ```
34
37
 
35
- ## OCR Configuration Options
36
-
37
- ```python
38
- # Set OCR configuration for better results
39
- page.ocr_config = {
40
- 'language': 'eng', # English
41
- 'dpi': 300, # Higher resolution
42
- }
38
+ # Apply OCR using PaddleOCR for English and Chinese
39
+ page.apply_ocr(engine='paddle', languages=['en', 'ch_sim'])
43
40
 
44
- # Extract text with the improved configuration
45
- improved_text = page.extract_text()
41
+ # Apply OCR using SuryaOCR for English and German
42
+ page.apply_ocr(engine='surya', languages=['en', 'de'])
46
43
 
47
- # Preview the text
48
- improved_text[:200] + "..." if len(improved_text) > 200 else improved_text
44
+ text_with_ocr = page.extract_text()
45
+ print(f"\nExtracted text after OCR:\n{text_with_ocr[:150]}...")
49
46
  ```
50
47
 
51
- ## Working with Multi-language Documents
48
+ ## Advanced OCR Configuration
52
49
 
53
- ```python
54
- # Configure for multiple languages
55
- page.ocr_config = {
56
- 'language': 'eng+fra+deu', # English, French, German
57
- 'dpi': 300
58
- }
59
-
60
- # Extract text with multi-language support
61
- multilang_text = page.extract_text()
62
- multilang_text[:200]
63
- ```
64
-
65
- ## Extracting Tables from Scanned Documents
50
+ For more control, import and use the specific `Options` class for your chosen engine within the `apply_ocr` call.
66
51
 
67
52
  ```python
68
- # Enable OCR and analyze the document layout
69
- page.use_ocr = True
70
- page.analyze_layout()
71
-
72
- # Find table regions
73
- table_regions = page.find_all('region[type=table]')
74
-
75
- # Visualize any detected tables
76
- table_regions.highlight()
77
-
78
- # Extract the first table if found
79
- if table_regions:
80
- table_data = table_regions[0].extract_table()
81
- table_data
82
- else:
83
- "No tables found in the document"
53
+ from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
54
+
55
+ # Re-apply OCR using EasyOCR with specific options
56
+ easy_opts = EasyOCROptions(
57
+ paragraph=False,
58
+ )
59
+ page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.1, options=easy_opts)
60
+
61
+ paddle_opts = PaddleOCROptions(
62
+ use_angle_cls=False,
63
+ det_db_thresh=0.3,
64
+ )
65
+ page.apply_ocr(engine='paddle', languages=['en'], options=paddle_opts)
66
+
67
+ surya_opts = SuryaOCROptions()
68
+ page.apply_ocr(engine='surya', languages=['en'], min_confidence=0.5, detect_only=True, options=surya_opts)
84
69
  ```
85
70
 
86
- ## Finding Form Fields in Scanned Documents
71
+ ## Interactive OCR Correction / Debugging
87
72
 
88
- ```python
89
- # Look for potential form labels (containing a colon)
90
- labels = page.find_all('text:contains(":")')
91
-
92
- # Visualize the labels
93
- labels.highlight()
94
-
95
- # Extract form data by looking to the right of each label
96
- form_data = {}
97
- for label in labels:
98
- # Clean the label text
99
- field_name = label.text.strip().rstrip(':')
100
-
101
- # Find the value to the right
102
- value_element = label.right(width=200)
103
- value = value_element.extract_text().strip()
104
-
105
- # Add to our dictionary
106
- form_data[field_name] = value
107
-
108
- # Display the extracted data
109
- form_data
110
- ```
73
+ If OCR results aren't perfect, you can use the bundled interactive web application (SPA) to review and correct them.
111
74
 
112
- ## Combining OCR with Layout Analysis
75
+ 1. **Package the data:**
76
+ After running `apply_ocr` (or `apply_layout`), use `create_correction_task_package` to create a zip file containing the PDF images and detected elements.
113
77
 
114
- ```python
115
- # Apply OCR and analyze layout
116
- page.use_ocr = True
117
- page.analyze_layout()
78
+ ```python
79
+ from natural_pdf.utils.packaging import create_correction_task_package
118
80
 
119
- # Find document structure elements
120
- headings = page.find_all('region[type=heading]')
121
- paragraphs = page.find_all('region[type=paragraph]')
81
+ page.apply_ocr()
122
82
 
123
- # Visualize the structure
124
- headings.highlight(color="red", label="Headings")
125
- paragraphs.highlight(color="blue", label="Paragraphs")
83
+ create_correction_task_package(pdf, "correction_package.zip", overwrite=True)
84
+ ```
126
85
 
127
- # Create a simple document outline
128
- document_outline = []
129
- for heading in headings:
130
- heading_text = heading.extract_text()
131
- document_outline.append(heading_text)
86
+ 2. **Run the SPA:**
87
+ Navigate to the SPA directory within the installed `natural_pdf` library in your terminal and start a simple web server.
88
+
89
+ 3. **Use the SPA:**
90
+ Open `http://localhost:8000` in your browser. Drag the `correction_package.zip` file onto the page to load the document. You can then click on text elements to correct the OCR results.
132
91
 
133
- document_outline
134
- ```
135
92
 
136
93
  ## Working with Multiple Pages
137
94
 
95
+ Apply OCR or layout analysis to all pages using the `PDF` object.
96
+
138
97
  ```python
139
98
  # Process all pages in the document
140
- all_text = []
141
-
142
- for i, page in enumerate(pdf.pages):
143
- # Enable OCR for each page
144
- page.use_ocr = True
145
-
146
- # Extract text
147
- page_text = page.extract_text()
148
-
149
- # Add to our collection with page number
150
- all_text.append(f"Page {i+1}: {page_text[:100]}...")
151
-
152
- # Show the first few pages
153
- all_text
99
+
100
+ # Apply OCR to all pages (example using EasyOCR)
101
+ pdf.apply_ocr(engine='easyocr', languages=['en'])
102
+ print(f"Applied OCR to {len(pdf.pages)} pages.")
103
+
104
+ # Or apply layout analysis to all pages (example using Paddle)
105
+ # pdf.apply_layout(engine='paddle')
106
+ # print(f"Applied Layout Analysis to {len(pdf.pages)} pages.")
107
+
108
+ # Extract text from all pages (uses OCR results if available)
109
+ all_text_content = pdf.extract_text(page_separator="\n\n---\n\n")
110
+
111
+ print(f"\nCombined text from all pages:\n{all_text_content[:500]}...")
154
112
  ```
155
113
 
156
114
  ## Saving PDFs with Searchable Text
@@ -165,9 +123,13 @@ from natural_pdf import PDF
165
123
  input_pdf_path = "https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/needs-ocr.pdf"
166
124
 
167
125
  pdf = PDF(input_pdf_path)
168
- pdf.apply_ocr()
126
+ # Apply OCR to all pages before saving
127
+ # Use desired engine and options
128
+ pdf.apply_ocr(engine='easyocr', languages=['en'])
169
129
 
170
130
  pdf.save_searchable("needs-ocr-searchable.pdf")
131
+
132
+ print("Saved searchable PDF to needs-ocr-searchable.pdf")
171
133
  ```
172
134
 
173
135
  This creates `needs-ocr-searchable.pdf`, which looks identical to the original but now has a text layer corresponding to the OCR results. You can adjust the rendering resolution used during saving with the `dpi` parameter (default is 300).