natural-pdf 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +126 -98
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +910 -516
  81. natural_pdf/core/pdf.py +387 -289
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +714 -514
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +29 -15
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.3.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.3.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.3.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,185 @@
1
+ # Document Layout Analysis
2
+
3
+ Natural PDF can automatically detect the structure of a document (titles, paragraphs, tables, figures) using layout analysis models. This guide shows how to use this feature.
4
+
5
+ ## Setup
6
+
7
+ We'll use a sample PDF that includes various layout elements.
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
13
+ page = pdf.pages[0]
14
+
15
+ page.to_image(width=700)
16
+ ```
17
+
18
+ ## Running Basic Layout Analysis
19
+
20
+ Use the `analyze_layout()` method. By default, it uses the YOLO model.
21
+
22
+ ```python
23
+ # Analyze the layout using the default engine (YOLO)
24
+ # This adds 'region' elements to the page
25
+ page.analyze_layout()
26
+ ```
27
+
28
+ ```python
29
+ # Find all detected regions
30
+ regions = page.find_all('region')
31
+ len(regions) # Show how many regions were detected
32
+ ```
33
+
34
+ ```python
35
+ first_region = regions[0]
36
+ f"First region: type='{first_region.type}', confidence={first_region.confidence:.2f}"
37
+ ```
38
+
39
+ ## Visualizing Detected Layout
40
+
41
+ Use `highlight()` or `show()` on the detected regions.
42
+
43
+ ```python
44
+ # Highlight all detected regions, colored by type
45
+ regions.highlight(group_by='type')
46
+ page.to_image(width=700)
47
+ ```
48
+
49
+ ## Finding Specific Region Types
50
+
51
+ Use attribute selectors to find regions of a specific type.
52
+
53
+ ```python
54
+ # Find all detected titles
55
+ titles = page.find_all('region[type=title]')
56
+ titles
57
+ ```
58
+
59
+ ```python
60
+ titles.show()
61
+ ```
62
+
63
+ ```python
64
+ page.find_all('region[type=table]').show()
65
+ ```
66
+
67
+ ## Working with Layout Regions
68
+
69
+ Detected regions are like any other `Region` object. You can extract text, find elements within them, etc.
70
+
71
+ ```python
72
+ page.find('region[type=table]').extract_text(layout=True)
73
+ ```
74
+
75
+ ## Using Different Layout Models
76
+
77
+ Natural PDF supports multiple engines (`yolo`, `paddle`, `tatr`, `docling`, `surya`). Specify the engine when calling `analyze_layout`.
78
+
79
+ *Note: Using different engines requires installing the corresponding extras (e.g., `natural-pdf[layout_paddle]`).* `yolo` is the default.
80
+
81
+ ```python
82
+ page.clear_detected_layout_regions()
83
+ page.clear_highlights()
84
+
85
+ page.analyze_layout(engine="paddle")
86
+ page.find_all('region[model=paddle]').highlight(group_by='region_type')
87
+ page.to_image(width=700)
88
+ ```
89
+
90
+ ```python
91
+ # Analyze using Table Transformer (TATR) - specialized for tables
92
+ page.clear_detected_layout_regions()
93
+ page.clear_highlights()
94
+
95
+ page.analyze_layout(engine="tatr")
96
+ page.find_all('region[model=tatr]').highlight(group_by='region_type')
97
+ page.to_image(width=700)
98
+ ```
99
+
100
+ ```python
101
+ # Analyze using the Docling engine
102
+ page.clear_detected_layout_regions()
103
+ page.clear_highlights()
104
+
105
+ page.analyze_layout(engine="docling")
106
+ page.find_all('region[model=docling]').highlight(group_by='region_type')
107
+ page.to_image(width=700)
108
+ ```
109
+
110
+ ```python
111
+ # Analyze using the Surya engine
112
+ page.clear_detected_layout_regions()
113
+ page.clear_highlights()
114
+
115
+ page.analyze_layout(engine="surya")
116
+ page.find_all('region[model=surya]').highlight(group_by='region_type')
117
+ page.to_image(width=700)
118
+ ```
119
+
120
+ *Note: Calling `analyze_layout` multiple times (even with the same engine) can add duplicate regions. You might want to use `page.clear_detected_layout_regions()` first, or filter by model using `region[model=yolo]`.*
121
+
122
+ ## Controlling Confidence Threshold
123
+
124
+ Filter detections by their confidence score.
125
+
126
+ ```python
127
+ # Re-run YOLO analysis (clearing previous results might be good practice)
128
+ page.clear_detected_layout_regions()
129
+ page.analyze_layout(engine="yolo")
130
+
131
+ # Find only high-confidence regions (e.g., >= 0.8)
132
+ high_conf_regions = page.find_all('region[confidence>=0.8]')
133
+ len(high_conf_regions)
134
+ ```
135
+
136
+ ## Table Structure with TATR
137
+
138
+ The TATR engine provides detailed table structure elements (`table`, `table-row`, `table-column`, `table-column-header`). This is very useful for precise table extraction.
139
+
140
+ ```python
141
+ # Ensure TATR analysis has been run
142
+ page.clear_detected_layout_regions()
143
+ page.clear_highlights()
144
+
145
+ page.analyze_layout(engine="tatr")
146
+ page.find_all('region[model=tatr]').highlight(group_by='region_type')
147
+ page.to_image(width=700)
148
+ ```
149
+
150
+ ```python
151
+ # Find different structural elements from TATR
152
+ tables = page.find_all('region[type=table][model=tatr]')
153
+ rows = page.find_all('region[type=table-row][model=tatr]')
154
+ cols = page.find_all('region[type=table-column][model=tatr]')
155
+ hdrs = page.find_all('region[type=table-column-header][model=tatr]')
156
+
157
+ f"Found: {len(tables)} tables, {len(rows)} rows, {len(cols)} columns, {len(hdrs)} headers (from TATR)"
158
+ ```
159
+
160
+ ### Enhanced Table Extraction with TATR
161
+
162
+ When a `region[type=table]` comes from the TATR model, `extract_table()` can use the underlying row/column structure for more robust extraction.
163
+
164
+ ```python
165
+ # Find the TATR table region again
166
+ tatr_table = page.find('region[type=table][model=tatr]')
167
+
168
+ # This extraction uses the detected rows/columns
169
+ tatr_table.extract_table()
170
+ ```
171
+
172
+ If you'd like the normal approach instead of the "intelligent" one, you can ask for pdfplumber by passing `method='pdfplumber'`.
173
+
174
+ ```python
175
+ # This extraction uses pdfplumber instead of the detected rows/columns
176
+ tatr_table.extract_table(method='pdfplumber')
177
+ ```
178
+
179
+ ## Next Steps
180
+
181
+ Layout analysis provides regions that you can use for:
182
+
183
+ - [Table Extraction](../tables/index.ipynb): Especially powerful with TATR regions.
184
+ - [Text Extraction](../text-extraction/index.ipynb): Extract text only from specific region types (e.g., paragraphs).
185
+ - [Document QA](../document-qa/index.ipynb): Focus question answering on specific detected regions.
docs/ocr/index.md ADDED
@@ -0,0 +1,222 @@
1
+ # OCR Integration
2
+
3
+ Natural PDF includes OCR (Optical Character Recognition) to extract text from scanned documents or images embedded in PDFs.
4
+
5
+ ## OCR Engine Comparison
6
+
7
+ Natural PDF supports multiple OCR engines:
8
+
9
+ | Feature | EasyOCR | PaddleOCR | Surya OCR |
10
+ |----------------------|------------------------------------|------------------------------------------|---------------------------------------|
11
+ | **Installation** | `natural-pdf[easyocr]` | `natural-pdf[paddle]` | `natural-pdf[surya]` |
12
+ | **Primary Strength** | Good general performance, simpler | Excellent Asian language, speed | High accuracy, multilingual lines |
13
+ | **Speed** | Moderate | Fast | Moderate (GPU recommended) |
14
+ | **Memory Usage** | Higher | Efficient | Higher (GPU recommended) |
15
+ | **Paragraph Detect** | Yes (via option) | No | No (focuses on lines) |
16
+ | **Handwritten** | Better support | Limited | Limited |
17
+ | **Small Text** | Moderate | Good | Good |
18
+ | **When to Use** | General documents, handwritten text| Asian languages, speed-critical tasks | Highest accuracy needed, line-level |
19
+
20
+ ## Basic OCR Usage
21
+
22
+ Apply OCR directly to a page or region:
23
+
24
+ ```python
25
+ from natural_pdf import PDF
26
+
27
+ # Assume 'pdf' is a PDF object that has already been loaded
28
+ page = pdf.pages[0]
29
+
30
+ # Apply OCR using the default engine (or specify one)
31
+ ocr_elements = page.apply_ocr(languages=['en'])
32
+
33
+ # Extract text (will use the results from apply_ocr if run previously)
34
+ text = page.extract_text()
35
+ print(text)
36
+ ```
37
+
38
+ ## Configuring OCR
39
+
40
+ Specify the engine and basic options directly:
41
+
42
+ ## OCR Configuration
43
+
44
+ ```python
45
+ # Use PaddleOCR for Chinese and English
46
+ ocr_elements = page.apply_ocr(engine='paddle', languages=['zh-cn', 'en'])
47
+
48
+ # Use EasyOCR with a lower confidence threshold
49
+ ocr_elements = page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.3)
50
+ ```
51
+
52
+ For advanced, engine-specific settings, use the Options classes:
53
+
54
+ ```python
55
+ from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
56
+
57
+ # --- Configure PaddleOCR ---
58
+ paddle_opts = PaddleOCROptions(
59
+ languages=['en', 'zh-cn'],
60
+ use_gpu=True, # Explicitly enable GPU if available
61
+ use_angle_cls=False, # Disable text direction classification (if text is upright)
62
+ det_db_thresh=0.25, # Lower detection threshold (more boxes, potentially noisy)
63
+ rec_batch_num=16 # Increase recognition batch size for potential speedup on GPU
64
+ # rec_char_dict_path='/path/to/custom_dict.txt' # Optional: Path to a custom character dictionary
65
+ # See PaddleOCROptions documentation or source code for all parameters
66
+ )
67
+ ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
68
+
69
+ # --- Configure EasyOCR ---
70
+ easy_opts = EasyOCROptions(
71
+ languages=['en', 'fr'],
72
+ gpu=True, # Explicitly enable GPU if available
73
+ paragraph=True, # Group results into paragraphs (if structure is clear)
74
+ detail=1, # Ensure bounding boxes are returned (required)
75
+ text_threshold=0.6, # Confidence threshold for text detection (adjust based on tuning table)
76
+ link_threshold=0.4, # Standard EasyOCR param; remove if not supported by the wrapper
77
+ low_text=0.4, # Standard EasyOCR param; remove if not supported by the wrapper
78
+ batch_size=8 # Processing batch size (adjust based on memory)
79
+ # See EasyOCROptions documentation or source code for all parameters
80
+ )
81
+ ocr_elements = page.apply_ocr(engine='easyocr', options=easy_opts)
82
+
83
+ # --- Configure Surya OCR ---
84
+ # Surya focuses on line detection and recognition
85
+ surya_opts = SuryaOCROptions(
86
+ languages=['en', 'de'], # Specify languages for recognition
87
+ # device='cuda', # Use GPU ('cuda') or CPU ('cpu') <-- Set via env var TORCH_DEVICE
88
+ min_confidence=0.4 # Example: Adjust minimum confidence for results
89
+ # Core Surya options like device, batch size, and thresholds are typically
90
+ # set via environment variables (see note below).
91
+ )
92
+ ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
93
+ ```
94
+
95
+ ## Multiple Languages
96
+
97
+ OCR supports multiple languages:
98
+
99
+ ```python
100
+ # Recognize English and Spanish text
101
+ pdf = PDF('multilingual.pdf', ocr={
102
+ 'enabled': True,
103
+ 'languages': ['en', 'es']
104
+ })
105
+
106
+ # Multiple languages with PaddleOCR
107
+ pdf = PDF('multilingual_document.pdf',
108
+ ocr_engine='paddleocr',
109
+ ocr={
110
+ 'enabled': True,
111
+ 'languages': ['zh', 'ja', 'ko', 'en'] # Chinese, Japanese, Korean, English
112
+ })
113
+ ```
114
+
115
+ ## Applying OCR Directly
116
+
117
+ The `page.apply_ocr(...)` and `region.apply_ocr(...)` methods are the primary way to run OCR:
118
+
119
+ ```python
120
+ # Apply OCR to a page and get the OCR elements
121
+ ocr_elements = page.apply_ocr(engine='easyocr')
122
+ print(f"Found {len(ocr_elements)} text elements via OCR")
123
+
124
+ # Apply OCR to a specific region
125
+ title = page.find('text:contains("Title")')
126
+ content_region = title.below(height=300)
127
+ region_ocr_elements = content_region.apply_ocr(engine='paddle', languages=['en'])
128
+ ```
129
+
130
+ ## OCR Engines
131
+
132
+ Choose the engine best suited for your document and language requirements using the `engine` parameter in `apply_ocr`.
133
+
134
+ ## Finding and Working with OCR Text
135
+
136
+ After applying OCR, work with the text just like regular text:
137
+
138
+ ```python
139
+ # Find all OCR text elements
140
+ ocr_text = page.find_all('text[source=ocr]')
141
+
142
+ # Find high-confidence OCR text
143
+ high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
144
+
145
+ # Extract text only from OCR elements
146
+ ocr_text_content = page.find_all('text[source=ocr]').extract_text()
147
+
148
+ # Filter OCR text by content
149
+ names = page.find_all('text[source=ocr]:contains("Smith")', case=False)
150
+ ```
151
+
152
+ ## Visualizing OCR Results
153
+
154
+ Visualize OCR results to help debug issues:
155
+
156
+ ```python
157
+ # Apply OCR
158
+ ocr_elements = page.apply_ocr()
159
+
160
+ # Highlight all OCR elements
161
+ for element in ocr_elements:
162
+ # Color based on confidence
163
+ if element.confidence >= 0.8:
164
+ color = "green" # High confidence
165
+ elif element.confidence >= 0.5:
166
+ color = "yellow" # Medium confidence
167
+ else:
168
+ color = "red" # Low confidence
169
+
170
+ element.highlight(color=color, label=f"OCR ({element.confidence:.2f})")
171
+
172
+ # Get the visualization as an image
173
+ image = page.to_image(labels=True)
174
+ # Just return the image in a Jupyter cell
175
+ image
176
+
177
+ # Highlight only high-confidence elements
178
+ high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
179
+ high_conf.highlight(color="green", label="High Confidence OCR")
180
+ ```
181
+
182
+ ## OCR Debugging
183
+
184
+ For troubleshooting OCR problems:
185
+
186
+ ```python
187
+ # Create an interactive HTML debug report
188
+ pdf.debug_ocr("ocr_debug.html")
189
+
190
+ # Specify which pages to include
191
+ pdf.debug_ocr("ocr_debug.html", pages=[0, 1, 2])
192
+ ```
193
+
194
+ The debug report shows:
195
+ - The original image
196
+ - Text found with confidence scores
197
+ - Boxes around each detected word
198
+ - Options to sort and filter results
199
+
200
+ ## OCR Parameter Tuning
201
+
202
+ ### Parameter Recommendation Table
203
+
204
+ | Issue | Engine | Parameter | Recommended Value | Effect |
205
+ |-------|--------|-----------|-------------------|--------|
206
+ | Missing text | EasyOCR | `text_threshold` | 0.1 - 0.3 (default: 0.7) | Lower values detect more text but may increase false positives |
207
+ | Missing text | PaddleOCR | `det_db_thresh` | 0.1 - 0.3 (default: 0.3) | Lower values detect more text areas |
208
+ | Low quality scan | EasyOCR | `contrast_ths` | 0.05 - 0.1 (default: 0.1) | Lower values help with low contrast documents |
209
+ | Low quality scan | PaddleOCR | `det_limit_side_len` | 1280 - 2560 (default: 960) | Higher values improve detail detection |
210
+ | Accuracy vs. speed | EasyOCR | `decoder` | "wordbeamsearch" (accuracy)<br>"greedy" (speed) | Word beam search is more accurate but slower |
211
+ | Accuracy vs. speed | PaddleOCR | `rec_batch_num` | 1 (accuracy)<br>8+ (speed) | Larger batches process faster but use more memory |
212
+ | Small text | Both | `min_confidence` | 0.3 - 0.4 (default: 0.5) | Lower confidence threshold to capture small/blurry text |
213
+ | Text orientation | PaddleOCR | `use_angle_cls` | `True` | Enable angle classification for rotated text |
214
+ | Asian languages | PaddleOCR | `lang` | "ch", "japan", "korea" | Use PaddleOCR for Asian languages |
215
+
216
+ ## Next Steps
217
+
218
+ With OCR capabilities, you can explore:
219
+
220
+ - [Layout Analysis](../layout-analysis/index.ipynb) for automatically detecting document structure
221
+ - [Document QA](../document-qa/index.ipynb) for asking questions about your documents
222
+ - [Visual Debugging](../visual-debugging/index.ipynb) for visualizing OCR results