natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. natural_pdf/__init__.py +3 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +422 -0
  13. natural_pdf/classification/mixin.py +163 -0
  14. natural_pdf/classification/results.py +80 -0
  15. natural_pdf/collections/mixins.py +111 -0
  16. natural_pdf/collections/pdf_collection.py +434 -15
  17. natural_pdf/core/element_manager.py +83 -0
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +578 -93
  20. natural_pdf/core/pdf.py +912 -460
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +712 -109
  23. natural_pdf/elements/region.py +722 -69
  24. natural_pdf/elements/text.py +4 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +5 -4
  28. natural_pdf/extraction/manager.py +135 -0
  29. natural_pdf/extraction/mixin.py +279 -0
  30. natural_pdf/extraction/result.py +23 -0
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/engine_easyocr.py +6 -3
  34. natural_pdf/ocr/ocr_factory.py +24 -4
  35. natural_pdf/ocr/ocr_manager.py +122 -26
  36. natural_pdf/ocr/ocr_options.py +94 -11
  37. natural_pdf/ocr/utils.py +19 -6
  38. natural_pdf/qa/document_qa.py +0 -4
  39. natural_pdf/search/__init__.py +20 -34
  40. natural_pdf/search/haystack_search_service.py +309 -265
  41. natural_pdf/search/haystack_utils.py +99 -75
  42. natural_pdf/search/search_service_protocol.py +11 -12
  43. natural_pdf/selectors/parser.py +431 -230
  44. natural_pdf/utils/debug.py +3 -3
  45. natural_pdf/utils/identifiers.py +1 -1
  46. natural_pdf/utils/locks.py +8 -0
  47. natural_pdf/utils/packaging.py +8 -6
  48. natural_pdf/utils/text_extraction.py +60 -1
  49. natural_pdf/utils/tqdm_utils.py +51 -0
  50. natural_pdf/utils/visualization.py +18 -0
  51. natural_pdf/widgets/viewer.py +4 -25
  52. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
  53. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  54. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  55. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  56. docs/api/index.md +0 -386
  57. docs/assets/favicon.png +0 -3
  58. docs/assets/favicon.svg +0 -3
  59. docs/assets/javascripts/custom.js +0 -17
  60. docs/assets/logo.svg +0 -3
  61. docs/assets/sample-screen.png +0 -0
  62. docs/assets/social-preview.png +0 -17
  63. docs/assets/social-preview.svg +0 -17
  64. docs/assets/stylesheets/custom.css +0 -65
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -915
  68. docs/element-selection/index.md +0 -229
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -170
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -209
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -194
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -340
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -147
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -114
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -270
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -332
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -288
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -413
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -508
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2434
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -512
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -604
  112. docs/tutorials/12-ocr-integration.md +0 -175
  113. docs/tutorials/13-semantic-search.ipynb +0 -1328
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.7.dist-info/RECORD +0 -145
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,185 +0,0 @@
1
- # Document Layout Analysis
2
-
3
- Natural PDF can automatically detect the structure of a document (titles, paragraphs, tables, figures) using layout analysis models. This guide shows how to use this feature.
4
-
5
- ## Setup
6
-
7
- We'll use a sample PDF that includes various layout elements.
8
-
9
- ```python
10
- from natural_pdf import PDF
11
-
12
- pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
13
- page = pdf.pages[0]
14
-
15
- page.to_image(width=700)
16
- ```
17
-
18
- ## Running Basic Layout Analysis
19
-
20
- Use the `analyze_layout()` method. By default, it uses the YOLO model.
21
-
22
- ```python
23
- # Analyze the layout using the default engine (YOLO)
24
- # This adds 'region' elements to the page
25
- page.analyze_layout()
26
- ```
27
-
28
- ```python
29
- # Find all detected regions
30
- regions = page.find_all('region')
31
- len(regions) # Show how many regions were detected
32
- ```
33
-
34
- ```python
35
- first_region = regions[0]
36
- f"First region: type='{first_region.type}', confidence={first_region.confidence:.2f}"
37
- ```
38
-
39
- ## Visualizing Detected Layout
40
-
41
- Use `highlight()` or `show()` on the detected regions.
42
-
43
- ```python
44
- # Highlight all detected regions, colored by type
45
- regions.highlight(group_by='type')
46
- page.to_image(width=700)
47
- ```
48
-
49
- ## Finding Specific Region Types
50
-
51
- Use attribute selectors to find regions of a specific type.
52
-
53
- ```python
54
- # Find all detected titles
55
- titles = page.find_all('region[type=title]')
56
- titles
57
- ```
58
-
59
- ```python
60
- titles.show()
61
- ```
62
-
63
- ```python
64
- page.find_all('region[type=table]').show()
65
- ```
66
-
67
- ## Working with Layout Regions
68
-
69
- Detected regions are like any other `Region` object. You can extract text, find elements within them, etc.
70
-
71
- ```python
72
- page.find('region[type=table]').extract_text(layout=True)
73
- ```
74
-
75
- ## Using Different Layout Models
76
-
77
- Natural PDF supports multiple engines (`yolo`, `paddle`, `tatr`, `docling`, `surya`). Specify the engine when calling `analyze_layout`.
78
-
79
- *Note: Using different engines requires installing the corresponding extras (e.g., `natural-pdf[layout_paddle]`).* `yolo` is the default.
80
-
81
- ```python
82
- page.clear_detected_layout_regions()
83
- page.clear_highlights()
84
-
85
- page.analyze_layout(engine="paddle")
86
- page.find_all('region[model=paddle]').highlight(group_by='region_type')
87
- page.to_image(width=700)
88
- ```
89
-
90
- ```python
91
- # Analyze using Table Transformer (TATR) - specialized for tables
92
- page.clear_detected_layout_regions()
93
- page.clear_highlights()
94
-
95
- page.analyze_layout(engine="tatr")
96
- page.find_all('region[model=tatr]').highlight(group_by='region_type')
97
- page.to_image(width=700)
98
- ```
99
-
100
- ```python
101
- # Analyze using Docling
102
- page.clear_detected_layout_regions()
103
- page.clear_highlights()
104
-
105
- page.analyze_layout(engine="docling")
106
- page.find_all('region[model=docling]').highlight(group_by='region_type')
107
- page.to_image(width=700)
108
- ```
109
-
110
- ```python
111
- # Analyze using Surya
112
- page.clear_detected_layout_regions()
113
- page.clear_highlights()
114
-
115
- page.analyze_layout(engine="surya")
116
- page.find_all('region[model=surya]').highlight(group_by='region_type')
117
- page.to_image(width=700)
118
- ```
119
-
120
- *Note: Calling `analyze_layout` multiple times (even with the same engine) can add duplicate regions. You might want to use `page.clear_detected_layout_regions()` first, or filter by model using `region[model=yolo]`.*
121
-
122
- ## Controlling Confidence Threshold
123
-
124
- Filter detections by their confidence score.
125
-
126
- ```python
127
- # Re-run YOLO analysis (clearing previous results might be good practice)
128
- page.clear_detected_layout_regions()
129
- page.analyze_layout(engine="yolo")
130
-
131
- # Find only high-confidence regions (e.g., >= 0.8)
132
- high_conf_regions = page.find_all('region[confidence>=0.8]')
133
- len(high_conf_regions)
134
- ```
135
-
136
- ## Table Structure with TATR
137
-
138
- The TATR engine provides detailed table structure elements (`table`, `table-row`, `table-column`, `table-column-header`). This is very useful for precise table extraction.
139
-
140
- ```python
141
- # Ensure TATR analysis has been run
142
- page.clear_detected_layout_regions()
143
- page.clear_highlights()
144
-
145
- page.analyze_layout(engine="tatr")
146
- page.find_all('region[model=tatr]').highlight(group_by='region_type')
147
- page.to_image(width=700)
148
- ```
149
-
150
- ```python
151
- # Find different structural elements from TATR
152
- tables = page.find_all('region[type=table][model=tatr]')
153
- rows = page.find_all('region[type=table-row][model=tatr]')
154
- cols = page.find_all('region[type=table-column][model=tatr]')
155
- hdrs = page.find_all('region[type=table-column-header][model=tatr]')
156
-
157
- f"Found: {len(tables)} tables, {len(rows)} rows, {len(cols)} columns, {len(hdrs)} headers (from TATR)"
158
- ```
159
-
160
- ### Enhanced Table Extraction with TATR
161
-
162
- When a `region[type=table]` comes from the TATR model, `extract_table()` can use the underlying row/column structure for more robust extraction.
163
-
164
- ```python
165
- # Find the TATR table region again
166
- tatr_table = page.find('region[type=table][model=tatr]')
167
-
168
- # This extraction uses the detected rows/columns
169
- tatr_table.extract_table()
170
- ```
171
-
172
- If you'd like the normal approach instead of the "intelligent" one, you can ask for pdfplumber.
173
-
174
- ```python
175
- # This extraction uses pdfplumber instead of the detected rows/columns
176
- tatr_table.extract_table(method='pdfplumber')
177
- ```
178
-
179
- ## Next Steps
180
-
181
- Layout analysis provides regions that you can use for:
182
-
183
- - [Table Extraction](../tables/index.ipynb): Especially powerful with TATR regions.
184
- - [Text Extraction](../text-extraction/index.ipynb): Extract text only from specific region types (e.g., paragraphs).
185
- - [Document QA](../document-qa/index.ipynb): Focus question answering on specific detected regions.
docs/ocr/index.md DELETED
@@ -1,209 +0,0 @@
1
- # OCR Integration
2
-
3
- Natural PDF includes OCR (Optical Character Recognition) to extract text from scanned documents or images embedded in PDFs.
4
-
5
- ## OCR Engine Comparison
6
-
7
- Natural PDF supports multiple OCR engines:
8
-
9
- | Feature | EasyOCR | PaddleOCR | Surya OCR |
10
- |----------------------|------------------------------------|------------------------------------------|---------------------------------------|
11
- | **Installation** | `natural-pdf[easyocr]` | `natural-pdf[paddle]` | `natural-pdf[surya]` |
12
- | **Primary Strength** | Good general performance, simpler | Excellent Asian language, speed | High accuracy, multilingual lines |
13
- | **Speed** | Moderate | Fast | Moderate (GPU recommended) |
14
- | **Memory Usage** | Higher | Efficient | Higher (GPU recommended) |
15
- | **Paragraph Detect** | Yes (via option) | No | No (focuses on lines) |
16
- | **Handwritten** | Better support | Limited | Limited |
17
- | **Small Text** | Moderate | Good | Good |
18
- | **When to Use** | General documents, handwritten text| Asian languages, speed-critical tasks | Highest accuracy needed, line-level |
19
-
20
- ## Basic OCR Usage
21
-
22
- Apply OCR directly to a page or region:
23
-
24
- ```python
25
- from natural_pdf import PDF
26
-
27
- # Assume 'page' is a Page object from a PDF
28
- page = pdf.pages[0]
29
-
30
- # Apply OCR using the default engine (or specify one)
31
- ocr_elements = page.apply_ocr(languages=['en'])
32
-
33
- # Extract text (will use the results from apply_ocr if run previously)
34
- text = page.extract_text()
35
- print(text)
36
- ```
37
-
38
- ## Configuring OCR
39
-
40
- Specify the engine and basic options directly:
41
-
42
- ## OCR Configuration
43
-
44
- ```python
45
- # Use PaddleOCR for Chinese and English
46
- ocr_elements = page.apply_ocr(engine='paddle', languages=['zh-cn', 'en'])
47
-
48
- # Use EasyOCR with a lower confidence threshold
49
- ocr_elements = page.apply_ocr(engine='easyocr', languages=['en'], min_confidence=0.3)
50
- ```
51
-
52
- For advanced, engine-specific settings, use the Options classes:
53
-
54
- ```python
55
- from natural_pdf.ocr import PaddleOCROptions, EasyOCROptions, SuryaOCROptions
56
-
57
- # --- Configure PaddleOCR ---
58
- paddle_opts = PaddleOCROptions(
59
- languages=['en', 'zh-cn'],
60
- use_gpu=True, # Explicitly enable GPU if available
61
- use_angle_cls=False, # Disable text direction classification (if text is upright)
62
- det_db_thresh=0.25, # Lower detection threshold (more boxes, potentially noisy)
63
- rec_batch_num=16 # Increase recognition batch size for potential speedup on GPU
64
- # rec_char_dict_path='/path/to/custom_dict.txt' # Optional: Path to a custom character dictionary
65
- # See PaddleOCROptions documentation or source code for all parameters
66
- )
67
- ocr_elements = page.apply_ocr(engine='paddle', options=paddle_opts)
68
-
69
- # --- Configure EasyOCR ---
70
- easy_opts = EasyOCROptions(
71
- languages=['en', 'fr'],
72
- gpu=True, # Explicitly enable GPU if available
73
- paragraph=True, # Group results into paragraphs (if structure is clear)
74
- detail=1, # Ensure bounding boxes are returned (required)
75
- text_threshold=0.6, # Confidence threshold for text detection (adjust based on tuning table)
76
- link_threshold=0.4, # Standard EasyOCR param; remove this line if the wrapper does not support it
77
- low_text=0.4, # Standard EasyOCR param; remove this line if the wrapper does not support it
78
- batch_size=8 # Processing batch size (adjust based on memory)
79
- # See EasyOCROptions documentation or source code for all parameters
80
- )
81
- ocr_elements = page.apply_ocr(engine='easyocr', options=easy_opts)
82
-
83
- # --- Configure Surya OCR ---
84
- # Surya focuses on line detection and recognition
85
- surya_opts = SuryaOCROptions(
86
- languages=['en', 'de'], # Specify languages for recognition
87
- # device='cuda', # Use GPU ('cuda') or CPU ('cpu') <-- Set via env var TORCH_DEVICE
88
- min_confidence=0.4 # Example: Adjust minimum confidence for results
89
- # Core Surya options like device, batch size, and thresholds are typically
90
- # set via environment variables (see note below).
91
- )
92
- ocr_elements = page.apply_ocr(engine='surya', options=surya_opts)
93
- ```
94
-
95
- ## Applying OCR Directly
96
-
97
- The `page.apply_ocr(...)` and `region.apply_ocr(...)` methods are the primary way to run OCR:
98
-
99
- ```python
100
- # Apply OCR to a page and get the OCR elements
101
- ocr_elements = page.apply_ocr(engine='easyocr')
102
- print(f"Found {len(ocr_elements)} text elements via OCR")
103
-
104
- # Apply OCR to a specific region
105
- title = page.find('text:contains("Title")')
106
- content_region = title.below(height=300)
107
- region_ocr_elements = content_region.apply_ocr(engine='paddle', languages=['en'])
108
- ```
109
-
110
- ## OCR Engines
111
-
112
- Choose the engine best suited for your document and language requirements using the `engine` parameter in `apply_ocr`.
113
-
114
- ## Finding and Working with OCR Text
115
-
116
- After applying OCR, work with the text just like regular text:
117
-
118
- ```python
119
- # Find all OCR text elements
120
- ocr_text = page.find_all('text[source=ocr]')
121
-
122
- # Find high-confidence OCR text
123
- high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
124
-
125
- # Extract text only from OCR elements
126
- ocr_text_content = page.find_all('text[source=ocr]').extract_text()
127
-
128
- # Filter OCR text by content
129
- names = page.find_all('text[source=ocr]:contains("Smith")', case=False)
130
- ```
131
-
132
- ## Visualizing OCR Results
133
-
134
- Visualize OCR results to help debug issues:
135
-
136
- ```python
137
- # Apply OCR
138
- ocr_elements = page.apply_ocr()
139
-
140
- # Highlight all OCR elements
141
- for element in ocr_elements:
142
- # Color based on confidence
143
- if element.confidence >= 0.8:
144
- color = "green" # High confidence
145
- elif element.confidence >= 0.5:
146
- color = "yellow" # Medium confidence
147
- else:
148
- color = "red" # Low confidence
149
-
150
- element.highlight(color=color, label=f"OCR ({element.confidence:.2f})")
151
-
152
- # Get the visualization as an image
153
- image = page.to_image(labels=True)
154
- # Just return the image in a Jupyter cell
155
- image
156
-
157
- # Highlight only high-confidence elements
158
- high_conf = page.find_all('text[source=ocr][confidence>=0.8]')
159
- high_conf.highlight(color="green", label="High Confidence OCR")
160
- ```
161
-
162
- ## Detect + LLM OCR
163
-
164
- Sometimes you have a difficult piece of content where it helps to use a local model to detect where the text is, then send each detected piece off to an LLM to be read. You can do this with Natural PDF!
165
-
166
- ```python
167
- from natural_pdf import PDF
168
- from natural_pdf.ocr.utils import direct_ocr_llm
169
- import openai
170
-
171
- pdf = PDF("needs-ocr.pdf")
172
- page = pdf.pages[0]
173
-
174
- # Detect
175
- page.apply_ocr('paddle', resolution=120, detect_only=True)
176
-
177
- # Build the framework
178
- client = openai.OpenAI(base_url="https://api.anthropic.com/v1/", api_key='sk-XXXXX')
179
- prompt = """OCR this image. Return only the exact text from the image. Include misspellings,
180
- punctuation, etc. Do not surround it with quotation marks. Do not include translations or comments.
181
- The text is from a Greek spreadsheet, so most likely content is Modern Greek or numeric."""
182
-
183
- # This returns the cleaned-up text
184
- def correct(region):
185
- return direct_ocr_llm(region, client, prompt=prompt, resolution=300, model="claude-3-5-haiku-20241022")
186
-
187
- # Run 'correct' on each text element
188
- page.correct_ocr(correct)
189
-
190
- # You're done!
191
- ```
192
-
193
- ## Debugging OCR
194
-
195
- ```python
196
- from natural_pdf.utils.packaging import create_correction_task_package
197
-
198
- create_correction_task_package(pdf, "original.zip", overwrite=True)
199
- ```
200
-
201
- This will be made official at *some point*, but for now you can look at `templates/spa` to see the correction package.
202
-
203
- ## Next Steps
204
-
205
- With OCR capabilities, you can explore:
206
-
207
- - [Layout Analysis](../layout-analysis/index.ipynb) for automatically detecting document structure
208
- - [Document QA](../document-qa/index.ipynb) for asking questions about your documents
209
- - [Visual Debugging](../visual-debugging/index.ipynb) for visualizing OCR results
@@ -1,314 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "bba1860e",
6
- "metadata": {},
7
- "source": [
8
- "# PDF Navigation\n",
9
- "\n",
10
- "This guide covers the basics of working with PDFs in Natural PDF - opening documents, accessing pages, and navigating through content.\n",
11
- "\n",
12
- "## Opening a PDF\n",
13
- "\n",
14
- "The main entry point to Natural PDF is the `PDF` class:"
15
- ]
16
- },
17
- {
18
- "cell_type": "code",
19
- "execution_count": 1,
20
- "id": "56d12ab5",
21
- "metadata": {
22
- "execution": {
23
- "iopub.execute_input": "2025-04-03T14:50:38.434157Z",
24
- "iopub.status.busy": "2025-04-03T14:50:38.433170Z",
25
- "iopub.status.idle": "2025-04-03T14:50:49.768101Z",
26
- "shell.execute_reply": "2025-04-03T14:50:49.767384Z"
27
- }
28
- },
29
- "outputs": [],
30
- "source": [
31
- "from natural_pdf import PDF\n",
32
- "\n",
33
- "# Open a PDF file\n",
34
- "pdf = PDF(\"https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/0500000US42001.pdf\")"
35
- ]
36
- },
37
- {
38
- "cell_type": "markdown",
39
- "id": "c425482a",
40
- "metadata": {},
41
- "source": [
42
- "## Accessing Pages\n",
43
- "\n",
44
- "Once you have a PDF object, you can access its pages:"
45
- ]
46
- },
47
- {
48
- "cell_type": "code",
49
- "execution_count": 2,
50
- "id": "a3405aa9",
51
- "metadata": {
52
- "execution": {
53
- "iopub.execute_input": "2025-04-03T14:50:49.770604Z",
54
- "iopub.status.busy": "2025-04-03T14:50:49.770419Z",
55
- "iopub.status.idle": "2025-04-03T14:50:50.700808Z",
56
- "shell.execute_reply": "2025-04-03T14:50:50.699634Z"
57
- }
58
- },
59
- "outputs": [
60
- {
61
- "name": "stdout",
62
- "output_type": "stream",
63
- "text": [
64
- "This PDF has 153 pages\n",
65
- "Page 1 has 985 characters\n",
66
- "Page 2 has 778 characters\n",
67
- "Page 3 has 522 characters\n",
68
- "Page 4 has 984 characters\n",
69
- "Page 5 has 778 characters\n",
70
- "Page 6 has 523 characters\n"
71
- ]
72
- },
73
- {
74
- "name": "stdout",
75
- "output_type": "stream",
76
- "text": [
77
- "Page 7 has 982 characters\n",
78
- "Page 8 has 772 characters\n",
79
- "Page 9 has 522 characters\n",
80
- "Page 10 has 1008 characters\n"
81
- ]
82
- },
83
- {
84
- "name": "stdout",
85
- "output_type": "stream",
86
- "text": [
87
- "Page 11 has 796 characters\n",
88
- "Page 12 has 532 characters\n",
89
- "Page 13 has 986 characters\n",
90
- "Page 14 has 780 characters\n",
91
- "Page 15 has 523 characters\n",
92
- "Page 16 has 990 characters\n",
93
- "Page 17 has 782 characters\n"
94
- ]
95
- },
96
- {
97
- "name": "stdout",
98
- "output_type": "stream",
99
- "text": [
100
- "Page 18 has 520 characters\n",
101
- "Page 19 has 1006 characters\n",
102
- "Page 20 has 795 characters\n"
103
- ]
104
- }
105
- ],
106
- "source": [
107
- "# Get the total number of pages\n",
108
- "num_pages = len(pdf)\n",
109
- "print(f\"This PDF has {num_pages} pages\")\n",
110
- "\n",
111
- "# Get a specific page (0-indexed)\n",
112
- "first_page = pdf.pages[0]\n",
113
- "last_page = pdf.pages[-1]\n",
114
- "\n",
115
- "# Iterate through the first 20 pages\n",
116
- "for page in pdf.pages[:20]:\n",
117
- " print(f\"Page {page.number} has {len(page.extract_text())} characters\")"
118
- ]
119
- },
120
- {
121
- "cell_type": "markdown",
122
- "id": "2eca7327",
123
- "metadata": {},
124
- "source": [
125
- "## Page Properties\n",
126
- "\n",
127
- "Each `Page` object has useful properties:"
128
- ]
129
- },
130
- {
131
- "cell_type": "code",
132
- "execution_count": 3,
133
- "id": "348f28d7",
134
- "metadata": {
135
- "execution": {
136
- "iopub.execute_input": "2025-04-03T14:50:50.713325Z",
137
- "iopub.status.busy": "2025-04-03T14:50:50.711638Z",
138
- "iopub.status.idle": "2025-04-03T14:50:50.738737Z",
139
- "shell.execute_reply": "2025-04-03T14:50:50.726839Z"
140
- }
141
- },
142
- "outputs": [
143
- {
144
- "name": "stdout",
145
- "output_type": "stream",
146
- "text": [
147
- "612 792\n",
148
- "20\n",
149
- "19\n"
150
- ]
151
- }
152
- ],
153
- "source": [
154
- "# Page dimensions in points (1/72 inch)\n",
155
- "print(page.width, page.height)\n",
156
- "\n",
157
- "# Page number (1-indexed as shown in PDF viewers)\n",
158
- "print(page.number)\n",
159
- "\n",
160
- "# Page index (0-indexed position in the PDF)\n",
161
- "print(page.index)"
162
- ]
163
- },
164
- {
165
- "cell_type": "markdown",
166
- "id": "c7cf1839",
167
- "metadata": {},
168
- "source": [
169
- "## Working Across Pages\n",
170
- "\n",
171
- "Natural PDF makes it easy to work with content across multiple pages:"
172
- ]
173
- },
174
- {
175
- "cell_type": "code",
176
- "execution_count": 4,
177
- "id": "71a8f1ec",
178
- "metadata": {
179
- "execution": {
180
- "iopub.execute_input": "2025-04-03T14:50:50.765495Z",
181
- "iopub.status.busy": "2025-04-03T14:50:50.764444Z",
182
- "iopub.status.idle": "2025-04-03T14:50:57.735494Z",
183
- "shell.execute_reply": "2025-04-03T14:50:57.726489Z"
184
- }
185
- },
186
- "outputs": [
187
- {
188
- "data": {
189
- "text/plain": [
190
- "<natural_pdf.core.pdf.PDF at 0x1045224d0>"
191
- ]
192
- },
193
- "execution_count": 4,
194
- "metadata": {},
195
- "output_type": "execute_result"
196
- }
197
- ],
198
- "source": [
199
- "# Extract text from all pages\n",
200
- "all_text = pdf.extract_text()\n",
201
- "\n",
202
- "# Find elements across all pages\n",
203
- "all_headings = pdf.find_all('text[size>=14]:bold')\n",
204
- "\n",
205
- "# Add exclusion zones to all pages (like headers/footers)\n",
206
- "pdf.add_exclusion(\n",
207
- " lambda page: page.find('text:contains(\"CONFIDENTIAL\")').above() if page.find('text:contains(\"CONFIDENTIAL\")') else None,\n",
208
- " label=\"header\"\n",
209
- ")"
210
- ]
211
- },
212
- {
213
- "cell_type": "markdown",
214
- "id": "e18051a4",
215
- "metadata": {},
216
- "source": [
217
- "## The Page Collection\n",
218
- "\n",
219
- "The `pdf.pages` object is a `PageCollection` that allows batch operations on pages:"
220
- ]
221
- },
222
- {
223
- "cell_type": "code",
224
- "execution_count": 5,
225
- "id": "e5f1c662",
226
- "metadata": {
227
- "execution": {
228
- "iopub.execute_input": "2025-04-03T14:50:57.752240Z",
229
- "iopub.status.busy": "2025-04-03T14:50:57.751868Z",
230
- "iopub.status.idle": "2025-04-03T14:50:57.770738Z",
231
- "shell.execute_reply": "2025-04-03T14:50:57.759415Z"
232
- }
233
- },
234
- "outputs": [],
235
- "source": [
236
- "# Extract text from specific pages\n",
237
- "text = pdf.pages[2:5].extract_text()\n",
238
- "\n",
239
- "# Find elements across specific pages\n",
240
- "elements = pdf.pages[2:5].find_all('text:contains(\"Annual Report\")')"
241
- ]
242
- },
243
- {
244
- "cell_type": "markdown",
245
- "id": "9713e392",
246
- "metadata": {},
247
- "source": [
248
- "## Document Sections Across Pages\n",
249
- "\n",
250
- "You can extract sections that span across multiple pages:"
251
- ]
252
- },
253
- {
254
- "cell_type": "code",
255
- "execution_count": 6,
256
- "id": "d5b89a2b",
257
- "metadata": {
258
- "execution": {
259
- "iopub.execute_input": "2025-04-03T14:50:57.782621Z",
260
- "iopub.status.busy": "2025-04-03T14:50:57.781776Z",
261
- "iopub.status.idle": "2025-04-03T14:50:57.811508Z",
262
- "shell.execute_reply": "2025-04-03T14:50:57.805310Z"
263
- }
264
- },
265
- "outputs": [],
266
- "source": [
267
- "# Get sections with headings as section starts\n",
268
- "sections = pdf.pages.get_sections(\n",
269
- " start_elements='text[size>=14]:bold',\n",
270
- " new_section_on_page_break=False\n",
271
- ")"
272
- ]
273
- },
274
- {
275
- "cell_type": "markdown",
276
- "id": "f51594ce",
277
- "metadata": {},
278
- "source": [
279
- "## Next Steps\n",
280
- "\n",
281
- "Now that you know how to navigate PDFs, you can:\n",
282
- "\n",
283
- "- [Find elements using selectors](../element-selection/index.ipynb)\n",
284
- "- [Extract text from your documents](../text-extraction/index.ipynb)\n",
285
- "- [Work with specific regions](../regions/index.ipynb)"
286
- ]
287
- }
288
- ],
289
- "metadata": {
290
- "jupytext": {
291
- "cell_metadata_filter": "-all",
292
- "main_language": "python",
293
- "notebook_metadata_filter": "-all",
294
- "text_representation": {
295
- "extension": ".md",
296
- "format_name": "markdown"
297
- }
298
- },
299
- "language_info": {
300
- "codemirror_mode": {
301
- "name": "ipython",
302
- "version": 3
303
- },
304
- "file_extension": ".py",
305
- "mimetype": "text/x-python",
306
- "name": "python",
307
- "nbconvert_exporter": "python",
308
- "pygments_lexer": "ipython3",
309
- "version": "3.10.13"
310
- }
311
- },
312
- "nbformat": 4,
313
- "nbformat_minor": 5
314
- }