natural-pdf 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. docs/api/index.md +386 -0
  2. docs/assets/favicon.png +3 -0
  3. docs/assets/favicon.svg +3 -0
  4. docs/assets/javascripts/custom.js +17 -0
  5. docs/assets/logo.svg +3 -0
  6. docs/assets/sample-screen.png +0 -0
  7. docs/assets/social-preview.png +17 -0
  8. docs/assets/social-preview.svg +17 -0
  9. docs/assets/stylesheets/custom.css +65 -0
  10. docs/document-qa/index.ipynb +435 -0
  11. docs/document-qa/index.md +79 -0
  12. docs/element-selection/index.ipynb +915 -0
  13. docs/element-selection/index.md +229 -0
  14. docs/index.md +170 -0
  15. docs/installation/index.md +69 -0
  16. docs/interactive-widget/index.ipynb +962 -0
  17. docs/interactive-widget/index.md +12 -0
  18. docs/layout-analysis/index.ipynb +818 -0
  19. docs/layout-analysis/index.md +185 -0
  20. docs/ocr/index.md +222 -0
  21. docs/pdf-navigation/index.ipynb +314 -0
  22. docs/pdf-navigation/index.md +97 -0
  23. docs/regions/index.ipynb +816 -0
  24. docs/regions/index.md +294 -0
  25. docs/tables/index.ipynb +658 -0
  26. docs/tables/index.md +144 -0
  27. docs/text-analysis/index.ipynb +370 -0
  28. docs/text-analysis/index.md +105 -0
  29. docs/text-extraction/index.ipynb +1478 -0
  30. docs/text-extraction/index.md +292 -0
  31. docs/tutorials/01-loading-and-extraction.ipynb +1696 -0
  32. docs/tutorials/01-loading-and-extraction.md +95 -0
  33. docs/tutorials/02-finding-elements.ipynb +340 -0
  34. docs/tutorials/02-finding-elements.md +149 -0
  35. docs/tutorials/03-extracting-blocks.ipynb +147 -0
  36. docs/tutorials/03-extracting-blocks.md +48 -0
  37. docs/tutorials/04-table-extraction.ipynb +114 -0
  38. docs/tutorials/04-table-extraction.md +50 -0
  39. docs/tutorials/05-excluding-content.ipynb +270 -0
  40. docs/tutorials/05-excluding-content.md +109 -0
  41. docs/tutorials/06-document-qa.ipynb +332 -0
  42. docs/tutorials/06-document-qa.md +91 -0
  43. docs/tutorials/07-layout-analysis.ipynb +260 -0
  44. docs/tutorials/07-layout-analysis.md +66 -0
  45. docs/tutorials/07-working-with-regions.ipynb +409 -0
  46. docs/tutorials/07-working-with-regions.md +151 -0
  47. docs/tutorials/08-spatial-navigation.ipynb +508 -0
  48. docs/tutorials/08-spatial-navigation.md +190 -0
  49. docs/tutorials/09-section-extraction.ipynb +2434 -0
  50. docs/tutorials/09-section-extraction.md +256 -0
  51. docs/tutorials/10-form-field-extraction.ipynb +484 -0
  52. docs/tutorials/10-form-field-extraction.md +201 -0
  53. docs/tutorials/11-enhanced-table-processing.ipynb +54 -0
  54. docs/tutorials/11-enhanced-table-processing.md +9 -0
  55. docs/tutorials/12-ocr-integration.ipynb +586 -0
  56. docs/tutorials/12-ocr-integration.md +188 -0
  57. docs/tutorials/13-semantic-search.ipynb +1888 -0
  58. docs/tutorials/13-semantic-search.md +77 -0
  59. docs/visual-debugging/index.ipynb +2970 -0
  60. docs/visual-debugging/index.md +157 -0
  61. docs/visual-debugging/region.png +0 -0
  62. natural_pdf/__init__.py +39 -20
  63. natural_pdf/analyzers/__init__.py +2 -1
  64. natural_pdf/analyzers/layout/base.py +32 -24
  65. natural_pdf/analyzers/layout/docling.py +131 -72
  66. natural_pdf/analyzers/layout/layout_analyzer.py +156 -113
  67. natural_pdf/analyzers/layout/layout_manager.py +98 -58
  68. natural_pdf/analyzers/layout/layout_options.py +32 -17
  69. natural_pdf/analyzers/layout/paddle.py +152 -95
  70. natural_pdf/analyzers/layout/surya.py +164 -92
  71. natural_pdf/analyzers/layout/tatr.py +149 -84
  72. natural_pdf/analyzers/layout/yolo.py +84 -44
  73. natural_pdf/analyzers/text_options.py +22 -15
  74. natural_pdf/analyzers/text_structure.py +131 -85
  75. natural_pdf/analyzers/utils.py +30 -23
  76. natural_pdf/collections/pdf_collection.py +125 -97
  77. natural_pdf/core/__init__.py +1 -1
  78. natural_pdf/core/element_manager.py +416 -337
  79. natural_pdf/core/highlighting_service.py +268 -196
  80. natural_pdf/core/page.py +907 -513
  81. natural_pdf/core/pdf.py +385 -287
  82. natural_pdf/elements/__init__.py +1 -1
  83. natural_pdf/elements/base.py +302 -214
  84. natural_pdf/elements/collections.py +708 -508
  85. natural_pdf/elements/line.py +39 -36
  86. natural_pdf/elements/rect.py +32 -30
  87. natural_pdf/elements/region.py +854 -883
  88. natural_pdf/elements/text.py +122 -99
  89. natural_pdf/exporters/__init__.py +0 -1
  90. natural_pdf/exporters/searchable_pdf.py +261 -102
  91. natural_pdf/ocr/__init__.py +23 -14
  92. natural_pdf/ocr/engine.py +17 -8
  93. natural_pdf/ocr/engine_easyocr.py +63 -47
  94. natural_pdf/ocr/engine_paddle.py +97 -68
  95. natural_pdf/ocr/engine_surya.py +54 -44
  96. natural_pdf/ocr/ocr_manager.py +88 -62
  97. natural_pdf/ocr/ocr_options.py +16 -10
  98. natural_pdf/qa/__init__.py +1 -1
  99. natural_pdf/qa/document_qa.py +119 -111
  100. natural_pdf/search/__init__.py +37 -31
  101. natural_pdf/search/haystack_search_service.py +312 -189
  102. natural_pdf/search/haystack_utils.py +186 -122
  103. natural_pdf/search/search_options.py +25 -14
  104. natural_pdf/search/search_service_protocol.py +12 -6
  105. natural_pdf/search/searchable_mixin.py +261 -176
  106. natural_pdf/selectors/__init__.py +2 -1
  107. natural_pdf/selectors/parser.py +159 -316
  108. natural_pdf/templates/__init__.py +1 -1
  109. natural_pdf/utils/highlighting.py +8 -2
  110. natural_pdf/utils/reading_order.py +65 -63
  111. natural_pdf/utils/text_extraction.py +195 -0
  112. natural_pdf/utils/visualization.py +70 -61
  113. natural_pdf/widgets/__init__.py +2 -3
  114. natural_pdf/widgets/viewer.py +749 -718
  115. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/METADATA +15 -1
  116. natural_pdf-0.1.5.dist-info/RECORD +134 -0
  117. natural_pdf-0.1.5.dist-info/top_level.txt +5 -0
  118. notebooks/Examples.ipynb +1293 -0
  119. pdfs/.gitkeep +0 -0
  120. pdfs/01-practice.pdf +543 -0
  121. pdfs/0500000US42001.pdf +0 -0
  122. pdfs/0500000US42007.pdf +0 -0
  123. pdfs/2014 Statistics.pdf +0 -0
  124. pdfs/2019 Statistics.pdf +0 -0
  125. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  126. pdfs/needs-ocr.pdf +0 -0
  127. tests/test_loading.py +50 -0
  128. tests/test_optional_deps.py +298 -0
  129. natural_pdf-0.1.4.dist-info/RECORD +0 -61
  130. natural_pdf-0.1.4.dist-info/top_level.txt +0 -1
  131. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/WHEEL +0 -0
  132. {natural_pdf-0.1.4.dist-info → natural_pdf-0.1.5.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,229 @@
1
+ # Finding Elements with Selectors
2
+
3
+ Natural PDF uses CSS-like selectors to find elements (text, lines, images, etc.) within a PDF page or document. This guide demonstrates how to use these selectors effectively.
4
+
5
+ ## Setup
6
+
7
+ Let's load a sample PDF to work with. We'll use `01-practice.pdf` which has various elements.
8
+
9
+ ```python
10
+ from natural_pdf import PDF
11
+
12
+ # Load the PDF
13
+ pdf = PDF("https://github.com/jsoma/natural-pdf/raw/refs/heads/main/pdfs/01-practice.pdf")
14
+
15
+ # Select the first page
16
+ page = pdf.pages[0]
17
+
18
+ # Display the page
19
+ page.show()
20
+ ```
21
+
22
+ ## Basic Element Finding
23
+
24
+ The core methods are `find()` (returns the first match) and `find_all()` (returns all matches as an `ElementCollection`).
25
+
26
+ The basic selector structure is `element_type[attribute_filter]:pseudo_class`.
27
+
28
+ ### Finding Text by Content
29
+
30
+ ```python
31
+ # Find the first text element containing "Summary"
32
+ summary_text = page.find('text:contains("Summary")')
33
+ summary_text
34
+ ```
35
+
36
+ ```python
37
+ # Find all text elements containing "Inadequate"
38
+ contains_inadequate = page.find_all('text:contains("Inadequate")')
39
+ len(contains_inadequate)
40
+ ```
41
+
42
+ ```python
43
+ summary_text.highlight(label='summary')
44
+ contains_inadequate.highlight(label="inadequate")
45
+ page.to_image(width=700)
46
+ ```
47
+
48
+ ## Selecting by Element Type
49
+
50
+ You can select specific types of elements found in PDFs.
51
+
52
+ ```python
53
+ # Find all text elements
54
+ all_text = page.find_all('text')
55
+ len(all_text)
56
+ ```
57
+
58
+ ```python
59
+ # Find all rectangle elements
60
+ all_rects = page.find_all('rect')
61
+ len(all_rects)
62
+ ```
63
+
64
+ ```python
65
+ # Find all line elements
66
+ all_lines = page.find_all('line')
67
+ len(all_lines)
68
+ ```
69
+
70
+ ```python
71
+ page.find_all('line').show()
72
+ ```
73
+
74
+ ## Filtering by Attributes
75
+
76
+ Use square brackets `[]` to filter elements by their properties (attributes).
77
+
78
+ ### Common Attributes & Operators
79
+
80
+ | Attribute | Example Usage | Operators | Notes |
81
+ |---------------|------------------------|-----------|-------|
82
+ | `size` (text) | `text[size>=12]` | `>`, `<`, `>=`, `<=` | Font size in points |
83
+ | `fontname` | `text[fontname*=Bold]` | `=`, `*=` | `*=` for contains substring |
84
+ | `color` (text)| `text[color~=red]` | `~=` | Approx. match (name, rgb, hex) |
85
+ | `width` (line)| `line[width>1]` | `>`, `<`, `>=`, `<=` | Line thickness |
86
+ | `source` | `text[source=ocr]` | `=` | `pdf`, `ocr`, `detected` |
87
+ | `type` (region)| `region[type=table]` | `=` | Layout analysis region type |
88
+
89
+ ```python
90
+ # Find large text (size >= 11 points)
91
+ page.find_all('text[size>=11]')
92
+ ```
93
+
94
+ ```python
95
+ # Find text with 'Helvetica' in the font name
96
+ page.find_all('text[fontname*=Helvetica]')
97
+ ```
98
+
99
+ ```python
100
+ # Find red text (using approximate color match)
101
+ # This PDF has text with color (0.8, 0.0, 0.0)
102
+ red_text = page.find_all('text[color~=red]')
103
+ ```
104
+
105
+ ```python
106
+ # Highlight the red text (ignoring existing highlights)
107
+ red_text.show()
108
+ ```
109
+
110
+ ```python
111
+ # Find thick lines (width >= 2)
112
+ page.find_all('line[width>=2]')
113
+ ```
114
+
115
+ ## Using Pseudo-Classes
116
+
117
+ Use colons `:` for special conditions (pseudo-classes).
118
+
119
+ ### Common Pseudo-Classes
120
+
121
+ | Pseudo-Class | Example Usage | Notes |
122
+ |-----------------------|-----------------------------------------|-------|
123
+ | `:contains('text')` | `text:contains('Report')` | Finds elements containing specific text |
124
+ | `:bold` | `text:bold` | Finds text heuristically identified as bold |
125
+ | `:italic` | `text:italic` | Finds text heuristically identified as italic |
126
+ | `:below(selector)` | `text:below('line[width>=2]')` | Finds elements physically below the reference element |
127
+ | `:above(selector)` | `text:above('text:contains("Summary")')`| Finds elements physically above the reference element |
128
+ | `:left-of(selector)` | `line:left-of('rect')` | Finds elements physically left of the reference element |
129
+ | `:right-of(selector)` | `text:right-of('rect')` | Finds elements physically right of the reference element |
130
+ | `:near(selector)` | `text:near('image')` | Finds elements physically near the reference element |
131
+
132
+ *Note: Spatial pseudo-classes such as `:below` and `:above` identify elements based on bounding box positions relative to the **first** element matched by the inner selector.*
133
+
134
+ ```python
135
+ # Find bold text
136
+ page.find_all('text:bold').show()
137
+ ```
138
+
139
+ ```python
140
+ # Combine attribute and pseudo-class: bold text size >= 11
141
+ page.find_all('text[size>=11]:bold')
142
+ ```
143
+
144
+ ### Spatial Pseudo-Classes Examples
145
+
146
+ ```python
147
+ # Find the thick horizontal line first (for reference only; the selector below locates it again)
148
+ ref_line = page.find('line[width>=2]')
149
+
150
+ # Find text elements strictly above that line
151
+ text_above_line = page.find_all('text:above("line[width>=2]")')
152
+ text_above_line
153
+ ```
154
+
155
+ ## Advanced Text Searching Options
156
+
157
+ Pass options to `find()` or `find_all()` for more control over text matching.
158
+
159
+ ```python
160
+ # Case-insensitive search for "summary"
161
+ page.find_all('text:contains("summary")', case=False)
162
+ ```
163
+
164
+ ```python
165
+ # Regular expression search for the inspection ID (e.g., INS-XXX...)
166
+ # The ID is in the red text we found earlier
167
+ page.find_all('text:contains("INS-\\w+")', regex=True)
168
+ ```
169
+
170
+ ```python
171
+ # Combine regex and case-insensitivity
172
+ page.find_all('text:contains("jungle health")', regex=True, case=False)
173
+ ```
174
+
175
+ ## Working with ElementCollections
176
+
177
+ `find_all()` returns an `ElementCollection`, which is like a list but with extra PDF-specific methods.
178
+
179
+ ```python
180
+ # Get all headings (using a selector for large, bold text)
181
+ headings = page.find_all('text[size>=11]:bold')
182
+ headings
183
+ ```
184
+
185
+ ```python
186
+ # Get the first and last heading in reading order
187
+ first = headings.first
188
+ last = headings.last
189
+ (first, last)
190
+ ```
191
+
192
+ ```python
193
+ # Get the physically highest/lowest element in the collection
194
+ highest = headings.highest()
195
+ lowest = headings.lowest()
196
+ (highest, lowest)
197
+ ```
198
+
199
+ ```python
200
+ # Filter the collection further: headings containing "Service"
201
+ service_headings = headings.find_all('text:contains("Service")')
202
+ service_headings
203
+ ```
204
+
205
+ ```python
206
+ # Extract text from all elements in the collection
207
+ headings.extract_text()
208
+ ```
209
+
210
+ *Remember: `.highest()`, `.lowest()`, `.leftmost()`, `.rightmost()` raise errors if the collection spans multiple pages.*
211
+
212
+ ## Font Variants
213
+
214
+ Sometimes PDFs use font variants (prefixes like `AAAAAB+`), which can be useful for selection.
215
+
216
+ ```python
217
+ # Find text elements with a specific font variant prefix (if any exist)
218
+ # This example PDF doesn't use variants, but the selector works like this:
219
+ page.find_all('text[font-variant=AAAAAB]')
220
+ ```
221
+
222
+ ## Next Steps
223
+
224
+ Now that you can find elements, explore:
225
+
226
+ - [Text Extraction](../text-extraction/index.ipynb): Get text content from found elements.
227
+ - [Spatial Navigation](../pdf-navigation/index.ipynb): Use found elements as anchors to navigate (`.above()`, `.below()`, etc.).
228
+ - [Working with Regions](../regions/index.ipynb): Define areas based on found elements.
229
+ - [Visual Debugging](../visual-debugging/index.ipynb): Techniques for highlighting and visualizing elements.
docs/index.md ADDED
@@ -0,0 +1,170 @@
1
+ # Natural PDF
2
+
3
+ A friendly library for working with PDFs, built on top of [pdfplumber](https://github.com/jsvine/pdfplumber).
4
+
5
+ Natural PDF lets you find and extract content from PDFs using simple code that makes sense.
6
+
7
+ - [Live demo here](https://colab.research.google.com/github/jsoma/natural-pdf/blob/main/notebooks/Examples.ipynb)
8
+
9
+ <div style="max-width: 400px; margin: auto"><a href="assets/sample-screen.png"><img src="assets/sample-screen.png"></a></div>
10
+
11
+ ## Installation
12
+
13
+ ```
14
+ pip install natural-pdf
15
+ # All the extras
16
+ pip install "natural-pdf[all]"
17
+ ```
18
+
19
+ ## Quick Example
20
+
21
+ ```python
22
+ from natural_pdf import PDF
23
+
24
+ pdf = PDF('document.pdf')
25
+ page = pdf.pages[0]
26
+
27
+ # Find the title and get content below it
28
+ title = page.find('text:contains("Summary"):bold')
29
+ content = title.below().extract_text()
30
+
31
+ # Exclude everything above 'CONFIDENTIAL' and below the last line on the page
32
+ page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
33
+ page.add_exclusion(page.find_all('line')[-1].below())
34
+
35
+ # Get the clean text without header/footer
36
+ clean_text = page.extract_text()
37
+ ```
38
+
39
+ ## Key Features
40
+
41
+ Here are a few highlights of what you can do:
42
+
43
+ ### Find Elements with Selectors
44
+
45
+ Use CSS-like selectors to find text, shapes, and more.
46
+
47
+ ```python
48
+ # Find bold text containing "Revenue"
49
+ page.find('text:contains("Revenue"):bold').extract_text()
50
+
51
+ # Find all large text
52
+ page.find_all('text[size>=12]').extract_text()
53
+ ```
54
+
55
+ [Learn more about selectors →](element-selection/index.ipynb)
56
+
57
+ ### Navigate Spatially
58
+
59
+ Move around the page relative to elements, not just coordinates.
60
+
61
+ ```python
62
+ # Extract text below a specific heading
63
+ intro_text = page.find('text:contains("Introduction")').below().extract_text()
64
+
65
+ # Extract text from one heading to the next
66
+ methods_text = page.find('text:contains("Methods")').below(
67
+ until='text:contains("Results")'
68
+ ).extract_text()
69
+ ```
70
+
71
+ [Explore more navigation methods →](pdf-navigation/index.ipynb)
72
+
73
+ ### Extract Clean Text
74
+
75
+ Easily extract text content, leaving out common page elements like headers and footers once you have set exclusions for them.
76
+
77
+ ```python
78
+ # Extract all text from the page (respecting exclusions)
79
+ page_text = page.extract_text()
80
+
81
+ # Extract text from a specific region
82
+ some_region = page.find(...)
83
+ region_text = some_region.extract_text()
84
+ ```
85
+
86
+ [Learn about text extraction →](text-extraction/index.ipynb)
87
+ [Learn about exclusion zones →](regions/index.ipynb#exclusion-zones)
88
+
89
+ ### Apply OCR
90
+
91
+ Extract text from scanned documents using various OCR engines.
92
+
93
+ ```python
94
+ # Apply OCR using the default engine
95
+ ocr_elements = page.apply_ocr()
96
+
97
+ # Extract text (will use OCR results if available)
98
+ text = page.extract_text()
99
+ ```
100
+
101
+ [Explore OCR options →](ocr/index.md)
102
+
103
+ ### Analyze Document Layout
104
+
105
+ Use AI models to detect document structures like titles, paragraphs, and tables.
106
+
107
+ ```python
108
+ # Detect document structure
109
+ page.analyze_layout()
110
+
111
+ # Highlight titles and tables
112
+ page.find_all('region[type=title]').highlight(color="purple")
113
+ page.find_all('region[type=table]').highlight(color="blue")
114
+
115
+ # Extract data from the first table
116
+ table_data = page.find('region[type=table]').extract_table()
117
+ ```
118
+
119
+ [Learn about layout models →](layout-analysis/index.ipynb)
120
+ [Working with tables? →](tables/index.ipynb)
121
+
122
+ ### Document Question Answering
123
+
124
+ Ask your documents questions in natural language.
125
+
126
+ ```python
127
+ # Ask a question
128
+ result = pdf.ask("What was the company's revenue in 2022?")
129
+ if result.get("found", False):
130
+ print(f"Answer: {result['answer']}")
131
+ ```
132
+
133
+ [Learn about Document QA →](document-qa/index.ipynb)
134
+
135
+ ### Visualize Your Work
136
+
137
+ Debug and understand your extractions visually.
138
+
139
+ ```python
140
+ # Highlight headings
141
+ page.find_all('text[size>=14]').highlight(color="red", label="Headings")
142
+
143
+ # Launch the interactive viewer (Jupyter)
144
+ # Requires: pip install "natural-pdf[interactive]"
145
+ page.viewer()
146
+
147
+ # Or save an image
148
+ # page.save_image("highlighted.png")
149
+ ```
150
+
151
+ [See more visualization options →](visual-debugging/index.ipynb)
152
+
153
+ ## Documentation Topics
154
+
155
+ Choose what you want to learn about:
156
+
157
+ ### Task-based Guides
158
+ - [Getting Started](installation/index.md): Install the library and run your first extraction
159
+ - [PDF Navigation](pdf-navigation/index.ipynb): Open PDFs and work with pages
160
+ - [Element Selection](element-selection/index.ipynb): Find text and other elements using selectors
161
+ - [Text Extraction](text-extraction/index.ipynb): Extract clean text from documents
162
+ - [Regions](regions/index.ipynb): Work with specific areas of a page
163
+ - [Visual Debugging](visual-debugging/index.ipynb): See what you're extracting
164
+ - [OCR](ocr/index.md): Extract text from scanned documents
165
+ - [Layout Analysis](layout-analysis/index.ipynb): Detect document structure
166
+ - [Tables](tables/index.ipynb): Extract tabular data
167
+ - [Document QA](document-qa/index.ipynb): Ask questions of your documents
168
+
169
+ ### Reference
170
+ - [API Reference](api/index.md): Complete library reference
@@ -0,0 +1,69 @@
1
+ # Getting Started with Natural PDF
2
+
3
+ Let's get Natural PDF installed and run your first extraction.
4
+
5
+ ## Installation
6
+
7
+ The base installation includes the core library and necessary AI dependencies (like PyTorch and Transformers):
8
+
9
+ ```bash
10
+ pip install natural-pdf
11
+ ```
12
+
13
+ ### Optional Dependencies
14
+
15
+ Natural PDF has modular dependencies for different features. Install them based on your needs:
16
+
17
+ ```bash
18
+ # --- OCR Engines ---
19
+ # Install support for EasyOCR
20
+ pip install "natural-pdf[easyocr]"
21
+
22
+ # Install support for PaddleOCR (requires paddlepaddle)
23
+ pip install "natural-pdf[paddle]"
24
+
25
+ # Install support for Surya OCR
26
+ pip install "natural-pdf[surya]"
27
+
28
+ # --- Layout Detection ---
29
+ # Install support for YOLO layout model
30
+ pip install "natural-pdf[layout_yolo]"
31
+
32
+ # --- Interactive Widget ---
33
+ # Install support for the interactive .viewer() widget in Jupyter
34
+ pip install "natural-pdf[interactive]"
35
+
36
+ # --- All Features ---
37
+ # Install all optional dependencies
38
+ pip install "natural-pdf[all]"
39
+ ```
40
+
41
+ ## Your First PDF Extraction
42
+
43
+ Here's a quick example to make sure everything is working:
44
+
45
+ ```python
46
+ from natural_pdf import PDF
47
+
48
+ # Open a PDF
49
+ pdf = PDF('your_document.pdf')
50
+
51
+ # Get the first page
52
+ page = pdf.pages[0]
53
+
54
+ # Extract all text
55
+ text = page.extract_text()
56
+ print(text)
57
+
58
+ # Find something specific
59
+ title = page.find('text:bold')
60
+ print(f"Found title: {title.text}")
61
+ ```
62
+
63
+ ## What's Next?
64
+
65
+ Now that you have Natural PDF installed, you can:
66
+
67
+ - Learn to [navigate PDFs](../pdf-navigation/index.ipynb)
68
+ - Explore how to [select elements](../element-selection/index.ipynb)
69
+ - See how to [extract text](../text-extraction/index.ipynb)