natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. natural_pdf/__init__.py +7 -2
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +4 -2
  8. natural_pdf/core/pdf.py +53 -38
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +203 -59
  11. natural_pdf/elements/region.py +43 -11
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +40 -61
  16. natural_pdf/exporters/hocr_font.py +7 -13
  17. natural_pdf/exporters/original_pdf.py +10 -13
  18. natural_pdf/exporters/searchable_pdf.py +0 -10
  19. natural_pdf/search/__init__.py +65 -52
  20. natural_pdf/search/lancedb_search_service.py +325 -0
  21. natural_pdf/search/numpy_search_service.py +255 -0
  22. natural_pdf/search/searchable_mixin.py +25 -71
  23. natural_pdf/widgets/viewer.py +22 -31
  24. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
  25. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
  26. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  27. natural_pdf/search/haystack_search_service.py +0 -687
  28. natural_pdf/search/haystack_utils.py +0 -474
  29. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  30. {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
4
4
  from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
5
5
 
6
6
  # Now import the flag from the canonical source - this import should always work
7
- from .haystack_utils import HAS_HAYSTACK_EXTRAS
8
7
 
9
8
  DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
10
9
 
@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
108
107
  logger.info(
109
108
  f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
110
109
  )
111
- # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
112
110
  self._search_service = service
113
111
  else:
114
112
  # Create new service
@@ -125,28 +123,17 @@ class SearchableMixin(ABC):
125
123
  logger.info(
126
124
  f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
127
125
  )
128
- try:
129
- service_args = {
130
- "collection_name": effective_collection_name,
131
- "persist": effective_persist,
132
- **kwargs,
133
- }
134
- if embedding_model:
135
- service_args["embedding_model"] = embedding_model
136
- self._search_service = get_search_service(**service_args)
137
- except ImportError as ie: # Catch the specific ImportError first
138
- logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
139
- raise ie # Re-raise the original ImportError
140
- except Exception as e:
141
- logger.error(
142
- f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
143
- )
144
- # Keep the RuntimeError for other unexpected creation errors
145
- raise RuntimeError(
146
- "Could not create SearchService instance due to an unexpected error."
147
- ) from e
126
+
127
+ # Direct creation without try/except
128
+ service_args = {
129
+ "collection_name": effective_collection_name,
130
+ "persist": effective_persist,
131
+ **kwargs,
132
+ }
133
+ if embedding_model:
134
+ service_args["embedding_model"] = embedding_model
135
+ self._search_service = get_search_service(**service_args)
148
136
 
149
- # --- Optional Immediate Indexing (with safety check for persistent) ---
150
137
  if index:
151
138
  if not self._search_service: # Should not happen if logic above is correct
152
139
  raise RuntimeError(
@@ -176,8 +163,6 @@ class SearchableMixin(ABC):
176
163
  logger.warning(
177
164
  f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
178
165
  )
179
- # else: # Not persistent, safe to proceed without existence check
180
- # logger.debug("Proceeding with index=True for non-persistent index.")
181
166
 
182
167
  # Proceed with indexing if checks passed or not applicable
183
168
  logger.info(
@@ -197,12 +182,8 @@ class SearchableMixin(ABC):
197
182
  f"Starting internal indexing process into SearchService collection '{collection_name}'..."
198
183
  )
199
184
 
200
- # Use the abstract method to get items
201
- try:
202
- indexable_items = list(self.get_indexable_items()) # Consume iterator
203
- except Exception as e:
204
- logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
205
- raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
185
+ # Get indexable items without try/except
186
+ indexable_items = list(self.get_indexable_items()) # Consume iterator
206
187
 
207
188
  if not indexable_items:
208
189
  logger.warning(
@@ -211,27 +192,19 @@ class SearchableMixin(ABC):
211
192
  return
212
193
 
213
194
  logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
214
- try:
215
- logger.debug(
216
- f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
217
- )
218
- self._search_service.index(
219
- documents=indexable_items,
220
- embedder_device=embedder_device,
221
- force_reindex=force_reindex,
222
- )
223
- logger.info(
224
- f"Successfully completed indexing into SearchService collection '{collection_name}'."
225
- )
226
- except IndexConfigurationError as ice:
227
- logger.error(
228
- f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
229
- exc_info=True,
230
- )
231
- raise # Re-raise specific error
232
- except Exception as e: # Catch other indexing errors from the service
233
- logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
234
- raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
195
+ logger.debug(
196
+ f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
197
+ )
198
+
199
+ # Call index without try/except
200
+ self._search_service.index(
201
+ documents=indexable_items,
202
+ embedder_device=embedder_device,
203
+ force_reindex=force_reindex,
204
+ )
205
+ logger.info(
206
+ f"Successfully completed indexing into SearchService collection '{collection_name}'."
207
+ )
235
208
 
236
209
  def index_for_search(
237
210
  self,
@@ -254,14 +227,12 @@ class SearchableMixin(ABC):
254
227
  Returns:
255
228
  Self for method chaining.
256
229
  """
257
- # --- Ensure Service is Initialized (Use Default if Needed) ---
258
230
  if not self._search_service:
259
231
  logger.info(
260
232
  "Search service not initialized prior to index_for_search. Initializing default in-memory service."
261
233
  )
262
234
  self.init_search() # Call init with defaults
263
235
 
264
- # --- Perform Indexing ---
265
236
  self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
266
237
  return self
267
238
 
@@ -289,7 +260,6 @@ class SearchableMixin(ABC):
289
260
  RuntimeError: If no search service is configured or provided, or if search fails.
290
261
  FileNotFoundError: If the collection managed by the service does not exist.
291
262
  """
292
- # --- Determine which Search Service to use ---
293
263
  effective_service = search_service or self._search_service
294
264
  if not effective_service:
295
265
  raise RuntimeError(
@@ -302,21 +272,9 @@ class SearchableMixin(ABC):
302
272
  f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
303
273
  )
304
274
 
305
- # --- Prepare Query and Options ---
306
275
  query_input = query
307
- # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
308
- # If we keep it here, it makes the mixin less generic.
309
- # Let's assume the SearchService handles the query type appropriately for now.
310
- # if isinstance(query, Region):
311
- # logger.debug("Query is a Region object. Extracting text.")
312
- # query_input = query.extract_text()
313
- # if not query_input or query_input.isspace():
314
- # logger.warning("Region provided for query has no extractable text.")
315
- # return []
316
-
317
276
  effective_options = options if options is not None else TextSearchOptions()
318
277
 
319
- # --- Call SearchService Search Method ---
320
278
  try:
321
279
  results = effective_service.search(
322
280
  query=query_input,
@@ -336,7 +294,6 @@ class SearchableMixin(ABC):
336
294
  # Consider wrapping in a SearchError?
337
295
  raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
338
296
 
339
- # --- NEW Sync Method ---
340
297
  def sync_index(
341
298
  self,
342
299
  strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
@@ -378,7 +335,6 @@ class SearchableMixin(ABC):
378
335
  )
379
336
  summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
380
337
 
381
- # --- Check Service Capabilities for 'full' sync ---
382
338
  if strategy == "full":
383
339
  required_methods = ["list_documents", "delete_documents"]
384
340
  missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
@@ -388,7 +344,6 @@ class SearchableMixin(ABC):
388
344
  f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
389
345
  )
390
346
 
391
- # --- 1. Get Desired State (from current collection) ---
392
347
  desired_state: Dict[str, Indexable] = {} # {id: item}
393
348
  desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
394
349
  try:
@@ -426,7 +381,6 @@ class SearchableMixin(ABC):
426
381
 
427
382
  logger.info(f"Desired state contains {len(desired_state)} indexable items.")
428
383
 
429
- # --- 2. Handle Different Strategies ---
430
384
  if strategy == "upsert_only":
431
385
  # Simple case: just index everything, let the service handle upserts
432
386
  items_to_index = list(desired_state.values())
@@ -31,20 +31,6 @@ try:
31
31
  from PIL import Image
32
32
  from traitlets import Dict, List, Unicode, observe
33
33
 
34
- # --- Read JS code from file (only needed if widgets are defined) --- #
35
- _MODULE_DIR = os.path.dirname(__file__)
36
- _FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
37
- try:
38
- with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
39
- _FRONTEND_JS_CODE = f.read()
40
- logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
41
- except FileNotFoundError:
42
- logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
43
- _FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
44
- except Exception as e:
45
- logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
46
- _FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
47
-
48
34
  # --- Define Widget Classes ONLY if ipywidgets is available ---
49
35
  class SimpleInteractiveViewerWidget(widgets.DOMWidget):
50
36
  def __init__(self, pdf_data=None, **kwargs):
@@ -631,7 +617,7 @@ try:
631
617
 
632
618
  # Filter out 'char' elements
633
619
  filtered_page_elements = [
634
- el for el in page_elements if getattr(el, "type", "").lower() != "char"
620
+ el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
635
621
  ]
636
622
  logger.debug(
637
623
  f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
@@ -659,19 +645,21 @@ try:
659
645
 
660
646
  for i, element in enumerate(filtered_page_elements):
661
647
  # Get original coordinates and calculated width/height (always present via base class)
648
+ # Assuming 'element' is always an object with these attributes now
662
649
  original_x0 = element.x0
663
650
  original_y0 = element.top
664
651
  original_x1 = element.x1
665
652
  original_y1 = element.bottom
666
653
  width = element.width
667
654
  height = element.height
655
+ current_element_type = element.type # Direct attribute access
668
656
  scale = 1.0
669
657
 
670
658
  # Base element dict with required info
671
659
  elem_dict = {
672
660
  "id": i,
673
661
  # Use the standardized .type property
674
- "type": element.type,
662
+ "type": current_element_type,
675
663
  # Scaled coordinates for positioning in HTML/SVG
676
664
  "x0": original_x0 * scale,
677
665
  "y0": original_y0 * scale,
@@ -684,21 +672,24 @@ try:
684
672
  # --- Get Default Attributes --- #
685
673
  attributes_found = set()
686
674
  for attr_name in default_attributes_to_get:
675
+ # Assuming 'element' is always an object
687
676
  if hasattr(element, attr_name):
688
677
  try:
689
- value = getattr(element, attr_name)
678
+ value_to_process = getattr(element, attr_name)
690
679
  # Convert non-JSON serializable types to string
691
- processed_value = value
680
+ processed_value = value_to_process
692
681
  if (
693
- not isinstance(value, (str, int, float, bool, list, dict, tuple))
694
- and value is not None
682
+ not isinstance(
683
+ value_to_process, (str, int, float, bool, list, dict, tuple)
684
+ )
685
+ and value_to_process is not None
695
686
  ):
696
- processed_value = str(value)
687
+ processed_value = str(value_to_process)
697
688
  elem_dict[attr_name] = processed_value
698
689
  attributes_found.add(attr_name)
699
690
  except Exception as e:
700
691
  logger.warning(
701
- f"Could not get or process default attribute '{attr_name}' for element {i} ({element.type}): {e}"
692
+ f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
702
693
  )
703
694
 
704
695
  # --- Get User-Requested Attributes (if any) --- #
@@ -707,23 +698,23 @@ try:
707
698
  # Only process if not already added and exists
708
699
  if attr_name not in attributes_found and hasattr(element, attr_name):
709
700
  try:
710
- value = getattr(element, attr_name)
711
- processed_value = value
701
+ value_to_process = getattr(element, attr_name)
702
+ processed_value = value_to_process
712
703
  if (
713
704
  not isinstance(
714
- value, (str, int, float, bool, list, dict, tuple)
705
+ value_to_process, (str, int, float, bool, list, dict, tuple)
715
706
  )
716
- and value is not None
707
+ and value_to_process is not None
717
708
  ):
718
- processed_value = str(value)
709
+ processed_value = str(value_to_process)
719
710
  elem_dict[attr_name] = processed_value
720
711
  except Exception as e:
721
712
  logger.warning(
722
- f"Could not get or process requested attribute '{attr_name}' for element {i} ({element.type}): {e}"
713
+ f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
723
714
  )
724
- for attr_name in elem_dict:
725
- if isinstance(elem_dict[attr_name], float):
726
- elem_dict[attr_name] = round(elem_dict[attr_name], 2)
715
+ for attr_name_val in elem_dict: # Renamed to avoid conflict
716
+ if isinstance(elem_dict[attr_name_val], float):
717
+ elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
727
718
  elements.append(elem_dict)
728
719
 
729
720
  logger.debug(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: natural-pdf
3
- Version: 0.1.11
3
+ Version: 0.1.12
4
4
  Summary: A more intuitive interface for working with PDFs
5
5
  Author-email: Jonathan Soma <jonathan.soma@gmail.com>
6
6
  License-Expression: MIT
@@ -12,20 +12,16 @@ Requires-Python: >=3.9
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pdfplumber
15
- Requires-Dist: Pillow
15
+ Requires-Dist: pillow
16
16
  Requires-Dist: colour
17
17
  Requires-Dist: numpy
18
18
  Requires-Dist: urllib3
19
19
  Requires-Dist: tqdm
20
20
  Requires-Dist: pydantic
21
- Provides-Extra: interactive
22
- Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "interactive"
23
- Provides-Extra: haystack
24
- Requires-Dist: haystack-ai; extra == "haystack"
25
- Requires-Dist: lancedb-haystack; extra == "haystack"
26
- Requires-Dist: lancedb; extra == "haystack"
27
- Requires-Dist: sentence-transformers; extra == "haystack"
28
- Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
21
+ Requires-Dist: jenkspy
22
+ Requires-Dist: pikepdf>=9.7.0
23
+ Provides-Extra: viewer
24
+ Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "viewer"
29
25
  Provides-Extra: easyocr
30
26
  Requires-Dist: easyocr; extra == "easyocr"
31
27
  Requires-Dist: natural-pdf[core-ml]; extra == "easyocr"
@@ -41,19 +37,25 @@ Requires-Dist: natural-pdf[core-ml]; extra == "surya"
41
37
  Provides-Extra: doctr
42
38
  Requires-Dist: python-doctr[torch]; extra == "doctr"
43
39
  Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
44
- Provides-Extra: qa
45
- Requires-Dist: natural-pdf[core-ml]; extra == "qa"
46
40
  Provides-Extra: docling
47
41
  Requires-Dist: docling; extra == "docling"
48
42
  Requires-Dist: natural-pdf[core-ml]; extra == "docling"
49
43
  Provides-Extra: llm
50
44
  Requires-Dist: openai>=1.0; extra == "llm"
51
- Provides-Extra: classification
52
- Requires-Dist: sentence-transformers; extra == "classification"
53
- Requires-Dist: timm; extra == "classification"
54
- Requires-Dist: natural-pdf[core-ml]; extra == "classification"
55
45
  Provides-Extra: test
56
46
  Requires-Dist: pytest; extra == "test"
47
+ Provides-Extra: search
48
+ Requires-Dist: lancedb; extra == "search"
49
+ Requires-Dist: pyarrow; extra == "search"
50
+ Provides-Extra: favorites
51
+ Requires-Dist: natural-pdf[deskew]; extra == "favorites"
52
+ Requires-Dist: natural-pdf[llm]; extra == "favorites"
53
+ Requires-Dist: natural-pdf[surya]; extra == "favorites"
54
+ Requires-Dist: natural-pdf[easyocr]; extra == "favorites"
55
+ Requires-Dist: natural-pdf[layout_yolo]; extra == "favorites"
56
+ Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
57
+ Requires-Dist: natural-pdf[viewer]; extra == "favorites"
58
+ Requires-Dist: natural-pdf[search]; extra == "favorites"
57
59
  Provides-Extra: dev
58
60
  Requires-Dist: black; extra == "dev"
59
61
  Requires-Dist: isort; extra == "dev"
@@ -67,29 +69,32 @@ Requires-Dist: pipdeptree; extra == "dev"
67
69
  Requires-Dist: nbformat; extra == "dev"
68
70
  Requires-Dist: jupytext; extra == "dev"
69
71
  Requires-Dist: nbclient; extra == "dev"
72
+ Requires-Dist: ipykernel; extra == "dev"
70
73
  Provides-Extra: deskew
71
74
  Requires-Dist: deskew>=1.5; extra == "deskew"
72
75
  Requires-Dist: img2pdf; extra == "deskew"
73
76
  Provides-Extra: all
74
- Requires-Dist: natural-pdf[interactive]; extra == "all"
75
- Requires-Dist: natural-pdf[haystack]; extra == "all"
77
+ Requires-Dist: natural-pdf[viewer]; extra == "all"
76
78
  Requires-Dist: natural-pdf[easyocr]; extra == "all"
77
79
  Requires-Dist: natural-pdf[paddle]; extra == "all"
78
80
  Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
79
81
  Requires-Dist: natural-pdf[surya]; extra == "all"
80
82
  Requires-Dist: natural-pdf[doctr]; extra == "all"
81
- Requires-Dist: natural-pdf[qa]; extra == "all"
82
83
  Requires-Dist: natural-pdf[ocr-export]; extra == "all"
83
84
  Requires-Dist: natural-pdf[docling]; extra == "all"
84
85
  Requires-Dist: natural-pdf[llm]; extra == "all"
85
- Requires-Dist: natural-pdf[classification]; extra == "all"
86
+ Requires-Dist: natural-pdf[core-ml]; extra == "all"
86
87
  Requires-Dist: natural-pdf[deskew]; extra == "all"
87
88
  Requires-Dist: natural-pdf[test]; extra == "all"
89
+ Requires-Dist: natural-pdf[search]; extra == "all"
88
90
  Provides-Extra: core-ml
89
91
  Requires-Dist: torch; extra == "core-ml"
90
92
  Requires-Dist: torchvision; extra == "core-ml"
91
93
  Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
92
94
  Requires-Dist: huggingface_hub; extra == "core-ml"
95
+ Requires-Dist: sentence-transformers; extra == "core-ml"
96
+ Requires-Dist: numpy; extra == "core-ml"
97
+ Requires-Dist: timm; extra == "core-ml"
93
98
  Provides-Extra: ocr-export
94
99
  Requires-Dist: pikepdf; extra == "ocr-export"
95
100
  Provides-Extra: export-extras
@@ -114,26 +119,11 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
114
119
  pip install natural-pdf
115
120
  ```
116
121
 
117
- For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
122
+ For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install one to two million different extras. If you just want the greatest hits:
118
123
 
119
124
  ```bash
120
- # Example: Install with EasyOCR support
121
- pip install natural-pdf[easyocr]
122
- pip install natural-pdf[surya]
123
- pip install natural-pdf[paddle]
124
-
125
- # Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
126
- pip install natural-pdf[llm]
127
- # (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
128
-
129
- # Example: Install with interactive viewer support
130
- pip install natural-pdf[interactive]
131
-
132
- # Example: Install with semantic search support (Haystack)
133
- pip install natural-pdf[haystack]
134
-
135
- # Install everything
136
- pip install natural-pdf[all]
125
+ # deskewing, OCR (surya) + layout analysis (yolo), interactive browsing
126
+ pip install natural-pdf[favorites]
137
127
  ```
138
128
 
139
129
  See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
@@ -147,25 +137,26 @@ from natural_pdf import PDF
147
137
  pdf = PDF('document.pdf')
148
138
  page = pdf.pages[0]
149
139
 
140
+ # Extract all of the text on the page
141
+ page.extract_text()
142
+
150
143
  # Find elements using CSS-like selectors
151
144
  heading = page.find('text:contains("Summary"):bold')
152
145
 
153
146
  # Extract content below the heading
154
147
  content = heading.below().extract_text()
155
- print("Content below Summary:", content[:100] + "...")
156
148
 
157
- # Exclude headers/footers automatically (example)
158
- # You might define these based on common text or position
159
- page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
160
- page.add_exclusion(page.find_all('line')[-1].below())
149
+ # Examine all the bold text on the page
150
+ page.find_all('text:bold').show()
161
151
 
162
- # Extract clean text from the page
163
- clean_text = page.extract_text()
164
- print("\nClean page text:", clean_text[:200] + "...")
152
+ # Exclude parts of the page from selectors/extractors
153
+ header = page.find('text:contains("CONFIDENTIAL")').above()
154
+ footer = page.find_all('line')[-1].below()
155
+ page.add_exclusion(header)
156
+ page.add_exclusion(footer)
165
157
 
166
- # Highlight the heading and view the page
167
- heading.highlight(color='red')
168
- page.to_image()
158
+ # Extract clean text from the page ignoring exclusions
159
+ clean_text = page.extract_text()
169
160
  ```
170
161
 
171
162
  And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
@@ -186,3 +177,17 @@ Natural PDF offers a range of features for working with PDFs:
186
177
  ## Learn More
187
178
 
188
179
  Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
180
+
181
+ ## Best friends
182
+
183
+ Natural PDF sits on top of a *lot* of fantastic tools and models, some of which are:
184
+
185
+ - [pdfplumber](https://github.com/jsvine/pdfplumber)
186
+ - [EasyOCR](https://www.jaided.ai/easyocr/)
187
+ - [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
188
+ - [Surya](https://github.com/VikParuchuri/surya)
189
+ - A specific [YOLO](https://github.com/opendatalab/DocLayout-YOLO)
190
+ - [deskew](https://github.com/sbrunner/deskew)
191
+ - [doctr](https://github.com/mindee/doctr)
192
+ - [docling](https://github.com/docling-project/docling)
193
+ - [Hugging Face](https://huggingface.co/models)
@@ -1,7 +1,7 @@
1
- natural_pdf/__init__.py,sha256=HIYdzHD7QBRssIseUX_oDJYvVJs646tNSYhKHqk0HeA,2495
1
+ natural_pdf/__init__.py,sha256=0sCYgb9BAV5OnpD_1AswMuOLuXNmpe3OLJpv_6p3tgw,2449
2
2
  natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
3
- natural_pdf/analyzers/text_options.py,sha256=nE2E1pp4psDPpxmtarvNtEQsgozPkyFRjv0TVP2HTyU,2865
4
- natural_pdf/analyzers/text_structure.py,sha256=Uhxc7aYB1jddkiwRTEPOg_Te2HfOua4z_OtgP1m3org,12794
3
+ natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
4
+ natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
5
5
  natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
6
6
  natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
7
7
  natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
@@ -15,31 +15,34 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
15
15
  natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
16
16
  natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
17
17
  natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
18
- natural_pdf/classification/manager.py,sha256=RxJch8xVu8Me6_T2Kh7ZqUNaAKlXvfyCZD0hRc4Hk6w,17929
18
+ natural_pdf/classification/manager.py,sha256=4iHoZQ6W541efFvwJt38K-4n5svYH3uYk0ixWPCt7do,17922
19
19
  natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
20
20
  natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
21
21
  natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
22
22
  natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
23
23
  natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
24
- natural_pdf/core/element_manager.py,sha256=knRN6qXxV-6KZCj2GUOyiqRi83DjJzL77TmKGeiD08Y,25144
25
- natural_pdf/core/highlighting_service.py,sha256=wINdRxq63_CYYA81EwuCRqhNKimn0dNKyoKWuzkirc0,31959
26
- natural_pdf/core/page.py,sha256=S7Uj3DVksX7o3Qg7hpNulYuxHmqzSJIJ0yXVytPhFqY,105158
27
- natural_pdf/core/pdf.py,sha256=qpZx5LXZ5Oq1fZ4mzDXBDOIcsApRinMEH0CjVY6jNvM,69273
24
+ natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
25
+ natural_pdf/core/highlighting_service.py,sha256=F1s9KEVkZb3Srtp1_ayQJayp1ZvDf9FcChsZdLk4yWk,37138
26
+ natural_pdf/core/page.py,sha256=XoHPdsg7YUQIkayD0U1cQ7pNR8NCgV9xkV0rVAO7n3s,105167
27
+ natural_pdf/core/pdf.py,sha256=ssXJviTVKyVZuyiSKv1vE0GW7BACNXBhovIUbPm5MZ4,69511
28
28
  natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
29
- natural_pdf/elements/base.py,sha256=7vVCPQyEHifh4LyBuv0kLTqr_gNbbEMc4SoiJmLfEUQ,37585
30
- natural_pdf/elements/collections.py,sha256=HsNt_4x-yqNI_bDGeNEiih3hotAfrbppmp_O7rq9HGs,107141
29
+ natural_pdf/elements/base.py,sha256=qKU95sJMw6uiIuez57i-3SmMIHvi9ctv7jwIWJI9qnQ,38415
30
+ natural_pdf/elements/collections.py,sha256=gPj_7eONu6zLyYHnWdPRvgHd___FfTW9mg8iwEBsgxg,112383
31
31
  natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
32
32
  natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
33
- natural_pdf/elements/region.py,sha256=XYWUym7hgkzMMfmXw0hEz_iGJ6Sdyf6DRz6XjgMVwN0,97250
33
+ natural_pdf/elements/region.py,sha256=HWqO_Or_wi7pu82w_LUoKHa7r_64AEOJDdmBXSWeZ50,98848
34
34
  natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
35
35
  natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
36
36
  natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
37
37
  natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
38
- natural_pdf/exporters/hocr.py,sha256=wilmVyBgmBNp2ZEdbKijk9ag8E1AGMMl6rBtsAOzp-Y,20201
39
- natural_pdf/exporters/hocr_font.py,sha256=e9QdxeCExxpY_dpzwGxFlT_3TcvNejw9qpkNc1NVa4Y,4612
40
- natural_pdf/exporters/original_pdf.py,sha256=vZeqBsCZh3JRRWwtfHzM78fxvhKkAI4QK3LLkeXidUM,5082
38
+ natural_pdf/exporters/hocr.py,sha256=MOb5sTxe-GlMSOtmqp3p4SY_ZigwOtmd4sj_zMRCIQY,19907
39
+ natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
40
+ natural_pdf/exporters/original_pdf.py,sha256=zsZPg_lUoEerKIzzoEw-qGdM5XBg_LZhFJeVKnCUp4o,5054
41
41
  natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
42
- natural_pdf/exporters/searchable_pdf.py,sha256=-sbjjM4oV2YCiJaVKcUIPXjAs94ouXSyOSlAzv_qM7I,16815
42
+ natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
43
+ natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
+ natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
45
+ natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
43
46
  natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
44
47
  natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
45
48
  natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
@@ -55,12 +58,12 @@ natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqC
55
58
  natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
56
59
  natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
57
60
  natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
58
- natural_pdf/search/__init__.py,sha256=gdGlW3kTCw87iXMwcIesbLkUsnv5UKJmF-_1ZMR0pfQ,3339
59
- natural_pdf/search/haystack_search_service.py,sha256=UHr2UWNBetG3MZ1n_1LnV9oUe5fC-rY9p-V0j00JjQM,30339
60
- natural_pdf/search/haystack_utils.py,sha256=6Hv5DeLSF4AVDrB_aFJZGB3XpSCLQ45dXLKEd4yG2tU,18978
61
+ natural_pdf/search/__init__.py,sha256=72n_Mj_AhF_RCIoBBhZ6EZKjbILM8omelXZ99fXw7n4,3688
62
+ natural_pdf/search/lancedb_search_service.py,sha256=tW7ONPcWGY1HKle_7OqCXRnMCI-aKL-AqneKz2YbLlM,13706
63
+ natural_pdf/search/numpy_search_service.py,sha256=3_8fx7NV-15jBokOU73mcxrznxPxzVQnOlDHf3dpo28,10117
61
64
  natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
62
65
  natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
63
- natural_pdf/search/searchable_mixin.py,sha256=M2a6FaFVM0vcfh7FgjDH6BLhS-7ggeVpcfft4OOBDxY,26390
66
+ natural_pdf/search/searchable_mixin.py,sha256=dZbaHv8Go3TJNqxoPtnp9Dr0Ftxuf_44RpBeIRXkPxc,23534
64
67
  natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
65
68
  natural_pdf/selectors/parser.py,sha256=oI3ezkB6sIyrq_nLJrbaBaBZktXwEp_HG_gKQlVSVcs,24447
66
69
  natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
@@ -75,9 +78,9 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
75
78
  natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7tAY,1601
76
79
  natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
77
80
  natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
78
- natural_pdf/widgets/viewer.py,sha256=dC_hlPlosc08gsDc3bdAa8chOKtAoH9QFU6mrGOG9vE,39532
79
- natural_pdf-0.1.11.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
80
- natural_pdf-0.1.11.dist-info/METADATA,sha256=HBEH41sOW2opbRoN_yUq8iw3jB2fvdOXEDj0ZGfmw8g,7354
81
- natural_pdf-0.1.11.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
82
- natural_pdf-0.1.11.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
83
- natural_pdf-0.1.11.dist-info/RECORD,,
81
+ natural_pdf/widgets/viewer.py,sha256=ekgXTEfA48GrR-JjpCpgyBCXdf4IubV0pAXDJozcU7A,39196
82
+ natural_pdf-0.1.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
83
+ natural_pdf-0.1.12.dist-info/METADATA,sha256=GpzOi_m7e7vOS1vlUsrN0WIc0ncvhxvGvpEuPr5UGY8,7653
84
+ natural_pdf-0.1.12.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
85
+ natural_pdf-0.1.12.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
86
+ natural_pdf-0.1.12.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.3.1)
2
+ Generator: setuptools (80.7.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5