natural-pdf 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +4 -2
- natural_pdf/core/pdf.py +53 -38
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +203 -59
- natural_pdf/elements/region.py +43 -11
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +28 -25
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
|
|
4
4
|
from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
|
5
5
|
|
6
6
|
# Now import the flag from the canonical source - this import should always work
|
7
|
-
from .haystack_utils import HAS_HAYSTACK_EXTRAS
|
8
7
|
|
9
8
|
DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
|
10
9
|
|
@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
|
|
108
107
|
logger.info(
|
109
108
|
f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
|
110
109
|
)
|
111
|
-
# TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
|
112
110
|
self._search_service = service
|
113
111
|
else:
|
114
112
|
# Create new service
|
@@ -125,28 +123,17 @@ class SearchableMixin(ABC):
|
|
125
123
|
logger.info(
|
126
124
|
f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
|
127
125
|
)
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
|
139
|
-
raise ie # Re-raise the original ImportError
|
140
|
-
except Exception as e:
|
141
|
-
logger.error(
|
142
|
-
f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
|
143
|
-
)
|
144
|
-
# Keep the RuntimeError for other unexpected creation errors
|
145
|
-
raise RuntimeError(
|
146
|
-
"Could not create SearchService instance due to an unexpected error."
|
147
|
-
) from e
|
126
|
+
|
127
|
+
# Direct creation without try/except
|
128
|
+
service_args = {
|
129
|
+
"collection_name": effective_collection_name,
|
130
|
+
"persist": effective_persist,
|
131
|
+
**kwargs,
|
132
|
+
}
|
133
|
+
if embedding_model:
|
134
|
+
service_args["embedding_model"] = embedding_model
|
135
|
+
self._search_service = get_search_service(**service_args)
|
148
136
|
|
149
|
-
# --- Optional Immediate Indexing (with safety check for persistent) ---
|
150
137
|
if index:
|
151
138
|
if not self._search_service: # Should not happen if logic above is correct
|
152
139
|
raise RuntimeError(
|
@@ -176,8 +163,6 @@ class SearchableMixin(ABC):
|
|
176
163
|
logger.warning(
|
177
164
|
f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
|
178
165
|
)
|
179
|
-
# else: # Not persistent, safe to proceed without existence check
|
180
|
-
# logger.debug("Proceeding with index=True for non-persistent index.")
|
181
166
|
|
182
167
|
# Proceed with indexing if checks passed or not applicable
|
183
168
|
logger.info(
|
@@ -197,12 +182,8 @@ class SearchableMixin(ABC):
|
|
197
182
|
f"Starting internal indexing process into SearchService collection '{collection_name}'..."
|
198
183
|
)
|
199
184
|
|
200
|
-
#
|
201
|
-
|
202
|
-
indexable_items = list(self.get_indexable_items()) # Consume iterator
|
203
|
-
except Exception as e:
|
204
|
-
logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
|
205
|
-
raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
|
185
|
+
# Get indexable items without try/except
|
186
|
+
indexable_items = list(self.get_indexable_items()) # Consume iterator
|
206
187
|
|
207
188
|
if not indexable_items:
|
208
189
|
logger.warning(
|
@@ -211,27 +192,19 @@ class SearchableMixin(ABC):
|
|
211
192
|
return
|
212
193
|
|
213
194
|
logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
logger.error(
|
228
|
-
f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
|
229
|
-
exc_info=True,
|
230
|
-
)
|
231
|
-
raise # Re-raise specific error
|
232
|
-
except Exception as e: # Catch other indexing errors from the service
|
233
|
-
logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
|
234
|
-
raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
|
195
|
+
logger.debug(
|
196
|
+
f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
|
197
|
+
)
|
198
|
+
|
199
|
+
# Call index without try/except
|
200
|
+
self._search_service.index(
|
201
|
+
documents=indexable_items,
|
202
|
+
embedder_device=embedder_device,
|
203
|
+
force_reindex=force_reindex,
|
204
|
+
)
|
205
|
+
logger.info(
|
206
|
+
f"Successfully completed indexing into SearchService collection '{collection_name}'."
|
207
|
+
)
|
235
208
|
|
236
209
|
def index_for_search(
|
237
210
|
self,
|
@@ -254,14 +227,12 @@ class SearchableMixin(ABC):
|
|
254
227
|
Returns:
|
255
228
|
Self for method chaining.
|
256
229
|
"""
|
257
|
-
# --- Ensure Service is Initialized (Use Default if Needed) ---
|
258
230
|
if not self._search_service:
|
259
231
|
logger.info(
|
260
232
|
"Search service not initialized prior to index_for_search. Initializing default in-memory service."
|
261
233
|
)
|
262
234
|
self.init_search() # Call init with defaults
|
263
235
|
|
264
|
-
# --- Perform Indexing ---
|
265
236
|
self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
|
266
237
|
return self
|
267
238
|
|
@@ -289,7 +260,6 @@ class SearchableMixin(ABC):
|
|
289
260
|
RuntimeError: If no search service is configured or provided, or if search fails.
|
290
261
|
FileNotFoundError: If the collection managed by the service does not exist.
|
291
262
|
"""
|
292
|
-
# --- Determine which Search Service to use ---
|
293
263
|
effective_service = search_service or self._search_service
|
294
264
|
if not effective_service:
|
295
265
|
raise RuntimeError(
|
@@ -302,21 +272,9 @@ class SearchableMixin(ABC):
|
|
302
272
|
f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
|
303
273
|
)
|
304
274
|
|
305
|
-
# --- Prepare Query and Options ---
|
306
275
|
query_input = query
|
307
|
-
# Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
|
308
|
-
# If we keep it here, it makes the mixin less generic.
|
309
|
-
# Let's assume the SearchService handles the query type appropriately for now.
|
310
|
-
# if isinstance(query, Region):
|
311
|
-
# logger.debug("Query is a Region object. Extracting text.")
|
312
|
-
# query_input = query.extract_text()
|
313
|
-
# if not query_input or query_input.isspace():
|
314
|
-
# logger.warning("Region provided for query has no extractable text.")
|
315
|
-
# return []
|
316
|
-
|
317
276
|
effective_options = options if options is not None else TextSearchOptions()
|
318
277
|
|
319
|
-
# --- Call SearchService Search Method ---
|
320
278
|
try:
|
321
279
|
results = effective_service.search(
|
322
280
|
query=query_input,
|
@@ -336,7 +294,6 @@ class SearchableMixin(ABC):
|
|
336
294
|
# Consider wrapping in a SearchError?
|
337
295
|
raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
|
338
296
|
|
339
|
-
# --- NEW Sync Method ---
|
340
297
|
def sync_index(
|
341
298
|
self,
|
342
299
|
strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
|
@@ -378,7 +335,6 @@ class SearchableMixin(ABC):
|
|
378
335
|
)
|
379
336
|
summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
|
380
337
|
|
381
|
-
# --- Check Service Capabilities for 'full' sync ---
|
382
338
|
if strategy == "full":
|
383
339
|
required_methods = ["list_documents", "delete_documents"]
|
384
340
|
missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
|
@@ -388,7 +344,6 @@ class SearchableMixin(ABC):
|
|
388
344
|
f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
|
389
345
|
)
|
390
346
|
|
391
|
-
# --- 1. Get Desired State (from current collection) ---
|
392
347
|
desired_state: Dict[str, Indexable] = {} # {id: item}
|
393
348
|
desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
|
394
349
|
try:
|
@@ -426,7 +381,6 @@ class SearchableMixin(ABC):
|
|
426
381
|
|
427
382
|
logger.info(f"Desired state contains {len(desired_state)} indexable items.")
|
428
383
|
|
429
|
-
# --- 2. Handle Different Strategies ---
|
430
384
|
if strategy == "upsert_only":
|
431
385
|
# Simple case: just index everything, let the service handle upserts
|
432
386
|
items_to_index = list(desired_state.values())
|
natural_pdf/widgets/viewer.py
CHANGED
@@ -31,20 +31,6 @@ try:
|
|
31
31
|
from PIL import Image
|
32
32
|
from traitlets import Dict, List, Unicode, observe
|
33
33
|
|
34
|
-
# --- Read JS code from file (only needed if widgets are defined) --- #
|
35
|
-
_MODULE_DIR = os.path.dirname(__file__)
|
36
|
-
_FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
|
37
|
-
try:
|
38
|
-
with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
|
39
|
-
_FRONTEND_JS_CODE = f.read()
|
40
|
-
logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
|
41
|
-
except FileNotFoundError:
|
42
|
-
logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
|
43
|
-
_FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
|
44
|
-
except Exception as e:
|
45
|
-
logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
|
46
|
-
_FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
|
47
|
-
|
48
34
|
# --- Define Widget Classes ONLY if ipywidgets is available ---
|
49
35
|
class SimpleInteractiveViewerWidget(widgets.DOMWidget):
|
50
36
|
def __init__(self, pdf_data=None, **kwargs):
|
@@ -631,7 +617,7 @@ try:
|
|
631
617
|
|
632
618
|
# Filter out 'char' elements
|
633
619
|
filtered_page_elements = [
|
634
|
-
el for el in page_elements if getattr(el, "type", "").lower() != "char"
|
620
|
+
el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
|
635
621
|
]
|
636
622
|
logger.debug(
|
637
623
|
f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
|
@@ -659,19 +645,21 @@ try:
|
|
659
645
|
|
660
646
|
for i, element in enumerate(filtered_page_elements):
|
661
647
|
# Get original coordinates and calculated width/height (always present via base class)
|
648
|
+
# Assuming 'element' is always an object with these attributes now
|
662
649
|
original_x0 = element.x0
|
663
650
|
original_y0 = element.top
|
664
651
|
original_x1 = element.x1
|
665
652
|
original_y1 = element.bottom
|
666
653
|
width = element.width
|
667
654
|
height = element.height
|
655
|
+
current_element_type = element.type # Direct attribute access
|
668
656
|
scale = 1.0
|
669
657
|
|
670
658
|
# Base element dict with required info
|
671
659
|
elem_dict = {
|
672
660
|
"id": i,
|
673
661
|
# Use the standardized .type property
|
674
|
-
"type":
|
662
|
+
"type": current_element_type,
|
675
663
|
# Scaled coordinates for positioning in HTML/SVG
|
676
664
|
"x0": original_x0 * scale,
|
677
665
|
"y0": original_y0 * scale,
|
@@ -684,21 +672,24 @@ try:
|
|
684
672
|
# --- Get Default Attributes --- #
|
685
673
|
attributes_found = set()
|
686
674
|
for attr_name in default_attributes_to_get:
|
675
|
+
# Assuming 'element' is always an object
|
687
676
|
if hasattr(element, attr_name):
|
688
677
|
try:
|
689
|
-
|
678
|
+
value_to_process = getattr(element, attr_name)
|
690
679
|
# Convert non-JSON serializable types to string
|
691
|
-
processed_value =
|
680
|
+
processed_value = value_to_process
|
692
681
|
if (
|
693
|
-
not isinstance(
|
694
|
-
|
682
|
+
not isinstance(
|
683
|
+
value_to_process, (str, int, float, bool, list, dict, tuple)
|
684
|
+
)
|
685
|
+
and value_to_process is not None
|
695
686
|
):
|
696
|
-
processed_value = str(
|
687
|
+
processed_value = str(value_to_process)
|
697
688
|
elem_dict[attr_name] = processed_value
|
698
689
|
attributes_found.add(attr_name)
|
699
690
|
except Exception as e:
|
700
691
|
logger.warning(
|
701
|
-
f"Could not get or process default attribute '{attr_name}' for element {i} ({
|
692
|
+
f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
|
702
693
|
)
|
703
694
|
|
704
695
|
# --- Get User-Requested Attributes (if any) --- #
|
@@ -707,23 +698,23 @@ try:
|
|
707
698
|
# Only process if not already added and exists
|
708
699
|
if attr_name not in attributes_found and hasattr(element, attr_name):
|
709
700
|
try:
|
710
|
-
|
711
|
-
processed_value =
|
701
|
+
value_to_process = getattr(element, attr_name)
|
702
|
+
processed_value = value_to_process
|
712
703
|
if (
|
713
704
|
not isinstance(
|
714
|
-
|
705
|
+
value_to_process, (str, int, float, bool, list, dict, tuple)
|
715
706
|
)
|
716
|
-
and
|
707
|
+
and value_to_process is not None
|
717
708
|
):
|
718
|
-
processed_value = str(
|
709
|
+
processed_value = str(value_to_process)
|
719
710
|
elem_dict[attr_name] = processed_value
|
720
711
|
except Exception as e:
|
721
712
|
logger.warning(
|
722
|
-
f"Could not get or process requested attribute '{attr_name}' for element {i} ({
|
713
|
+
f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
|
723
714
|
)
|
724
|
-
for
|
725
|
-
if isinstance(elem_dict[
|
726
|
-
elem_dict[
|
715
|
+
for attr_name_val in elem_dict: # Renamed to avoid conflict
|
716
|
+
if isinstance(elem_dict[attr_name_val], float):
|
717
|
+
elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
|
727
718
|
elements.append(elem_dict)
|
728
719
|
|
729
720
|
logger.debug(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: natural-pdf
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.12
|
4
4
|
Summary: A more intuitive interface for working with PDFs
|
5
5
|
Author-email: Jonathan Soma <jonathan.soma@gmail.com>
|
6
6
|
License-Expression: MIT
|
@@ -12,20 +12,16 @@ Requires-Python: >=3.9
|
|
12
12
|
Description-Content-Type: text/markdown
|
13
13
|
License-File: LICENSE
|
14
14
|
Requires-Dist: pdfplumber
|
15
|
-
Requires-Dist:
|
15
|
+
Requires-Dist: pillow
|
16
16
|
Requires-Dist: colour
|
17
17
|
Requires-Dist: numpy
|
18
18
|
Requires-Dist: urllib3
|
19
19
|
Requires-Dist: tqdm
|
20
20
|
Requires-Dist: pydantic
|
21
|
-
|
22
|
-
Requires-Dist:
|
23
|
-
Provides-Extra:
|
24
|
-
Requires-Dist:
|
25
|
-
Requires-Dist: lancedb-haystack; extra == "haystack"
|
26
|
-
Requires-Dist: lancedb; extra == "haystack"
|
27
|
-
Requires-Dist: sentence-transformers; extra == "haystack"
|
28
|
-
Requires-Dist: natural-pdf[core-ml]; extra == "haystack"
|
21
|
+
Requires-Dist: jenkspy
|
22
|
+
Requires-Dist: pikepdf>=9.7.0
|
23
|
+
Provides-Extra: viewer
|
24
|
+
Requires-Dist: ipywidgets<9.0.0,>=7.0.0; extra == "viewer"
|
29
25
|
Provides-Extra: easyocr
|
30
26
|
Requires-Dist: easyocr; extra == "easyocr"
|
31
27
|
Requires-Dist: natural-pdf[core-ml]; extra == "easyocr"
|
@@ -41,19 +37,25 @@ Requires-Dist: natural-pdf[core-ml]; extra == "surya"
|
|
41
37
|
Provides-Extra: doctr
|
42
38
|
Requires-Dist: python-doctr[torch]; extra == "doctr"
|
43
39
|
Requires-Dist: natural-pdf[core-ml]; extra == "doctr"
|
44
|
-
Provides-Extra: qa
|
45
|
-
Requires-Dist: natural-pdf[core-ml]; extra == "qa"
|
46
40
|
Provides-Extra: docling
|
47
41
|
Requires-Dist: docling; extra == "docling"
|
48
42
|
Requires-Dist: natural-pdf[core-ml]; extra == "docling"
|
49
43
|
Provides-Extra: llm
|
50
44
|
Requires-Dist: openai>=1.0; extra == "llm"
|
51
|
-
Provides-Extra: classification
|
52
|
-
Requires-Dist: sentence-transformers; extra == "classification"
|
53
|
-
Requires-Dist: timm; extra == "classification"
|
54
|
-
Requires-Dist: natural-pdf[core-ml]; extra == "classification"
|
55
45
|
Provides-Extra: test
|
56
46
|
Requires-Dist: pytest; extra == "test"
|
47
|
+
Provides-Extra: search
|
48
|
+
Requires-Dist: lancedb; extra == "search"
|
49
|
+
Requires-Dist: pyarrow; extra == "search"
|
50
|
+
Provides-Extra: favorites
|
51
|
+
Requires-Dist: natural-pdf[deskew]; extra == "favorites"
|
52
|
+
Requires-Dist: natural-pdf[llm]; extra == "favorites"
|
53
|
+
Requires-Dist: natural-pdf[surya]; extra == "favorites"
|
54
|
+
Requires-Dist: natural-pdf[easyocr]; extra == "favorites"
|
55
|
+
Requires-Dist: natural-pdf[layout_yolo]; extra == "favorites"
|
56
|
+
Requires-Dist: natural-pdf[ocr-export]; extra == "favorites"
|
57
|
+
Requires-Dist: natural-pdf[viewer]; extra == "favorites"
|
58
|
+
Requires-Dist: natural-pdf[search]; extra == "favorites"
|
57
59
|
Provides-Extra: dev
|
58
60
|
Requires-Dist: black; extra == "dev"
|
59
61
|
Requires-Dist: isort; extra == "dev"
|
@@ -67,29 +69,32 @@ Requires-Dist: pipdeptree; extra == "dev"
|
|
67
69
|
Requires-Dist: nbformat; extra == "dev"
|
68
70
|
Requires-Dist: jupytext; extra == "dev"
|
69
71
|
Requires-Dist: nbclient; extra == "dev"
|
72
|
+
Requires-Dist: ipykernel; extra == "dev"
|
70
73
|
Provides-Extra: deskew
|
71
74
|
Requires-Dist: deskew>=1.5; extra == "deskew"
|
72
75
|
Requires-Dist: img2pdf; extra == "deskew"
|
73
76
|
Provides-Extra: all
|
74
|
-
Requires-Dist: natural-pdf[
|
75
|
-
Requires-Dist: natural-pdf[haystack]; extra == "all"
|
77
|
+
Requires-Dist: natural-pdf[viewer]; extra == "all"
|
76
78
|
Requires-Dist: natural-pdf[easyocr]; extra == "all"
|
77
79
|
Requires-Dist: natural-pdf[paddle]; extra == "all"
|
78
80
|
Requires-Dist: natural-pdf[layout_yolo]; extra == "all"
|
79
81
|
Requires-Dist: natural-pdf[surya]; extra == "all"
|
80
82
|
Requires-Dist: natural-pdf[doctr]; extra == "all"
|
81
|
-
Requires-Dist: natural-pdf[qa]; extra == "all"
|
82
83
|
Requires-Dist: natural-pdf[ocr-export]; extra == "all"
|
83
84
|
Requires-Dist: natural-pdf[docling]; extra == "all"
|
84
85
|
Requires-Dist: natural-pdf[llm]; extra == "all"
|
85
|
-
Requires-Dist: natural-pdf[
|
86
|
+
Requires-Dist: natural-pdf[core-ml]; extra == "all"
|
86
87
|
Requires-Dist: natural-pdf[deskew]; extra == "all"
|
87
88
|
Requires-Dist: natural-pdf[test]; extra == "all"
|
89
|
+
Requires-Dist: natural-pdf[search]; extra == "all"
|
88
90
|
Provides-Extra: core-ml
|
89
91
|
Requires-Dist: torch; extra == "core-ml"
|
90
92
|
Requires-Dist: torchvision; extra == "core-ml"
|
91
93
|
Requires-Dist: transformers[sentencepiece]; extra == "core-ml"
|
92
94
|
Requires-Dist: huggingface_hub; extra == "core-ml"
|
95
|
+
Requires-Dist: sentence-transformers; extra == "core-ml"
|
96
|
+
Requires-Dist: numpy; extra == "core-ml"
|
97
|
+
Requires-Dist: timm; extra == "core-ml"
|
93
98
|
Provides-Extra: ocr-export
|
94
99
|
Requires-Dist: pikepdf; extra == "ocr-export"
|
95
100
|
Provides-Extra: export-extras
|
@@ -114,26 +119,11 @@ Natural PDF lets you find and extract content from PDFs using simple code that m
|
|
114
119
|
pip install natural-pdf
|
115
120
|
```
|
116
121
|
|
117
|
-
For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install extras:
|
122
|
+
For optional features like specific OCR engines, layout analysis models, or the interactive Jupyter widget, you can install one to two million different extras. If you just want the greatest hits:
|
118
123
|
|
119
124
|
```bash
|
120
|
-
#
|
121
|
-
pip install natural-pdf[
|
122
|
-
pip install natural-pdf[surya]
|
123
|
-
pip install natural-pdf[paddle]
|
124
|
-
|
125
|
-
# Example: Install support for features using Large Language Models (e.g., via OpenAI-compatible APIs)
|
126
|
-
pip install natural-pdf[llm]
|
127
|
-
# (May require setting API key environment variables, e.g., GOOGLE_API_KEY for Gemini)
|
128
|
-
|
129
|
-
# Example: Install with interactive viewer support
|
130
|
-
pip install natural-pdf[interactive]
|
131
|
-
|
132
|
-
# Example: Install with semantic search support (Haystack)
|
133
|
-
pip install natural-pdf[haystack]
|
134
|
-
|
135
|
-
# Install everything
|
136
|
-
pip install natural-pdf[all]
|
125
|
+
# deskewing, OCR (surya) + layout analysis (yolo), interactive browsing
|
126
|
+
pip install natural-pdf[favorites]
|
137
127
|
```
|
138
128
|
|
139
129
|
See the [installation guide](https://jsoma.github.io/natural-pdf/installation/) for more details on extras.
|
@@ -147,25 +137,26 @@ from natural_pdf import PDF
|
|
147
137
|
pdf = PDF('document.pdf')
|
148
138
|
page = pdf.pages[0]
|
149
139
|
|
140
|
+
# Extract all of the text on the page
|
141
|
+
page.extract_text()
|
142
|
+
|
150
143
|
# Find elements using CSS-like selectors
|
151
144
|
heading = page.find('text:contains("Summary"):bold')
|
152
145
|
|
153
146
|
# Extract content below the heading
|
154
147
|
content = heading.below().extract_text()
|
155
|
-
print("Content below Summary:", content[:100] + "...")
|
156
148
|
|
157
|
-
#
|
158
|
-
|
159
|
-
page.add_exclusion(page.find('text:contains("CONFIDENTIAL")').above())
|
160
|
-
page.add_exclusion(page.find_all('line')[-1].below())
|
149
|
+
# Examine all the bold text on the page
|
150
|
+
page.find_all('text:bold').show()
|
161
151
|
|
162
|
-
#
|
163
|
-
|
164
|
-
|
152
|
+
# Exclude parts of the page from selectors/extractors
|
153
|
+
header = page.find('text:contains("CONFIDENTIAL")').above()
|
154
|
+
footer = page.find_all('line')[-1].below()
|
155
|
+
page.add_exclusion(header)
|
156
|
+
page.add_exclusion(footer)
|
165
157
|
|
166
|
-
#
|
167
|
-
|
168
|
-
page.to_image()
|
158
|
+
# Extract clean text from the page ignoring exclusions
|
159
|
+
clean_text = page.extract_text()
|
169
160
|
```
|
170
161
|
|
171
162
|
And as a fun bonus, `page.viewer()` will provide an interactive method to explore the PDF.
|
@@ -186,3 +177,17 @@ Natural PDF offers a range of features for working with PDFs:
|
|
186
177
|
## Learn More
|
187
178
|
|
188
179
|
Dive deeper into the features and explore advanced usage in the [**Complete Documentation**](https://jsoma.github.io/natural-pdf).
|
180
|
+
|
181
|
+
## Best friends
|
182
|
+
|
183
|
+
Natural PDF sits on top of a *lot* of fantastic tools and models, some of which are:
|
184
|
+
|
185
|
+
- [pdfplumber](https://github.com/jsvine/pdfplumber)
|
186
|
+
- [EasyOCR](https://www.jaided.ai/easyocr/)
|
187
|
+
- [PaddleOCR](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
|
188
|
+
- [Surya](https://github.com/VikParuchuri/surya)
|
189
|
+
- A specific [YOLO](https://github.com/opendatalab/DocLayout-YOLO)
|
190
|
+
- [deskew](https://github.com/sbrunner/deskew)
|
191
|
+
- [doctr](https://github.com/mindee/doctr)
|
192
|
+
- [docling](https://github.com/docling-project/docling)
|
193
|
+
- [Hugging Face](https://huggingface.co/models)
|
@@ -1,7 +1,7 @@
|
|
1
|
-
natural_pdf/__init__.py,sha256=
|
1
|
+
natural_pdf/__init__.py,sha256=0sCYgb9BAV5OnpD_1AswMuOLuXNmpe3OLJpv_6p3tgw,2449
|
2
2
|
natural_pdf/analyzers/__init__.py,sha256=dIXjsMqoxKmd9OOnSBzn12wvdIz7D7YNQRAnXslpJSM,142
|
3
|
-
natural_pdf/analyzers/text_options.py,sha256=
|
4
|
-
natural_pdf/analyzers/text_structure.py,sha256=
|
3
|
+
natural_pdf/analyzers/text_options.py,sha256=qEkDaYWla0rIM_gszEOsu52q7C_dAfV81P2HLJZM2sw,3333
|
4
|
+
natural_pdf/analyzers/text_structure.py,sha256=VfKTsTFrK877sC0grsis9jK3rrgp0Mbp13VWEbukTcs,28437
|
5
5
|
natural_pdf/analyzers/utils.py,sha256=PYbzJzSAHZ7JsMes84WIrSbA0zkjJGs0CLvIeINsf_k,2100
|
6
6
|
natural_pdf/analyzers/layout/__init__.py,sha256=oq1uJ5UkGGMbBKGirV1aRKK3hxAUyjTLywYkPCQH1f0,33
|
7
7
|
natural_pdf/analyzers/layout/base.py,sha256=bYawhmc_0xqKG-xbxUSiazIU1om-aBox5Jh8qDqv-eM,6451
|
@@ -15,31 +15,34 @@ natural_pdf/analyzers/layout/pdfplumber_table_finder.py,sha256=Tk0Q7wv7nGYPo69lh
|
|
15
15
|
natural_pdf/analyzers/layout/surya.py,sha256=4RdnhRxSS3i3Ns5mFhOA9-P0xd7Ms19uZuKvUGQfEBI,9789
|
16
16
|
natural_pdf/analyzers/layout/tatr.py,sha256=cVr0ZyhY2mNLAKZ4DGMm-b7XNJpILKh8x8ZpyDeUhLk,15032
|
17
17
|
natural_pdf/analyzers/layout/yolo.py,sha256=ANo2U4EZgeN2eYKM1bZIuysiuJLgwl4JeQchrRxOKwA,8388
|
18
|
-
natural_pdf/classification/manager.py,sha256=
|
18
|
+
natural_pdf/classification/manager.py,sha256=4iHoZQ6W541efFvwJt38K-4n5svYH3uYk0ixWPCt7do,17922
|
19
19
|
natural_pdf/classification/mixin.py,sha256=hhX9qWPShpOq_-mgoEq0GUWnutBnNMo3YdUlxwyNWMA,6781
|
20
20
|
natural_pdf/classification/results.py,sha256=El1dY7cBQVOB5lP-uj52dWgH6Y7TeQgJOVcZD-OLjes,2778
|
21
21
|
natural_pdf/collections/mixins.py,sha256=sj76Cn6EdBtb5f-bdAV-1qpdixX8tI4BzPccPiYLI1w,5117
|
22
22
|
natural_pdf/collections/pdf_collection.py,sha256=obHizc2KR4ZiAspodaPOeMgfpoW3aKg_G0goBHlrFJI,32018
|
23
23
|
natural_pdf/core/__init__.py,sha256=QC8H4M3KbXwMFiQORZ0pdPlzx1Ix6oKKQSS7Ib2KEaA,38
|
24
|
-
natural_pdf/core/element_manager.py,sha256=
|
25
|
-
natural_pdf/core/highlighting_service.py,sha256=
|
26
|
-
natural_pdf/core/page.py,sha256=
|
27
|
-
natural_pdf/core/pdf.py,sha256=
|
24
|
+
natural_pdf/core/element_manager.py,sha256=_UdXu51sLi6STzc8Pj4k8R721G3yJixXDLuRHn3hmr8,25731
|
25
|
+
natural_pdf/core/highlighting_service.py,sha256=F1s9KEVkZb3Srtp1_ayQJayp1ZvDf9FcChsZdLk4yWk,37138
|
26
|
+
natural_pdf/core/page.py,sha256=XoHPdsg7YUQIkayD0U1cQ7pNR8NCgV9xkV0rVAO7n3s,105167
|
27
|
+
natural_pdf/core/pdf.py,sha256=ssXJviTVKyVZuyiSKv1vE0GW7BACNXBhovIUbPm5MZ4,69511
|
28
28
|
natural_pdf/elements/__init__.py,sha256=S8XeiNWJ1WcgnyYKdYV1yxQlAxCCO3FfITT8MQwNbyk,41
|
29
|
-
natural_pdf/elements/base.py,sha256=
|
30
|
-
natural_pdf/elements/collections.py,sha256=
|
29
|
+
natural_pdf/elements/base.py,sha256=qKU95sJMw6uiIuez57i-3SmMIHvi9ctv7jwIWJI9qnQ,38415
|
30
|
+
natural_pdf/elements/collections.py,sha256=gPj_7eONu6zLyYHnWdPRvgHd___FfTW9mg8iwEBsgxg,112383
|
31
31
|
natural_pdf/elements/line.py,sha256=7cow3xMUKhAj7zoQz7OaB1eIH2_a8B__LB7iGJ4Mb0o,4612
|
32
32
|
natural_pdf/elements/rect.py,sha256=kiVa3e377ZnqIOXc89d9ZSY4EcmDxtccdtUw-HOQzpw,3796
|
33
|
-
natural_pdf/elements/region.py,sha256=
|
33
|
+
natural_pdf/elements/region.py,sha256=HWqO_Or_wi7pu82w_LUoKHa7r_64AEOJDdmBXSWeZ50,98848
|
34
34
|
natural_pdf/elements/text.py,sha256=13HvVZGinj2Vm_fFCAnqi7hohtoKvnpCp3VCfkpeAbc,11146
|
35
35
|
natural_pdf/export/mixin.py,sha256=L1q3MIEFWuvie4j4_EmW7GT3NerbZ1as0XMUoqTS7gM,5083
|
36
36
|
natural_pdf/exporters/__init__.py,sha256=7MnvRLLQdwtg-ULu-8uK8C84GsKiJamyhRw_GgWhw7k,151
|
37
37
|
natural_pdf/exporters/base.py,sha256=XhR1xlkHOh7suOuX7mWbsj1h2o1pZNet-OAS5YCJyeI,2115
|
38
|
-
natural_pdf/exporters/hocr.py,sha256=
|
39
|
-
natural_pdf/exporters/hocr_font.py,sha256=
|
40
|
-
natural_pdf/exporters/original_pdf.py,sha256=
|
38
|
+
natural_pdf/exporters/hocr.py,sha256=MOb5sTxe-GlMSOtmqp3p4SY_ZigwOtmd4sj_zMRCIQY,19907
|
39
|
+
natural_pdf/exporters/hocr_font.py,sha256=1wsGOMj6zoaRN2rxCwrv4MMLGawpNz984WgXpmWekgw,4574
|
40
|
+
natural_pdf/exporters/original_pdf.py,sha256=zsZPg_lUoEerKIzzoEw-qGdM5XBg_LZhFJeVKnCUp4o,5054
|
41
41
|
natural_pdf/exporters/paddleocr.py,sha256=BYpdtJI7S8rBkI2dkRESx2epVAZOTfzqU-rjJnUQ5jQ,16249
|
42
|
-
natural_pdf/exporters/searchable_pdf.py,sha256
|
42
|
+
natural_pdf/exporters/searchable_pdf.py,sha256=G2Tc4tpDXSYIufXJlkA8ppW_3DuzHAaweYKae33pI_c,16290
|
43
|
+
natural_pdf/exporters/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
|
+
natural_pdf/exporters/data/pdf.ttf,sha256=x4RUIJJaI9iO2DCmOVe4r4Wmao2vjZ_JDoQ2c7LvGlk,572
|
45
|
+
natural_pdf/exporters/data/sRGB.icc,sha256=KpLUuuRQt22LCqQhk9-XTXX2Jzjs6_dPAcXnWxKpV5Y,6922
|
43
46
|
natural_pdf/extraction/manager.py,sha256=mUBbfgLG5Pl31wmajXwyipdEJb_dZ5I-y8GnWw7IzGo,4969
|
44
47
|
natural_pdf/extraction/mixin.py,sha256=eKbr70VibpbtfjvCE80lTFuYHzq_BoVtOHjznL_GMRA,11719
|
45
48
|
natural_pdf/extraction/result.py,sha256=c1vLguCR6l95cvg-BJJmZvL_MPg2McJaczge55bKZMg,934
|
@@ -55,12 +58,12 @@ natural_pdf/ocr/ocr_options.py,sha256=ZvtnFn1kPkFEoWveQ13uy6B-ofquP0gHEi4tBHrjqC
|
|
55
58
|
natural_pdf/ocr/utils.py,sha256=OxuHwDbHWj6setvnC0QYwMHrAjxGkhmLzWHpMqqGupA,4397
|
56
59
|
natural_pdf/qa/__init__.py,sha256=Pjo62JTnUNEjGNsC437mvsS5KQ5m7X_BibGvavR9AW0,108
|
57
60
|
natural_pdf/qa/document_qa.py,sha256=Jw4yyq3Vifn57D0ANmOfUlZeG8CJjBkItZBV-8ZAmos,15111
|
58
|
-
natural_pdf/search/__init__.py,sha256=
|
59
|
-
natural_pdf/search/
|
60
|
-
natural_pdf/search/
|
61
|
+
natural_pdf/search/__init__.py,sha256=72n_Mj_AhF_RCIoBBhZ6EZKjbILM8omelXZ99fXw7n4,3688
|
62
|
+
natural_pdf/search/lancedb_search_service.py,sha256=tW7ONPcWGY1HKle_7OqCXRnMCI-aKL-AqneKz2YbLlM,13706
|
63
|
+
natural_pdf/search/numpy_search_service.py,sha256=3_8fx7NV-15jBokOU73mcxrznxPxzVQnOlDHf3dpo28,10117
|
61
64
|
natural_pdf/search/search_options.py,sha256=sq_e8_jSROicD94b_xtDtLnjEr_Zsy4icjzPkK0a8QA,3566
|
62
65
|
natural_pdf/search/search_service_protocol.py,sha256=Dl-Q-CrutkhZwI69scbW9EWPeYM63qxB60_EA7YqIYo,6699
|
63
|
-
natural_pdf/search/searchable_mixin.py,sha256=
|
66
|
+
natural_pdf/search/searchable_mixin.py,sha256=dZbaHv8Go3TJNqxoPtnp9Dr0Ftxuf_44RpBeIRXkPxc,23534
|
64
67
|
natural_pdf/selectors/__init__.py,sha256=oZGeqSv53EqmIZOhcnawuaGGlRg1h79vArXuZCWKm4A,123
|
65
68
|
natural_pdf/selectors/parser.py,sha256=oI3ezkB6sIyrq_nLJrbaBaBZktXwEp_HG_gKQlVSVcs,24447
|
66
69
|
natural_pdf/templates/__init__.py,sha256=jYBxzfi73vew0f6yhIh1MlRxw4F_TVN2hKQR0YXOFe0,20
|
@@ -75,9 +78,9 @@ natural_pdf/utils/text_extraction.py,sha256=z6Jhy11pakYCsEpkvh8ldw6DkUFsYF1hCL9Y
|
|
75
78
|
natural_pdf/utils/tqdm_utils.py,sha256=wV3RXvqog26eWEFEqjt2LkGnLswmO1GXaVGSqgS7tAY,1601
|
76
79
|
natural_pdf/utils/visualization.py,sha256=30pRWQdsRJh2pSObh-brKVsFgC1n8tHmSrta_UDnVPw,8989
|
77
80
|
natural_pdf/widgets/__init__.py,sha256=O2fSDo604wDAP6UwUkmBq3eT91RSqHwBpAOQXq92S8s,214
|
78
|
-
natural_pdf/widgets/viewer.py,sha256=
|
79
|
-
natural_pdf-0.1.
|
80
|
-
natural_pdf-0.1.
|
81
|
-
natural_pdf-0.1.
|
82
|
-
natural_pdf-0.1.
|
83
|
-
natural_pdf-0.1.
|
81
|
+
natural_pdf/widgets/viewer.py,sha256=ekgXTEfA48GrR-JjpCpgyBCXdf4IubV0pAXDJozcU7A,39196
|
82
|
+
natural_pdf-0.1.12.dist-info/licenses/LICENSE,sha256=9zfwINwJlarbDmdh6iJV4QUG54QSJlSAUcnC1YiC_Ns,1074
|
83
|
+
natural_pdf-0.1.12.dist-info/METADATA,sha256=GpzOi_m7e7vOS1vlUsrN0WIc0ncvhxvGvpEuPr5UGY8,7653
|
84
|
+
natural_pdf-0.1.12.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
85
|
+
natural_pdf-0.1.12.dist-info/top_level.txt,sha256=Cyw1zmNDlUZfb5moU-WUWGprrwH7ln_8LDGdmMHF1xI,17
|
86
|
+
natural_pdf-0.1.12.dist-info/RECORD,,
|