natural-pdf 0.1.15-py3-none-any.whl → 0.1.16-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +117 -75
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +200 -126
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/search/numpy_search_service.py

@@ -1,9 +1,9 @@
-import logging
-import numpy as np
 import json
+import logging
 from pathlib import Path
 from typing import Any, Dict, Iterable, List, Optional, Union
 
+import numpy as np
 from sentence_transformers import SentenceTransformer
 
 from .search_options import BaseSearchOptions
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)
 
 DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 
+
 class NumpySearchService(SearchServiceProtocol):
     """Basic in-memory vector search implementation using NumPy."""
 
@@ -34,19 +35,21 @@
                 "Persistence requested but LanceDB is not installed. "
                 "For persistent vector search, install LanceDB: pip install lancedb"
             )
-
+
         self.collection_name = collection_name
         self._embedding_model_name = embedding_model_name
         self.embedding_model = SentenceTransformer(self._embedding_model_name)
         self._embedding_dims = len(self.embedding_model.encode("test"))
-
+
         # Simple in-memory storage
         self._vectors = []
         self._documents = []
         self._metadata = []
         self._ids = []
-
-        logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+        logger.info(
+            f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'"
+        )
 
     def index(
         self,
@@ -55,70 +58,74 @@
         force_reindex: bool = False,
     ) -> None:
         if force_reindex:
-            logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+            logger.info(
+                f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors."
+            )
             self._vectors = []
             self._documents = []
             self._metadata = []
             self._ids = []
-
+
         items = list(documents)
         logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
-
+
         if not items:
             logger.warning("No documents provided for indexing. Skipping.")
             return
-
+
         texts_to_embed = []
         items_info = []
-
+
         for item in items:
             doc_id = item.get_id()
             metadata = item.get_metadata().copy()
             content_obj = item.get_content()
             content_text = ""
-
+
             if isinstance(content_obj, str):
                 content_text = content_obj
-            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+            elif hasattr(content_obj, "extract_text") and callable(
+                getattr(content_obj, "extract_text")
+            ):
                 content_text = content_obj.extract_text()
-                if not isinstance(content_text, str):
+                if not isinstance(content_text, str):
                     content_text = str(content_obj)
             else:
                 content_text = str(content_obj)
-
+
             # Try to add content hash to metadata
             try:
                 content_hash = item.get_content_hash()
-                if content_hash:
+                if content_hash:
                     metadata["content_hash"] = content_hash
             except (AttributeError, NotImplementedError):
                 pass
             except Exception as e:
                 logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
-
+
             texts_to_embed.append(content_text)
-            items_info.append({
-                "id": doc_id,
-                "metadata": metadata,
-                "text": content_text
-            })
-
+            items_info.append({"id": doc_id, "metadata": metadata, "text": content_text})
+
         if not texts_to_embed:
             logger.warning("No text content to embed. Skipping.")
             return
-
-        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+
+        logger.info(
+            f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+        )
         generated_embeddings = self.embedding_model.encode(
             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
         )
-
+
         for i, item_info in enumerate(items_info):
             self._vectors.append(generated_embeddings[i])
             self._documents.append(item_info["text"])
             self._metadata.append(item_info["metadata"])
             self._ids.append(item_info["id"])
-
-        logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+        logger.info(
+            f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}"
+        )
 
     def search(
         self,
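For context on the hunk above: index() embeds each document's extracted text and appends the vector, text, metadata, and id to four parallel in-memory lists, so position i in every list describes the same document. A minimal sketch of that storage pattern, using a hypothetical embed() stub in place of the SentenceTransformer call; this is an illustration, not the package's API:

```python
import numpy as np

def embed(texts):
    # Hypothetical stand-in for SentenceTransformer.encode(): one 384-dim vector per text.
    rng = np.random.default_rng(0)
    return rng.normal(size=(len(texts), 384))

class TinyIndex:
    """Toy illustration of the parallel-list storage model, not the package's class."""

    def __init__(self):
        # Position i in each list describes document i.
        self._vectors = []
        self._documents = []
        self._metadata = []
        self._ids = []

    def index(self, items):
        # items: list of (doc_id, text, metadata) tuples.
        texts = [text for _, text, _ in items]
        embeddings = embed(texts)
        for (doc_id, text, metadata), vector in zip(items, embeddings):
            self._vectors.append(vector)
            self._documents.append(text)
            self._metadata.append(metadata)
            self._ids.append(doc_id)

idx = TinyIndex()
idx.index([
    ("p1", "first page text", {"page_number": 1}),
    ("p2", "second page text", {"page_number": 2}),
])
print(len(idx._vectors), idx._ids)  # 2 ['p1', 'p2']
```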
@@ -128,7 +135,7 @@
         if not self._vectors:
             logger.debug("No vectors in index. Returning empty results.")
             return []
-
+
         # Process query to text
         query_text = ""
         if isinstance(query, (str, Path)):
@@ -139,28 +146,30 @@
                 return []
         else:
             raise TypeError(f"Unsupported query type: {type(query)}")
-
-        logger.info(
-            f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
+
+        logger.info(
+            f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}"
+        )
+
         # Encode query and perform similarity search
         query_vector = self.embedding_model.encode(query_text)
-
+
         # Convert list to numpy array for batch operations
         vectors_array = np.array(self._vectors)
-
+
         # Normalize vectors for cosine similarity
         query_norm = np.linalg.norm(query_vector)
         if query_norm > 0:
             query_vector = query_vector / query_norm
-
+
         # Normalize all vectors (avoid division by zero)
         vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
         valid_indices = vector_norms.flatten() > 0
         vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
-
+
         # Calculate cosine similarities
         similarities = np.dot(vectors_array, query_vector)
-
+
         # Apply filters if present
         filtered_indices = np.arange(len(similarities))
         if options.filters:
@@ -175,43 +184,49 @@
                         new_filtered.append(i)
                 filtered_indices = np.array(new_filtered)
             else:
-                logger.warning(
-                    f"Complex filter expressions not supported in NumPy backend: {options.filters}")
+                logger.warning(
+                    f"Complex filter expressions not supported in NumPy backend: {options.filters}"
+                )
+
         # Apply filtering and sort by similarity
         if len(filtered_indices) > 0:
             filtered_similarities = similarities[filtered_indices]
             top_k = min(options.top_k, len(filtered_similarities))
             if top_k == 0:
                 return []
-
+
             top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
             top_indices = filtered_indices[top_indices_within_filtered]
         else:
             top_k = min(options.top_k, len(similarities))
             if top_k == 0:
                 return []
-
+
             top_indices = np.argsort(similarities)[-top_k:][::-1]
-
+
         # Format results
         results = []
         for idx in top_indices:
             metadata = self._metadata[idx]
-            results.append({
-                "id": self._ids[idx],
-                "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
-                "score": float(similarities[idx]),
-                "page_number": metadata.get("page_number"),
-                "pdf_path": metadata.get("pdf_path"),
-                "metadata": metadata,
-            })
-
-        logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+            results.append(
+                {
+                    "id": self._ids[idx],
+                    "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                    "score": float(similarities[idx]),
+                    "page_number": metadata.get("page_number"),
+                    "pdf_path": metadata.get("pdf_path"),
+                    "metadata": metadata,
+                }
+            )
+
+        logger.info(
+            f"Search returned {len(results)} results from collection '{self.collection_name}'"
+        )
         return results
 
     def index_exists(self) -> bool:
         return len(self._vectors) > 0
-
+
     def delete_index(self) -> bool:
         logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
         self._vectors = []
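The search path in the two hunks above is plain cosine similarity: normalize the query vector and the stored vectors, take a dot product, then argsort for the top-k hits. A self-contained sketch of that computation with illustrative names, not the package's API:

```python
import numpy as np

def top_k_cosine(query_vector, stored_vectors, k=3):
    """Return (index, similarity) pairs for the k most similar stored vectors."""
    vectors = np.array(stored_vectors, dtype=float)
    query = np.array(query_vector, dtype=float)

    # Normalize the query, guarding against a zero vector.
    q_norm = np.linalg.norm(query)
    if q_norm > 0:
        query = query / q_norm

    # Normalize stored vectors row-wise, skipping all-zero rows to avoid division by zero.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    valid = norms.flatten() > 0
    vectors[valid] = vectors[valid] / norms[valid]

    # With unit-length vectors, cosine similarity reduces to a dot product.
    similarities = vectors @ query

    # argsort is ascending, so take the tail and reverse it to get best matches first.
    k = min(k, len(similarities))
    top = np.argsort(similarities)[-k:][::-1]
    return [(int(i), float(similarities[i])) for i in top]

rng = np.random.default_rng(1)
stored = rng.normal(size=(5, 8))
query = stored[2] + 0.01 * rng.normal(size=8)
print(top_k_cosine(query, stored, k=2))  # index 2 should rank first
```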
@@ -219,37 +234,43 @@
         self._metadata = []
         self._ids = []
         return True
-
+
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-        logger.debug(
-            f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
+        logger.debug(
+            f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})..."
+        )
+
         results = []
         for i, doc_id in enumerate(self._ids):
             doc_info = {"id": doc_id}
             if include_metadata:
                 doc_info["meta"] = self._metadata[i]
             results.append(doc_info)
-
-        logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+
+        logger.info(
+            f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'"
+        )
         return results
-
+
     def delete_documents(self, ids: List[str]) -> None:
         if not ids:
             logger.debug("No document IDs provided for deletion. Skipping.")
             return
-
-        logger.warning(
-            f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
+
+        logger.warning(
+            f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'"
+        )
+
         # Find indices to remove
         keep_indices = []
         for i, doc_id in enumerate(self._ids):
             if doc_id not in ids:
                 keep_indices.append(i)
-
+
         # Create new filtered lists
         self._ids = [self._ids[i] for i in keep_indices]
         self._vectors = [self._vectors[i] for i in keep_indices]
         self._documents = [self._documents[i] for i in keep_indices]
         self._metadata = [self._metadata[i] for i in keep_indices]
-
-        logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
+
+        logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
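delete_documents() in the hunk above removes entries by rebuilding every parallel list from the indices that survive, rather than deleting in place, which keeps the lists aligned. A small illustration of that pattern with hypothetical data, not the package's API:

```python
ids = ["a", "b", "c", "d"]
vectors = [[1.0], [2.0], [3.0], [4.0]]
documents = ["doc a", "doc b", "doc c", "doc d"]

to_delete = {"b", "d"}

# Collect the positions of every document we want to keep...
keep = [i for i, doc_id in enumerate(ids) if doc_id not in to_delete]

# ...then rebuild each parallel list in one pass so they stay aligned.
ids = [ids[i] for i in keep]
vectors = [vectors[i] for i in keep]
documents = [documents[i] for i in keep]

print(ids)        # ['a', 'c']
print(documents)  # ['doc a', 'doc c']
```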
natural_pdf/search/searchable_mixin.py

@@ -123,7 +123,7 @@ class SearchableMixin(ABC):
         logger.info(
             f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
         )
-
+
         # Direct creation without try/except
         service_args = {
             "collection_name": effective_collection_name,
@@ -195,7 +195,7 @@ class SearchableMixin(ABC):
         logger.debug(
             f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
         )
-
+
         # Call index without try/except
         self._search_service.index(
             documents=indexable_items,