natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/search/numpy_search_service.py
@@ -1,9 +1,9 @@
- import logging
- import numpy as np
  import json
+ import logging
  from pathlib import Path
  from typing import Any, Dict, Iterable, List, Optional, Union

+ import numpy as np
  from sentence_transformers import SentenceTransformer

  from .search_options import BaseSearchOptions
@@ -17,6 +17,7 @@ logger = logging.getLogger(__name__)

  DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

+
  class NumpySearchService(SearchServiceProtocol):
      """Basic in-memory vector search implementation using NumPy."""

@@ -34,19 +35,21 @@ class NumpySearchService(SearchServiceProtocol):
              "Persistence requested but LanceDB is not installed. "
              "For persistent vector search, install LanceDB: pip install lancedb"
          )
-
+
          self.collection_name = collection_name
          self._embedding_model_name = embedding_model_name
          self.embedding_model = SentenceTransformer(self._embedding_model_name)
          self._embedding_dims = len(self.embedding_model.encode("test"))
-
+
          # Simple in-memory storage
          self._vectors = []
          self._documents = []
          self._metadata = []
          self._ids = []
-
-         logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+         logger.info(
+             f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'"
+         )

      def index(
          self,
@@ -55,70 +58,74 @@ class NumpySearchService(SearchServiceProtocol):
          force_reindex: bool = False,
      ) -> None:
          if force_reindex:
-             logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+             logger.info(
+                 f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors."
+             )
              self._vectors = []
              self._documents = []
              self._metadata = []
              self._ids = []
-
+
          items = list(documents)
          logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
-
+
          if not items:
              logger.warning("No documents provided for indexing. Skipping.")
              return
-
+
          texts_to_embed = []
          items_info = []
-
+
          for item in items:
              doc_id = item.get_id()
              metadata = item.get_metadata().copy()
              content_obj = item.get_content()
              content_text = ""
-
+
              if isinstance(content_obj, str):
                  content_text = content_obj
-             elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+             elif hasattr(content_obj, "extract_text") and callable(
+                 getattr(content_obj, "extract_text")
+             ):
                  content_text = content_obj.extract_text()
-                 if not isinstance(content_text, str):
+                 if not isinstance(content_text, str):
                      content_text = str(content_obj)
              else:
                  content_text = str(content_obj)
-
+
              # Try to add content hash to metadata
              try:
                  content_hash = item.get_content_hash()
-                 if content_hash:
+                 if content_hash:
                      metadata["content_hash"] = content_hash
              except (AttributeError, NotImplementedError):
                  pass
              except Exception as e:
                  logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
-
+
              texts_to_embed.append(content_text)
-             items_info.append({
-                 "id": doc_id,
-                 "metadata": metadata,
-                 "text": content_text
-             })
-
+             items_info.append({"id": doc_id, "metadata": metadata, "text": content_text})
+
          if not texts_to_embed:
              logger.warning("No text content to embed. Skipping.")
              return
-
-         logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+
+         logger.info(
+             f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+         )
          generated_embeddings = self.embedding_model.encode(
              texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
          )
-
+
          for i, item_info in enumerate(items_info):
              self._vectors.append(generated_embeddings[i])
              self._documents.append(item_info["text"])
              self._metadata.append(item_info["metadata"])
              self._ids.append(item_info["id"])
-
-         logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+         logger.info(
+             f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}"
+         )

      def search(
          self,
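
Note: the index() changes above are formatting-only (Black-style line wrapping); the underlying flow is one batched SentenceTransformer.encode() call over all document texts, with the resulting vectors appended to plain Python lists. A minimal standalone sketch of that pattern, using the DEFAULT_EMBEDDING_MODEL named earlier (the docs list is a made-up example):

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    docs = ["first page text", "second page text"]  # hypothetical inputs

    # One batched encode() call, mirroring index(); returns a 2-D numpy array
    embeddings = model.encode(docs, show_progress_bar=len(docs) > 10)

    vectors, texts = [], []
    for text, vec in zip(docs, embeddings):
        vectors.append(vec)  # the in-memory "index" is just parallel lists
        texts.append(text)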
@@ -128,7 +135,7 @@ class NumpySearchService(SearchServiceProtocol):
          if not self._vectors:
              logger.debug("No vectors in index. Returning empty results.")
              return []
-
+
          # Process query to text
          query_text = ""
          if isinstance(query, (str, Path)):
@@ -139,28 +146,30 @@ class NumpySearchService(SearchServiceProtocol):
                  return []
          else:
              raise TypeError(f"Unsupported query type: {type(query)}")
-
-         logger.info(f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
-
+
+         logger.info(
+             f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}"
+         )
+
          # Encode query and perform similarity search
          query_vector = self.embedding_model.encode(query_text)
-
+
          # Convert list to numpy array for batch operations
          vectors_array = np.array(self._vectors)
-
+
          # Normalize vectors for cosine similarity
          query_norm = np.linalg.norm(query_vector)
          if query_norm > 0:
              query_vector = query_vector / query_norm
-
+
          # Normalize all vectors (avoid division by zero)
          vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
          valid_indices = vector_norms.flatten() > 0
          vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
-
+
          # Calculate cosine similarities
          similarities = np.dot(vectors_array, query_vector)
-
+
          # Apply filters if present
          filtered_indices = np.arange(len(similarities))
          if options.filters:
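
The hunk above mostly re-wraps logging, but it surrounds the core of the service: cosine similarity computed by L2-normalizing the query and the stored vectors, then taking dot products. A minimal sketch of that math with made-up 2-D vectors:

    import numpy as np

    vectors_array = np.array([[1.0, 0.0], [0.6, 0.8], [0.0, 1.0]])  # toy index
    query_vector = np.array([1.0, 1.0])

    # Normalize the query (guarding against a zero vector)
    query_norm = np.linalg.norm(query_vector)
    if query_norm > 0:
        query_vector = query_vector / query_norm

    # Normalize stored vectors row-wise, skipping zero rows
    vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
    valid = vector_norms.flatten() > 0
    vectors_array[valid] = vectors_array[valid] / vector_norms[valid]

    # Dot products of unit vectors are cosine similarities
    similarities = np.dot(vectors_array, query_vector)
    # -> approximately [0.707, 0.990, 0.707]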
@@ -175,43 +184,49 @@ class NumpySearchService(SearchServiceProtocol):
                          new_filtered.append(i)
                  filtered_indices = np.array(new_filtered)
              else:
-                 logger.warning(f"Complex filter expressions not supported in NumPy backend: {options.filters}")
-
+                 logger.warning(
+                     f"Complex filter expressions not supported in NumPy backend: {options.filters}"
+                 )
+
          # Apply filtering and sort by similarity
          if len(filtered_indices) > 0:
              filtered_similarities = similarities[filtered_indices]
              top_k = min(options.top_k, len(filtered_similarities))
              if top_k == 0:
                  return []
-
+
              top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
              top_indices = filtered_indices[top_indices_within_filtered]
          else:
              top_k = min(options.top_k, len(similarities))
              if top_k == 0:
                  return []
-
+
              top_indices = np.argsort(similarities)[-top_k:][::-1]
-
+
          # Format results
          results = []
          for idx in top_indices:
              metadata = self._metadata[idx]
-             results.append({
-                 "id": self._ids[idx],
-                 "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
-                 "score": float(similarities[idx]),
-                 "page_number": metadata.get("page_number"),
-                 "pdf_path": metadata.get("pdf_path"),
-                 "metadata": metadata,
-             })
-
-         logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+             results.append(
+                 {
+                     "id": self._ids[idx],
+                     "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                     "score": float(similarities[idx]),
+                     "page_number": metadata.get("page_number"),
+                     "pdf_path": metadata.get("pdf_path"),
+                     "metadata": metadata,
+                 }
+             )
+
+         logger.info(
+             f"Search returned {len(results)} results from collection '{self.collection_name}'"
+         )
          return results

      def index_exists(self) -> bool:
          return len(self._vectors) > 0
-
+
      def delete_index(self) -> bool:
          logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
          self._vectors = []
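
The top-k selection reformatted above hinges on a compact NumPy idiom: np.argsort returns indices in ascending score order, so slicing off the last top_k entries and reversing them yields the best matches first. A worked example with made-up scores:

    import numpy as np

    similarities = np.array([0.12, 0.93, 0.47, 0.88])
    top_k = 2

    # argsort ascending -> [0, 2, 3, 1]; last two reversed -> [1, 3]
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    print(top_indices)                # [1 3]
    print(similarities[top_indices])  # [0.93 0.88]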
@@ -219,37 +234,43 @@ class NumpySearchService(SearchServiceProtocol):
          self._metadata = []
          self._ids = []
          return True
-
+
      def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-         logger.debug(f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
-
+         logger.debug(
+             f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})..."
+         )
+
          results = []
          for i, doc_id in enumerate(self._ids):
              doc_info = {"id": doc_id}
              if include_metadata:
                  doc_info["meta"] = self._metadata[i]
              results.append(doc_info)
-
-         logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+
+         logger.info(
+             f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'"
+         )
          return results
-
+
      def delete_documents(self, ids: List[str]) -> None:
          if not ids:
              logger.debug("No document IDs provided for deletion. Skipping.")
              return
-
-         logger.warning(f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
-
+
+         logger.warning(
+             f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'"
+         )
+
          # Find indices to remove
          keep_indices = []
          for i, doc_id in enumerate(self._ids):
              if doc_id not in ids:
                  keep_indices.append(i)
-
+
          # Create new filtered lists
          self._ids = [self._ids[i] for i in keep_indices]
          self._vectors = [self._vectors[i] for i in keep_indices]
          self._documents = [self._documents[i] for i in keep_indices]
          self._metadata = [self._metadata[i] for i in keep_indices]
-
-         logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
+
+         logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
natural_pdf/search/searchable_mixin.py
@@ -123,7 +123,7 @@ class SearchableMixin(ABC):
          logger.info(
              f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
          )
-
+
          # Direct creation without try/except
          service_args = {
              "collection_name": effective_collection_name,
@@ -195,7 +195,7 @@ class SearchableMixin(ABC):
          logger.debug(
              f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
          )
-
+
          # Call index without try/except
          self._search_service.index(
              documents=indexable_items,