natural-pdf 0.1.15__py3-none-any.whl → 0.1.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. natural_pdf/__init__.py +31 -0
  2. natural_pdf/analyzers/layout/gemini.py +137 -162
  3. natural_pdf/analyzers/layout/layout_manager.py +9 -5
  4. natural_pdf/analyzers/layout/layout_options.py +77 -7
  5. natural_pdf/analyzers/layout/paddle.py +318 -165
  6. natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
  7. natural_pdf/analyzers/shape_detection_mixin.py +770 -405
  8. natural_pdf/classification/mixin.py +2 -8
  9. natural_pdf/collections/pdf_collection.py +25 -30
  10. natural_pdf/core/highlighting_service.py +47 -32
  11. natural_pdf/core/page.py +119 -76
  12. natural_pdf/core/pdf.py +19 -22
  13. natural_pdf/describe/__init__.py +21 -0
  14. natural_pdf/describe/base.py +457 -0
  15. natural_pdf/describe/elements.py +411 -0
  16. natural_pdf/describe/mixin.py +84 -0
  17. natural_pdf/describe/summary.py +186 -0
  18. natural_pdf/elements/base.py +11 -10
  19. natural_pdf/elements/collections.py +116 -51
  20. natural_pdf/elements/region.py +204 -127
  21. natural_pdf/exporters/paddleocr.py +38 -13
  22. natural_pdf/flows/__init__.py +3 -3
  23. natural_pdf/flows/collections.py +303 -132
  24. natural_pdf/flows/element.py +277 -132
  25. natural_pdf/flows/flow.py +33 -16
  26. natural_pdf/flows/region.py +142 -79
  27. natural_pdf/ocr/engine_doctr.py +37 -4
  28. natural_pdf/ocr/engine_easyocr.py +23 -3
  29. natural_pdf/ocr/engine_paddle.py +281 -30
  30. natural_pdf/ocr/engine_surya.py +8 -3
  31. natural_pdf/ocr/ocr_manager.py +75 -76
  32. natural_pdf/ocr/ocr_options.py +52 -87
  33. natural_pdf/search/__init__.py +25 -12
  34. natural_pdf/search/lancedb_search_service.py +91 -54
  35. natural_pdf/search/numpy_search_service.py +86 -65
  36. natural_pdf/search/searchable_mixin.py +2 -2
  37. natural_pdf/selectors/parser.py +125 -81
  38. natural_pdf/widgets/__init__.py +1 -1
  39. natural_pdf/widgets/viewer.py +205 -449
  40. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
  41. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
  42. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
  43. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
  44. {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
@@ -65,97 +65,62 @@ class EasyOCROptions(BaseOCROptions):
  # --- PaddleOCR Specific Options ---
  @dataclass
  class PaddleOCROptions(BaseOCROptions):
- """Specific options for the PaddleOCR engine."""
-
- # General
- use_gpu: Optional[bool] = None
- gpu_mem: int = 8000 # Default from Paddle documentation
- gpu_mem: int = 8000 # Default from Paddle documentation
- ir_optim: bool = True
- use_tensorrt: bool = False
- min_subgraph_size: int = 15
- precision: str = "fp32"
- enable_mkldnn: bool = False
- cpu_threads: int = 10
- use_fp16: bool = False
- show_log: bool = False
- use_onnx: bool = False
- use_zero_copy_run: bool = False
-
- # Detection
- det: bool = True
- det_algorithm: str = "DB"
- show_log: bool = False
- use_onnx: bool = False
- use_zero_copy_run: bool = False
-
- # Detection
- det: bool = True
- det_algorithm: str = "DB"
- det_model_dir: Optional[str] = None
- det_limit_side_len: int = 960 # Corresponds to det_max_side_len
- # DB specific
- det_db_thresh: float = 0.3
- det_db_box_thresh: float = 0.5
- det_db_unclip_ratio: float = 2.0
- # EAST specific
- det_east_score_thresh: float = 0.8
- det_east_cover_thresh: float = 0.1
- det_east_nms_thresh: float = 0.2
-
- # Recognition
- rec: bool = True
- rec_algorithm: str = "CRNN"
- det_limit_side_len: int = 960 # Corresponds to det_max_side_len
- # DB specific
- det_db_thresh: float = 0.3
- det_db_box_thresh: float = 0.5
- det_db_unclip_ratio: float = 2.0
- # EAST specific
- det_east_score_thresh: float = 0.8
- det_east_cover_thresh: float = 0.1
- det_east_nms_thresh: float = 0.2
-
- # Recognition
- rec: bool = True
- rec_algorithm: str = "CRNN"
- rec_model_dir: Optional[str] = None
- rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
- rec_batch_num: int = 30 # Default from Paddle documentation
- rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
- rec_batch_num: int = 30 # Default from Paddle documentation
- max_text_length: int = 25
- rec_char_dict_path: Optional[str] = None # Path to char dictionary file
- rec_char_dict_path: Optional[str] = None # Path to char dictionary file
- use_space_char: bool = True
- drop_score: float = 0.5
-
- # Classification
- cls: Optional[bool] = None # Often inferred from use_angle_cls
- use_angle_cls: bool = False # Default from Paddle documentation
- cls_model_dir: Optional[str] = None
- cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
- label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
- cls_batch_num: int = 30
-
- # Classification
- cls: Optional[bool] = None # Often inferred from use_angle_cls
- use_angle_cls: bool = False # Default from Paddle documentation
- cls_model_dir: Optional[str] = None
- cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
- label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
- cls_batch_num: int = 30
+ """
+ Specific options for the PaddleOCR engine, reflecting the paddleocr>=3.0.0 API.
+ See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html
+ """
+
+ # --- Constructor Parameters ---
+
+ # Model paths and names
+ doc_orientation_classify_model_name: Optional[str] = None
+ doc_orientation_classify_model_dir: Optional[str] = None
+ doc_unwarping_model_name: Optional[str] = None
+ doc_unwarping_model_dir: Optional[str] = None
+ text_detection_model_name: Optional[str] = None
+ text_detection_model_dir: Optional[str] = None
+ textline_orientation_model_name: Optional[str] = None
+ textline_orientation_model_dir: Optional[str] = None
+ text_recognition_model_name: Optional[str] = None
+ text_recognition_model_dir: Optional[str] = None
+
+ # Module usage flags (can be overridden at predict time)
+ use_doc_orientation_classify: Optional[bool] = False
+ use_doc_unwarping: Optional[bool] = False
+ use_textline_orientation: Optional[bool] = False
+
+ # Batch sizes
+ textline_orientation_batch_size: Optional[int] = None
+ text_recognition_batch_size: Optional[int] = None
+
+ # Detection parameters (can be overridden at predict time)
+ # https://github.com/PaddlePaddle/PaddleOCR/issues/15424
+ text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
+ text_det_limit_type: Optional[str] = 'max' # WAITING FOR FIX
+ text_det_thresh: Optional[float] = None
+ text_det_box_thresh: Optional[float] = None
+ text_det_unclip_ratio: Optional[float] = None
+ text_det_input_shape: Optional[Tuple[int, int]] = None
+
+ # Recognition parameters (can be overridden at predict time)
+ text_rec_score_thresh: Optional[float] = None
+ text_rec_input_shape: Optional[Tuple[int, int, int]] = None
+
+ # General parameters
+ lang: Optional[str] = None
+ ocr_version: Optional[str] = None
+ device: Optional[str] = None
+ enable_hpi: Optional[bool] = None
+ use_tensorrt: Optional[bool] = None
+ precision: Optional[str] = None
+ enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
+ # mkldnn_cache_capacity: Optional[int] = None
+ cpu_threads: Optional[int] = None
+ paddlex_config: Optional[str] = None
 
  def __post_init__(self):
  pass
 
- # if self.use_gpu is None:
- # if self.device and "cuda" in self.device.lower():
- # self.use_gpu = True
- # else:
- # self.use_gpu = False
- # # logger.debug(f"Initialized PaddleOCROptions: {self}")
-
 
  # --- Surya Specific Options ---
  @dataclass
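
Note: the new field names mirror the keyword arguments of the paddleocr>=3.0.0 PaddleOCR constructor, so values set on the dataclass can be forwarded to the pipeline unchanged. A minimal sketch of that mapping (not code from this package; assumes paddleocr>=3.0.0 is installed and an image file page.png exists):

from paddleocr import PaddleOCR

# The dataclass fields above correspond one-to-one to these constructor kwargs.
ocr = PaddleOCR(
    lang="en",
    device="cpu",
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
    text_det_limit_side_len=736,  # see PaddlePaddle/PaddleOCR#15424
    text_det_limit_type="max",
)

# predict() accepts an image path (or a list of paths) and returns one
# result object per page image.
for res in ocr.predict("page.png"):
    res.print()                           # recognized text, boxes, scores
    res.save_to_json(save_path="output")  # structured results on disk
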
@@ -4,8 +4,12 @@ import logging
  from typing import Optional
 
  # Import constants
- from .search_options import SearchOptions
- from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
+ from .search_options import (
+ BaseSearchOptions,
+ MultiModalSearchOptions,
+ SearchOptions,
+ TextSearchOptions,
+ )
  from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
  # Check search extras availability
@@ -13,21 +17,27 @@ LANCEDB_AVAILABLE = False
  SEARCH_DEPENDENCIES_AVAILABLE = False
 
  try:
- import sentence_transformers
  import numpy as np
+ import sentence_transformers
+
  # Basic search dependencies are available
  SEARCH_DEPENDENCIES_AVAILABLE = True
-
+
  # Check if LanceDB is available
  try:
  import lancedb
  import pyarrow
+
  LANCEDB_AVAILABLE = True
- from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
+ from .lancedb_search_service import (
+ DEFAULT_EMBEDDING_MODEL,
+ DEFAULT_LANCEDB_PERSIST_PATH,
+ LanceDBSearchService,
+ )
  except ImportError:
  # LanceDB not available, we'll use NumPy fallback
  LANCEDB_AVAILABLE = False
- from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
+ from .numpy_search_service import DEFAULT_EMBEDDING_MODEL, NumpySearchService
  except ImportError:
  # Basic dependencies missing
  SEARCH_DEPENDENCIES_AVAILABLE = False
@@ -35,6 +45,7 @@ except ImportError:
 
  logger = logging.getLogger(__name__)
 
+
  def check_search_availability():
  """Check if required search dependencies are available."""
  if not SEARCH_DEPENDENCIES_AVAILABLE:
@@ -43,6 +54,7 @@ def check_search_availability():
  "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
  )
 
+
  def get_search_service(
  collection_name: str,
  persist: bool = False,
@@ -51,7 +63,7 @@ def get_search_service(
  ) -> SearchServiceProtocol:
  """
  Factory function to get an instance of the configured search service.
-
+
  Automatically selects the best available implementation:
  - LanceDB if installed (recommended for both in-memory and persistent)
  - Numpy fallback for in-memory only
@@ -84,16 +96,17 @@ def get_search_service(
  # If persistence is requested, LanceDB is required
  if persist and not LANCEDB_AVAILABLE:
  raise RuntimeError(
- "Persistent vector search requires LanceDB. "
- "Please install: pip install lancedb"
+ "Persistent vector search requires LanceDB. " "Please install: pip install lancedb"
  )
-
+
  # Select the appropriate implementation
  if LANCEDB_AVAILABLE:
  logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
  service_instance = LanceDBSearchService(**service_args)
  else:
- logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
+ logger.info(
+ f"Using NumPy fallback for in-memory vector search (collection: {collection_name})"
+ )
  service_instance = NumpySearchService(**service_args)
-
+
  return service_instance
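
Note: a short usage sketch of the factory above (collection name illustrative). LanceDB is chosen when installed; otherwise the in-memory NumPy service is used, and persist=True without LanceDB raises the RuntimeError shown in the guard:

from natural_pdf.search import get_search_service

# In-memory index: works with either backend.
service = get_search_service(collection_name="my_docs", persist=False)

# Persistent index: requires LanceDB (pip install lancedb), otherwise the
# factory raises RuntimeError as above.
persistent_service = get_search_service(collection_name="my_docs", persist=True)
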
@@ -63,20 +63,22 @@ class LanceDBSearchService(SearchServiceProtocol):
  def _get_schema(self) -> pa.Schema:
  if self._embedding_dims is None:
  raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
-
- return pa.schema([
- pa.field("id", pa.string(), nullable=False),
- pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
- pa.field("text", pa.string()),
- pa.field("metadata_json", pa.string())
- ])
+
+ return pa.schema(
+ [
+ pa.field("id", pa.string(), nullable=False),
+ pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+ pa.field("text", pa.string()),
+ pa.field("metadata_json", pa.string()),
+ ]
+ )
 
  def _open_or_create_table(self):
  if self._db is None:
  raise RuntimeError("LanceDB connection not established.")
-
+
  table_names = self._db.table_names()
-
+
  if self.collection_name in table_names:
  logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
  self._table = self._db.open_table(self.collection_name)
@@ -86,7 +88,7 @@ class LanceDBSearchService(SearchServiceProtocol):
  self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
 
  def __del__(self):
- if not self._persist and hasattr(self, '_temp_dir_obj') and logger:
+ if not self._persist and hasattr(self, "_temp_dir_obj") and logger:
  logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
  self._temp_dir_obj.cleanup()
 
@@ -130,17 +132,23 @@ class LanceDBSearchService(SearchServiceProtocol):
 
  if isinstance(content_obj, str):
  content_text = content_obj
- elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+ elif hasattr(content_obj, "extract_text") and callable(
+ getattr(content_obj, "extract_text")
+ ):
  content_text = content_obj.extract_text()
- if not isinstance(content_text, str): content_text = str(content_obj)
+ if not isinstance(content_text, str):
+ content_text = str(content_obj)
  else:
  content_text = str(content_obj)
 
  try:
  content_hash = item.get_content_hash()
- if content_hash: metadata["content_hash"] = content_hash
- except (AttributeError, NotImplementedError): pass
- except Exception as e: logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+ if content_hash:
+ metadata["content_hash"] = content_hash
+ except (AttributeError, NotImplementedError):
+ pass
+ except Exception as e:
+ logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
 
  # Ensure doc_id is not None - use a fallback if needed
  if doc_id is None:
@@ -151,28 +159,30 @@ class LanceDBSearchService(SearchServiceProtocol):
  doc_id = f"auto_{len(texts_to_embed)}"
 
  texts_to_embed.append(content_text)
- original_items_info.append({
- "id": doc_id,
- "metadata_json": json.dumps(metadata),
- "text": content_text
- })
+ original_items_info.append(
+ {"id": doc_id, "metadata_json": json.dumps(metadata), "text": content_text}
+ )
 
  if not texts_to_embed:
  logger.warning("No text content to embed. Skipping.")
  return
 
- logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+ logger.info(
+ f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+ )
  generated_embeddings = self.embedding_model.encode(
  texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
  )
 
  for i, item_info in enumerate(original_items_info):
- data_to_add.append({
- "id": item_info["id"],
- "vector": generated_embeddings[i].tolist(),
- "text": item_info["text"],
- "metadata_json": item_info["metadata_json"]
- })
+ data_to_add.append(
+ {
+ "id": item_info["id"],
+ "vector": generated_embeddings[i].tolist(),
+ "text": item_info["text"],
+ "metadata_json": item_info["metadata_json"],
+ }
+ )
 
  if not data_to_add:
  logger.warning("No data prepared for LanceDB. Skipping add.")
@@ -188,11 +198,17 @@ class LanceDBSearchService(SearchServiceProtocol):
  ]
  table = pa.Table.from_arrays(arrays, schema=schema)
 
- logger.info(f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'.")
- self._table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
+ logger.info(
+ f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'."
+ )
+ self._table.merge_insert(
+ "id"
+ ).when_matched_update_all().when_not_matched_insert_all().execute(
  table,
  )
- logger.info(f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}")
+ logger.info(
+ f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}"
+ )
 
  def search(
  self,
@@ -202,12 +218,16 @@ class LanceDBSearchService(SearchServiceProtocol):
  if self._table is None:
  raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
 
- logger.info(f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}")
+ logger.info(
+ f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}"
+ )
  query_text = ""
- if isinstance(query, (str, Path)): query_text = str(query)
+ if isinstance(query, (str, Path)):
+ query_text = str(query)
  elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
  query_text = query.extract_text()
- if not query_text or not query_text.strip(): return []
+ if not query_text or not query_text.strip():
+ return []
  else:
  raise TypeError(f"Unsupported query type: {type(query)}")
 
@@ -226,7 +246,9 @@ class LanceDBSearchService(SearchServiceProtocol):
  filter_parts.append(f"{k} = {v}")
  if filter_parts:
  lancedb_filter = " AND ".join(filter_parts)
- logger.warning(f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions.")
+ logger.warning(
+ f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions."
+ )
 
  search_query = self._table.search(query_vector).limit(options.top_k)
  if lancedb_filter:
@@ -246,15 +268,19 @@ class LanceDBSearchService(SearchServiceProtocol):
 
  score = 1 - row["_distance"] if "_distance" in row else 0.0
 
- final_results.append({
- "id": row.get("id"),
- "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
- "score": score,
- "page_number": metadata.get("page_number"),
- "pdf_path": metadata.get("pdf_path"),
- "metadata": metadata,
- })
- logger.info(f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'.")
+ final_results.append(
+ {
+ "id": row.get("id"),
+ "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+ "score": score,
+ "page_number": metadata.get("page_number"),
+ "pdf_path": metadata.get("pdf_path"),
+ "metadata": metadata,
+ }
+ )
+ logger.info(
+ f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'."
+ )
  return final_results
 
  def delete_index(self) -> bool:
@@ -262,29 +288,33 @@ class LanceDBSearchService(SearchServiceProtocol):
  logger.warning("LanceDB connection not initialized. Cannot delete index.")
  return False
  logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
-
+
  self._db.drop_table(self.collection_name)
  self._table = None
  logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
  return True
 
  def index_exists(self) -> bool:
- if self._db is None:
+ if self._db is None:
  return False
  exists = self.collection_name in self._db.table_names()
  if exists:
  tbl = self._db.open_table(self.collection_name)
  count = tbl.count_rows()
- logger.debug(f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}")
+ logger.debug(
+ f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}"
+ )
  return count > 0
-
+
  logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
  return False
 
  def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
- if self._table is None:
+ if self._table is None:
  raise RuntimeError("Table not initialized")
- logger.debug(f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})...")
+ logger.debug(
+ f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})..."
+ )
 
  select_columns = ["id"]
  if include_metadata:
@@ -298,6 +328,7 @@ class LanceDBSearchService(SearchServiceProtocol):
 
  formatted_docs: List[Dict[str, Any]] = []
  import json
+
  for row in results_list:
  doc_data: Dict[str, Any] = {"id": row.get("id")}
  if include_metadata and "metadata_json" in row and row["metadata_json"]:
@@ -307,11 +338,13 @@ class LanceDBSearchService(SearchServiceProtocol):
  except json.JSONDecodeError:
  doc_data["meta"] = {}
  formatted_docs.append(doc_data)
- logger.info(f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'.")
+ logger.info(
+ f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'."
+ )
  return formatted_docs
 
  def delete_documents(self, ids: List[str]) -> None:
- if self._table is None:
+ if self._table is None:
  raise RuntimeError("Table not initialized")
  if not ids:
  logger.debug("No document IDs provided for deletion. Skipping.")
@@ -319,7 +352,11 @@ class LanceDBSearchService(SearchServiceProtocol):
 
  id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
  delete_condition = f"id IN ({id_filter_string})"
- logger.warning(f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}")
-
+ logger.warning(
+ f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}"
+ )
+
  self._table.delete(delete_condition)
- logger.info(f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}")
+ logger.info(
+ f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}"
+ )
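
Note: the merge_insert call chain used above is LanceDB's upsert API, and delete() takes the same SQL-style predicate that delete_documents() builds. A standalone sketch of both patterns (path, table name, and data illustrative; assumes lancedb and pyarrow are installed):

import lancedb
import pyarrow as pa

db = lancedb.connect("/tmp/lancedb-demo")
schema = pa.schema(
    [
        pa.field("id", pa.string(), nullable=False),
        pa.field("vector", pa.list_(pa.float32(), list_size=3)),
        pa.field("text", pa.string()),
        pa.field("metadata_json", pa.string()),
    ]
)
tbl = db.create_table("demo", schema=schema, mode="overwrite")

rows = pa.Table.from_pylist(
    [{"id": "a", "vector": [0.1, 0.2, 0.3], "text": "hello", "metadata_json": "{}"}],
    schema=schema,
)

# Upsert: update rows whose id matches, insert the rest -- the same call
# chain the service's add path uses.
tbl.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(rows)

# Delete with a SQL-style predicate, mirroring delete_documents().
tbl.delete("id IN ('a')")
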