natural_pdf-0.1.15-py3-none-any.whl → natural_pdf-0.1.17-py3-none-any.whl
This diff reflects the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +119 -76
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/describe/__init__.py +21 -0
- natural_pdf/describe/base.py +457 -0
- natural_pdf/describe/elements.py +411 -0
- natural_pdf/describe/mixin.py +84 -0
- natural_pdf/describe/summary.py +186 -0
- natural_pdf/elements/base.py +11 -10
- natural_pdf/elements/collections.py +116 -51
- natural_pdf/elements/region.py +204 -127
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/RECORD +44 -38
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.15.dist-info → natural_pdf-0.1.17.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -65,97 +65,62 @@ class EasyOCROptions(BaseOCROptions):
 # --- PaddleOCR Specific Options ---
 @dataclass
 class PaddleOCROptions(BaseOCROptions):
-    """
-    [51 lines of the old docstring and option fields were not captured in this diff view]
-    rec: bool = True
-    rec_algorithm: str = "CRNN"
-    rec_model_dir: Optional[str] = None
-    rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
-    rec_batch_num: int = 30 # Default from Paddle documentation
-    rec_image_shape: str = "3, 32, 320" # Kept as string per Paddle examples
-    rec_batch_num: int = 30 # Default from Paddle documentation
-    max_text_length: int = 25
-    rec_char_dict_path: Optional[str] = None # Path to char dictionary file
-    rec_char_dict_path: Optional[str] = None # Path to char dictionary file
-    use_space_char: bool = True
-    drop_score: float = 0.5
-
-    # Classification
-    cls: Optional[bool] = None # Often inferred from use_angle_cls
-    use_angle_cls: bool = False # Default from Paddle documentation
-    cls_model_dir: Optional[str] = None
-    cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
-    label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
-    cls_batch_num: int = 30
-
-    # Classification
-    cls: Optional[bool] = None # Often inferred from use_angle_cls
-    use_angle_cls: bool = False # Default from Paddle documentation
-    cls_model_dir: Optional[str] = None
-    cls_image_shape: str = "3, 48, 192" # Kept as string per Paddle examples
-    label_list: List[str] = field(default_factory=lambda: ["0", "180"]) # Default from Paddle doc
-    cls_batch_num: int = 30
+    """
+    Specific options for the PaddleOCR engine, reflecting the paddleocr>=3.0.0 API.
+    See: https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html
+    """
+
+    # --- Constructor Parameters ---
+
+    # Model paths and names
+    doc_orientation_classify_model_name: Optional[str] = None
+    doc_orientation_classify_model_dir: Optional[str] = None
+    doc_unwarping_model_name: Optional[str] = None
+    doc_unwarping_model_dir: Optional[str] = None
+    text_detection_model_name: Optional[str] = None
+    text_detection_model_dir: Optional[str] = None
+    textline_orientation_model_name: Optional[str] = None
+    textline_orientation_model_dir: Optional[str] = None
+    text_recognition_model_name: Optional[str] = None
+    text_recognition_model_dir: Optional[str] = None
+
+    # Module usage flags (can be overridden at predict time)
+    use_doc_orientation_classify: Optional[bool] = False
+    use_doc_unwarping: Optional[bool] = False
+    use_textline_orientation: Optional[bool] = False
+
+    # Batch sizes
+    textline_orientation_batch_size: Optional[int] = None
+    text_recognition_batch_size: Optional[int] = None
+
+    # Detection parameters (can be overridden at predict time)
+    # https://github.com/PaddlePaddle/PaddleOCR/issues/15424
+    text_det_limit_side_len: Optional[int] = 736 # WAITING FOR FIX
+    text_det_limit_type: Optional[str] = 'max' # WAITING FOR FIX
+    text_det_thresh: Optional[float] = None
+    text_det_box_thresh: Optional[float] = None
+    text_det_unclip_ratio: Optional[float] = None
+    text_det_input_shape: Optional[Tuple[int, int]] = None
+
+    # Recognition parameters (can be overridden at predict time)
+    text_rec_score_thresh: Optional[float] = None
+    text_rec_input_shape: Optional[Tuple[int, int, int]] = None
+
+    # General parameters
+    lang: Optional[str] = None
+    ocr_version: Optional[str] = None
+    device: Optional[str] = None
+    enable_hpi: Optional[bool] = None
+    use_tensorrt: Optional[bool] = None
+    precision: Optional[str] = None
+    enable_mkldnn: Optional[bool] = False # https://github.com/PaddlePaddle/PaddleOCR/issues/15294
+    # mkldnn_cache_capacity: Optional[int] = None
+    cpu_threads: Optional[int] = None
+    paddlex_config: Optional[str] = None
 
     def __post_init__(self):
         pass
 
-        # if self.use_gpu is None:
-        #     if self.device and "cuda" in self.device.lower():
-        #         self.use_gpu = True
-        #     else:
-        #         self.use_gpu = False
-        # # logger.debug(f"Initialized PaddleOCROptions: {self}")
-
 
 # --- Surya Specific Options ---
 @dataclass
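The upshot of this rewrite is that PaddleOCROptions now mirrors the paddleocr>=3.0.0 pipeline constructor rather than the legacy rec_*/cls_* flags. A minimal usage sketch follows: the option fields come from the diff above, while the PDF and apply_ocr entry points are assumptions about natural-pdf's public API, not something this diff confirms.

    # Sketch only: option fields are from the diff above; PDF and apply_ocr
    # are assumed natural-pdf entry points.
    from natural_pdf import PDF
    from natural_pdf.ocr.ocr_options import PaddleOCROptions

    options = PaddleOCROptions(
        lang="en",
        device="cpu",
        use_doc_orientation_classify=False,  # module flags, overridable at predict time
        use_doc_unwarping=False,
        use_textline_orientation=False,
        text_recognition_batch_size=16,
    )

    pdf = PDF("document.pdf")
    pdf.pages[0].apply_ocr(engine="paddle", options=options)
    print(pdf.pages[0].extract_text()[:200])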
natural_pdf/search/__init__.py
CHANGED
@@ -4,8 +4,12 @@ import logging
 from typing import Optional
 
 # Import constants
-from .search_options import
-
+from .search_options import (
+    BaseSearchOptions,
+    MultiModalSearchOptions,
+    SearchOptions,
+    TextSearchOptions,
+)
 from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
 # Check search extras availability
@@ -13,21 +17,27 @@ LANCEDB_AVAILABLE = False
 SEARCH_DEPENDENCIES_AVAILABLE = False
 
 try:
-    import sentence_transformers
     import numpy as np
+    import sentence_transformers
+
     # Basic search dependencies are available
     SEARCH_DEPENDENCIES_AVAILABLE = True
-
+
     # Check if LanceDB is available
     try:
         import lancedb
         import pyarrow
+
         LANCEDB_AVAILABLE = True
-        from .lancedb_search_service import
+        from .lancedb_search_service import (
+            DEFAULT_EMBEDDING_MODEL,
+            DEFAULT_LANCEDB_PERSIST_PATH,
+            LanceDBSearchService,
+        )
     except ImportError:
         # LanceDB not available, we'll use NumPy fallback
         LANCEDB_AVAILABLE = False
-        from .numpy_search_service import
+        from .numpy_search_service import DEFAULT_EMBEDDING_MODEL, NumpySearchService
 except ImportError:
     # Basic dependencies missing
     SEARCH_DEPENDENCIES_AVAILABLE = False
@@ -35,6 +45,7 @@ except ImportError:
 
 logger = logging.getLogger(__name__)
 
+
 def check_search_availability():
     """Check if required search dependencies are available."""
     if not SEARCH_DEPENDENCIES_AVAILABLE:
@@ -43,6 +54,7 @@ def check_search_availability():
             "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
         )
 
+
 def get_search_service(
     collection_name: str,
     persist: bool = False,
@@ -51,7 +63,7 @@
 ) -> SearchServiceProtocol:
     """
     Factory function to get an instance of the configured search service.
-
+
     Automatically selects the best available implementation:
     - LanceDB if installed (recommended for both in-memory and persistent)
     - Numpy fallback for in-memory only
@@ -84,16 +96,17 @@
     # If persistence is requested, LanceDB is required
     if persist and not LANCEDB_AVAILABLE:
         raise RuntimeError(
-            "Persistent vector search requires LanceDB. "
-            "Please install: pip install lancedb"
+            "Persistent vector search requires LanceDB. " "Please install: pip install lancedb"
         )
-
+
     # Select the appropriate implementation
    if LANCEDB_AVAILABLE:
         logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
         service_instance = LanceDBSearchService(**service_args)
     else:
-        logger.info(
+        logger.info(
+            f"Using NumPy fallback for in-memory vector search (collection: {collection_name})"
+        )
         service_instance = NumpySearchService(**service_args)
-
+
     return service_instance
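Taken together, the factory degrades gracefully: LanceDB is preferred whenever it is installed, persistence hard-requires it, and the NumPy service covers in-memory use. A short sketch of calling it; only collection_name and persist appear in the signature above, so treat everything else here as illustrative.

    # Sketch based on get_search_service as shown in this diff.
    from natural_pdf.search import check_search_availability, get_search_service

    check_search_availability()  # raises if sentence-transformers/numpy are missing

    # In-memory index: LanceDB if available, otherwise the NumPy fallback.
    service = get_search_service(collection_name="reports", persist=False)

    # Persistent index: raises RuntimeError when LanceDB is not installed.
    try:
        service = get_search_service(collection_name="reports", persist=True)
    except RuntimeError as err:
        print(f"pip install lancedb to enable persistence: {err}")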
natural_pdf/search/lancedb_search_service.py
CHANGED
@@ -63,20 +63,22 @@ class LanceDBSearchService(SearchServiceProtocol):
     def _get_schema(self) -> pa.Schema:
         if self._embedding_dims is None:
             raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
-
-        return pa.schema(
-        [5 lines of the old schema body were not captured in this diff view]
+
+        return pa.schema(
+            [
+                pa.field("id", pa.string(), nullable=False),
+                pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+                pa.field("text", pa.string()),
+                pa.field("metadata_json", pa.string()),
+            ]
+        )
 
     def _open_or_create_table(self):
         if self._db is None:
             raise RuntimeError("LanceDB connection not established.")
-
+
         table_names = self._db.table_names()
-
+
         if self.collection_name in table_names:
             logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
             self._table = self._db.open_table(self.collection_name)
@@ -86,7 +88,7 @@ class LanceDBSearchService(SearchServiceProtocol):
             self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
 
     def __del__(self):
-        if not self._persist and hasattr(self,
+        if not self._persist and hasattr(self, "_temp_dir_obj") and logger:
             logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
             self._temp_dir_obj.cleanup()
 
@@ -130,17 +132,23 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         if isinstance(content_obj, str):
             content_text = content_obj
-        elif hasattr(content_obj, "extract_text") and callable(
+        elif hasattr(content_obj, "extract_text") and callable(
+            getattr(content_obj, "extract_text")
+        ):
             content_text = content_obj.extract_text()
-            if not isinstance(content_text, str):
+            if not isinstance(content_text, str):
+                content_text = str(content_obj)
         else:
             content_text = str(content_obj)
 
         try:
             content_hash = item.get_content_hash()
-            if content_hash:
-
-        except
+            if content_hash:
+                metadata["content_hash"] = content_hash
+        except (AttributeError, NotImplementedError):
+            pass
+        except Exception as e:
+            logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
 
         # Ensure doc_id is not None - use a fallback if needed
         if doc_id is None:
@@ -151,28 +159,30 @@ class LanceDBSearchService(SearchServiceProtocol):
             doc_id = f"auto_{len(texts_to_embed)}"
 
         texts_to_embed.append(content_text)
-        original_items_info.append(
-            "id": doc_id,
-
-            "text": content_text
-        })
+        original_items_info.append(
+            {"id": doc_id, "metadata_json": json.dumps(metadata), "text": content_text}
+        )
 
         if not texts_to_embed:
             logger.warning("No text content to embed. Skipping.")
             return
 
-        logger.info(
+        logger.info(
+            f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'..."
+        )
         generated_embeddings = self.embedding_model.encode(
             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
         )
 
         for i, item_info in enumerate(original_items_info):
-            data_to_add.append(
-            [5 lines of the old entry body were not captured in this diff view]
+            data_to_add.append(
+                {
+                    "id": item_info["id"],
+                    "vector": generated_embeddings[i].tolist(),
+                    "text": item_info["text"],
+                    "metadata_json": item_info["metadata_json"],
+                }
+            )
 
         if not data_to_add:
             logger.warning("No data prepared for LanceDB. Skipping add.")
@@ -188,11 +198,17 @@ class LanceDBSearchService(SearchServiceProtocol):
         ]
         table = pa.Table.from_arrays(arrays, schema=schema)
 
-        logger.info(
-
+        logger.info(
+            f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'."
+        )
+        self._table.merge_insert(
+            "id"
+        ).when_matched_update_all().when_not_matched_insert_all().execute(
             table,
         )
-        logger.info(
+        logger.info(
+            f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}"
+        )
 
     def search(
         self,
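The rewritten add path is an upsert: merge_insert keyed on "id" updates rows whose IDs already exist and inserts the rest, so re-indexing the same documents does not duplicate them. A standalone sketch of the same LanceDB pattern against a throwaway table; the path, table name, and rows are illustrative.

    # Standalone illustration of the merge_insert upsert used above.
    import lancedb
    import pyarrow as pa

    db = lancedb.connect("/tmp/lancedb-demo")  # illustrative location
    schema = pa.schema(
        [
            pa.field("id", pa.string(), nullable=False),
            pa.field("vector", pa.list_(pa.float32(), list_size=4)),
            pa.field("text", pa.string()),
            pa.field("metadata_json", pa.string()),
        ]
    )
    table = db.create_table("demo", schema=schema, mode="overwrite")

    rows = [{"id": "doc-1", "vector": [0.1, 0.2, 0.3, 0.4], "text": "hello", "metadata_json": "{}"}]
    # Matching ids are updated, new ids are inserted -- an upsert keyed on "id".
    table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(rows)

Compared with a plain table.add(), this keeps exactly one row per document ID at the cost of a keyed merge.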
@@ -202,12 +218,16 @@ class LanceDBSearchService(SearchServiceProtocol):
         if self._table is None:
             raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
 
-        logger.info(
+        logger.info(
+            f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}"
+        )
         query_text = ""
-        if isinstance(query, (str, Path)):
+        if isinstance(query, (str, Path)):
+            query_text = str(query)
         elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
             query_text = query.extract_text()
-            if not query_text or not query_text.strip():
+            if not query_text or not query_text.strip():
+                return []
         else:
             raise TypeError(f"Unsupported query type: {type(query)}")
 
@@ -226,7 +246,9 @@ class LanceDBSearchService(SearchServiceProtocol):
                 filter_parts.append(f"{k} = {v}")
             if filter_parts:
                 lancedb_filter = " AND ".join(filter_parts)
-            logger.warning(
+            logger.warning(
+                f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions."
+            )
 
         search_query = self._table.search(query_vector).limit(options.top_k)
         if lancedb_filter:
@@ -246,15 +268,19 @@ class LanceDBSearchService(SearchServiceProtocol):
 
             score = 1 - row["_distance"] if "_distance" in row else 0.0
 
-            final_results.append(
-            [8 lines of the old result dict were not captured in this diff view]
+            final_results.append(
+                {
+                    "id": row.get("id"),
+                    "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+                    "score": score,
+                    "page_number": metadata.get("page_number"),
+                    "pdf_path": metadata.get("pdf_path"),
+                    "metadata": metadata,
+                }
+            )
+        logger.info(
+            f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'."
+        )
         return final_results
 
     def delete_index(self) -> bool:
@@ -262,29 +288,33 @@ class LanceDBSearchService(SearchServiceProtocol):
             logger.warning("LanceDB connection not initialized. Cannot delete index.")
             return False
         logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
-
+
         self._db.drop_table(self.collection_name)
         self._table = None
         logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
         return True
 
     def index_exists(self) -> bool:
-        if self._db is None:
+        if self._db is None:
             return False
         exists = self.collection_name in self._db.table_names()
         if exists:
             tbl = self._db.open_table(self.collection_name)
             count = tbl.count_rows()
-            logger.debug(
+            logger.debug(
+                f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}"
+            )
             return count > 0
-
+
         logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
         return False
 
     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
-        if self._table is None:
+        if self._table is None:
             raise RuntimeError("Table not initialized")
-        logger.debug(
+        logger.debug(
+            f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})..."
+        )
 
         select_columns = ["id"]
         if include_metadata:
@@ -298,6 +328,7 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         formatted_docs: List[Dict[str, Any]] = []
         import json
+
         for row in results_list:
             doc_data: Dict[str, Any] = {"id": row.get("id")}
             if include_metadata and "metadata_json" in row and row["metadata_json"]:
@@ -307,11 +338,13 @@ class LanceDBSearchService(SearchServiceProtocol):
                 except json.JSONDecodeError:
                     doc_data["meta"] = {}
             formatted_docs.append(doc_data)
-        logger.info(
+        logger.info(
+            f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'."
+        )
         return formatted_docs
 
     def delete_documents(self, ids: List[str]) -> None:
-        if self._table is None:
+        if self._table is None:
             raise RuntimeError("Table not initialized")
         if not ids:
             logger.debug("No document IDs provided for deletion. Skipping.")
@@ -319,7 +352,11 @@ class LanceDBSearchService(SearchServiceProtocol):
 
         id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
         delete_condition = f"id IN ({id_filter_string})"
-        logger.warning(
-
+        logger.warning(
+            f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}"
+        )
+
         self._table.delete(delete_condition)
-        logger.info(
+        logger.info(
+            f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}"
+        )
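Deletion in LanceDB is expressed as a SQL predicate, which is why delete_documents assembles an id IN (...) string before calling self._table.delete(). Continuing the throwaway table from the previous sketch:

    # Delete-by-predicate, mirroring delete_documents above.
    import lancedb

    db = lancedb.connect("/tmp/lancedb-demo")
    table = db.open_table("demo")

    ids = ["doc-1", "doc-2"]
    predicate = "id IN ({})".format(", ".join(f"'{i}'" for i in ids))
    table.delete(predicate)
    print(table.count_rows())

Note that the IDs are interpolated directly into the predicate string, so IDs containing single quotes would need escaping before this is safe on arbitrary input.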