natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/natural_pdf/exporters/original_pdf.py
@@ -0,0 +1,127 @@
+"""
+Module for exporting original PDF pages without modification.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Set, Union
+
+# Lazy import for optional dependency
+try:
+    import pikepdf
+except ImportError:
+    pikepdf = None
+
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.elements.collections import PageCollection
+
+logger = logging.getLogger(__name__)
+
+
+def create_original_pdf(
+    source: Union["Page", "PageCollection", "PDF"], output_path: Union[str, Path]
+):
+    """
+    Creates a new PDF file containing only the original, unmodified pages
+    specified by the source object.
+
+    Requires 'pikepdf'. Install with: pip install "natural-pdf[ocr-export]"
+
+    Args:
+        source: The Page, PageCollection, or PDF object indicating which pages to include.
+        output_path: The path to save the resulting PDF file.
+
+    Raises:
+        ImportError: If 'pikepdf' is not installed.
+        ValueError: If the source object is empty, pages are from different PDFs,
+                    or the source PDF path cannot be determined.
+        RuntimeError: If pikepdf fails to open the source or save the output.
+        pikepdf.PasswordError: If the source PDF is password-protected.
+    """
+    if pikepdf is None:
+        raise ImportError(
+            "Saving original PDF pages requires 'pikepdf'. "
+            'Install with: pip install "natural-pdf[ocr-export]"'
+        )
+
+    output_path_str = str(output_path)
+    pages_to_extract: List["Page"] = []
+
+    # Determine the list of pages and the source PDF path
+    if hasattr(source, "pages") and isinstance(source.pages, list):  # PDF or PageCollection
+        if not source.pages:
+            raise ValueError("Cannot save an empty collection/PDF.")
+        pages_to_extract = source.pages
+    elif hasattr(source, "page") and hasattr(source, "number"):  # Single Page object
+        # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
+        if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
+            pages_to_extract = [source]
+        else:
+            raise ValueError("Input Page object does not have a valid PDF reference with a path.")
+    else:
+        raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
+
+    if not pages_to_extract:
+        raise ValueError("No valid pages found in the source object.")
+
+    # Verify all pages come from the same PDF and get path
+    first_page_pdf_path = None
+    if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
+        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)
+
+    if not first_page_pdf_path:
+        raise ValueError(
+            "Cannot save original pages: Source PDF path not found for the first page."
+        )
+
+    page_indices_set: Set[int] = set()
+    for page in pages_to_extract:
+        page_pdf_path = getattr(getattr(page, "pdf", None), "path", None)
+        if not page_pdf_path or page_pdf_path != first_page_pdf_path:
+            raise ValueError(
+                "Cannot save original pages: All pages must belong to the same source PDF document."
+            )
+        page_indices_set.add(page.index)  # 0-based index
+
+    sorted_indices = sorted(list(page_indices_set))
+
+    logger.info(
+        f"Extracting original pages {sorted_indices} from '{first_page_pdf_path}' to '{output_path_str}'"
+    )
+
+    try:
+        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
+            target_pikepdf_doc = pikepdf.Pdf.new()
+
+            for page_index in sorted_indices:
+                if 0 <= page_index < len(source_pikepdf_doc.pages):
+                    # This correctly appends the pikepdf.Page object
+                    target_pikepdf_doc.pages.append(source_pikepdf_doc.pages[page_index])
+                else:
+                    logger.warning(
+                        f"Page index {page_index} out of bounds for source PDF '{first_page_pdf_path}'. Skipping."
+                    )
+
+            if not target_pikepdf_doc.pages:
+                raise RuntimeError(f"No valid pages found to save from source PDF.")
+
+            target_pikepdf_doc.save(output_path_str)
+            logger.info(
+                f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
+            )
+
+    except pikepdf.PasswordError:
+        logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
+        raise RuntimeError(
+            f"Source PDF '{first_page_pdf_path}' is password-protected."
+        ) from None  # Raise specific error without chaining the generic Exception
+    except Exception as e:
+        logger.error(
+            f"Failed to save original pages PDF to '{output_path_str}': {e}",
+            exc_info=True,
+        )
+        # Re-raise as RuntimeError for consistent API error handling
+        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
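For orientation, a minimal usage sketch of the new exporter. This is not part of the diff; the file names are made up, and it assumes pikepdf is installed and that natural-pdf exposes PDF at the package root:

from natural_pdf import PDF
from natural_pdf.exporters.original_pdf import create_original_pdf

pdf = PDF("report.pdf")                          # hypothetical input file
create_original_pdf(pdf.pages[0], "page1.pdf")   # a single Page
create_original_pdf(pdf, "copy.pdf")             # every page of the PDF

Per the docstring above, all pages passed in must come from the same source PDF, and the source must be reachable on disk via its path.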
--- a/natural_pdf/exporters/searchable_pdf.py
+++ b/natural_pdf/exporters/searchable_pdf.py
@@ -22,7 +22,7 @@ except ImportError:
     pikepdf = None  # type: ignore
 
 try:
-    from ocrmypdf.hocrtransform import HocrTransform
+    from natural_pdf.exporters.hocr import HocrTransform
 except ImportError:
     HocrTransform = None  # type: ignore
 
@@ -310,7 +310,7 @@ def create_searchable_pdf(
     """
     Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
 
-    Relies on ocrmypdf for hOCR transformation. Requires optional dependencies.
+    Relies on pikepdf for saving the PDF.
 
     Args:
         source: The natural_pdf.PDF, PageCollection, or Page object
@@ -318,16 +318,6 @@ def create_searchable_pdf(
         dpi: The resolution (dots per inch) for rendering page images and hOCR.
     """
 
-    # --- Ensure dependencies are loaded (they should be if installed) ---
-    if Image is None or pikepdf is None or HocrTransform is None:
-        # This should ideally not happen if dependencies are in main install,
-        # but serves as a safeguard during development or if install is broken.
-        raise ImportError(
-            "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
-            "Please ensure natural-pdf is installed correctly with all dependencies."
-        )
-    # --- End Safeguard Check ---
-
     # duck type to see if source has .pages, to populate pages =
    if hasattr(source, "pages"):
        pages = source.pages
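A hedged sketch of the exporter these hunks touch. The positional output path and the apply_ocr call are assumptions, since neither appears in this diff; only the source and dpi parameters are documented above:

from natural_pdf import PDF
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf

pdf = PDF("scan.pdf")                                  # hypothetical scanned input
pdf.apply_ocr()                                        # assumed: OCR results must exist before export
create_searchable_pdf(pdf, "scan_ocr.pdf", dpi=300)    # dpi documented in the hunk above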
--- a/natural_pdf/ocr/engine_surya.py
+++ b/natural_pdf/ocr/engine_surya.py
@@ -59,7 +59,7 @@ class SuryaOCREngine(OCREngine):
 
         # Store languages instance variable during initialization to use here
         langs = (
-            [[lang] for lang in self._langs]
+            [self._langs]  # Send all languages together in one list per image
             if hasattr(self, "_langs")
             else [[self.DEFAULT_LANGUAGES[0]]]
        )
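The effect of this one-line change on the shape of the langs argument, shown for a hypothetical engine configured with two languages:

langs_config = ["en", "fr"]               # stands in for self._langs

old = [[lang] for lang in langs_config]   # 0.1.10: [["en"], ["fr"]] - one sublist per language
new = [langs_config]                      # 0.1.12: [["en", "fr"]] - all languages in one list per image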
--- a/natural_pdf/search/__init__.py
+++ b/natural_pdf/search/__init__.py
@@ -3,29 +3,46 @@
 import logging
 from typing import Optional
 
-# --- Service Implementation Import ---
-# Import the concrete implementation
-from .haystack_search_service import HaystackSearchService
-
-# --- Utils Import ---
-from .haystack_utils import (  # Re-export flag and helper
-    HAS_HAYSTACK_EXTRAS,
-    check_haystack_availability,
-)
-
-# --- Option Imports (for convenience) ---
-# Make options easily available via `from natural_pdf.search import ...`
-from .search_options import SearchOptions  # Alias for TextSearchOptions for simplicity?
+# Import constants
+from .search_options import SearchOptions
 from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
-
-# --- Protocol Import ---
-# Import the protocol for type hinting
 from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
+# Check search extras availability
+LANCEDB_AVAILABLE = False
+SEARCH_DEPENDENCIES_AVAILABLE = False
+
+try:
+    import sentence_transformers
+    import numpy as np
+    # Basic search dependencies are available
+    SEARCH_DEPENDENCIES_AVAILABLE = True
+
+    # Check if LanceDB is available
+    try:
+        import lancedb
+        import pyarrow
+        LANCEDB_AVAILABLE = True
+        from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
+    except ImportError:
+        # LanceDB not available, we'll use NumPy fallback
+        LANCEDB_AVAILABLE = False
+        from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
+except ImportError:
+    # Basic dependencies missing
+    SEARCH_DEPENDENCIES_AVAILABLE = False
+    LANCEDB_AVAILABLE = False
+
 logger = logging.getLogger(__name__)
 
+def check_search_availability():
+    """Check if required search dependencies are available."""
+    if not SEARCH_DEPENDENCIES_AVAILABLE:
+        raise ImportError(
+            "Search functionality requires 'sentence-transformers' and NumPy. "
+            "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
+        )
 
-# Factory Function
 def get_search_service(
     collection_name: str,
     persist: bool = False,
@@ -34,53 +51,49 @@ def get_search_service(
 ) -> SearchServiceProtocol:
     """
     Factory function to get an instance of the configured search service.
-
-    A service instance is tied to a specific index name (collection/table).
-
-    Currently, only returns HaystackSearchService but is structured for future extension.
+
+    Automatically selects the best available implementation:
+    - LanceDB if installed (recommended for both in-memory and persistent)
+    - Numpy fallback for in-memory only
 
     Args:
-        collection_name: The logical name for the index this service instance manages
-                         (used as table_name for LanceDB).
+        collection_name: The logical name for the index/table this service instance manages.
         persist: If True, creates a service instance configured for persistent
-                 storage (currently LanceDB). If False (default), uses InMemory.
-        uri: Override the default path/URI for persistent storage.
+                 storage. If False (default), uses InMemory (via temp dir for LanceDB).
+        uri: Override the default path for persistent storage.
         default_embedding_model: Override the default embedding model used by the service.
-        **kwargs: Reserved for future configuration options.
 
     Returns:
-        An instance conforming to the SearchServiceProtocol for the specified collection/table.
+        An instance conforming to the SearchServiceProtocol.
     """
     logger.debug(
-        f"Calling get_search_service factory for index '{collection_name}' (persist={persist}, uri={uri})..."
+        f"Calling get_search_service factory for collection '{collection_name}' (persist={persist}, uri={uri})..."
     )
+    check_search_availability()
 
-    # Collect arguments relevant to HaystackSearchService.__init__
-    service_args = {}
-    service_args["table_name"] = collection_name
-    service_args["persist"] = persist
+    service_args = {
+        "collection_name": collection_name,
+        "persist": persist,
+    }
     if uri is not None:
         service_args["uri"] = uri
-    if default_embedding_model is not None:
-        service_args["embedding_model"] = default_embedding_model
 
-    # Cache logic commented out as before
+    if default_embedding_model is not None:
+        service_args["embedding_model_name"] = default_embedding_model
 
-    try:
-        service_instance = HaystackSearchService(**service_args)
-        logger.info(f"Created new HaystackSearchService instance for index '{collection_name}'.")
-        return service_instance
-    except ImportError as e:
-        # Error message remains valid
-        logger.error(
-            f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+    # If persistence is requested, LanceDB is required
+    if persist and not LANCEDB_AVAILABLE:
+        raise RuntimeError(
+            "Persistent vector search requires LanceDB. "
+            "Please install: pip install lancedb"
         )
-        raise ImportError(
-            "Search Service could not be created. Ensure Haystack extras are installed: pip install natural-pdf[haystack]"
-        ) from e
-    except Exception as e:
-        logger.error(f"Failed to instantiate Search Service: {e}", exc_info=True)
-        raise RuntimeError("Could not create Search Service instance.") from e
-
-
-# Default instance commented out as before
+
+    # Select the appropriate implementation
+    if LANCEDB_AVAILABLE:
+        logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
+        service_instance = LanceDBSearchService(**service_args)
+    else:
+        logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
+        service_instance = NumpySearchService(**service_args)
+
+    return service_instance
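A minimal sketch of the factory's new selection behavior, directly following the code above; the collection name is made up:

from natural_pdf.search import get_search_service

# In-memory index: LanceDB if installed, otherwise the NumPy fallback
service = get_search_service("articles")

# Persistent index: requires LanceDB, otherwise raises RuntimeError
persistent = get_search_service("articles", persist=True, uri="./my_index")

Note the dependency split this diff introduces: sentence-transformers and NumPy are the baseline for any search, while lancedb is only needed for persistence.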
--- /dev/null
+++ b/natural_pdf/search/lancedb_search_service.py
@@ -0,0 +1,325 @@
+import logging
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+import lancedb
+import pyarrow as pa
+from sentence_transformers import SentenceTransformer
+
+from .search_options import BaseSearchOptions
+from .search_service_protocol import (
+    Indexable,
+    IndexConfigurationError,
+    SearchServiceProtocol,
+)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+DEFAULT_LANCEDB_PERSIST_PATH = "./natural_pdf_lancedb_index"
+
+
+class LanceDBSearchService(SearchServiceProtocol):
+    """LanceDB-based implementation of the search service protocol."""
+
+    collection_name: str
+
+    def __init__(
+        self,
+        collection_name: str,
+        persist: bool = False,
+        uri: Optional[str] = None,
+        embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+    ):
+        self.collection_name = collection_name
+        self._persist = persist
+        self._uri = uri
+        self._embedding_model_name = embedding_model_name
+        self._embedding_dims: Optional[int] = None
+        self._db = None
+        self._table = None
+
+        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        test_embedding = self.embedding_model.encode("test")
+        self._embedding_dims = len(test_embedding)
+
+        if self._persist:
+            self._uri = self._uri if self._uri else DEFAULT_LANCEDB_PERSIST_PATH
+            logger.info(f"Initializing Persistent LanceDB client at path: {self._uri}")
+            Path(self._uri).mkdir(parents=True, exist_ok=True)
+        else:
+            self._temp_dir_obj = tempfile.TemporaryDirectory()
+            self._uri = self._temp_dir_obj.name
+            logger.info(f"Initializing In-Memory LanceDB client using temp path: {self._uri}")
+
+        self._db = lancedb.connect(self._uri)
+        self._open_or_create_table()
+        logger.info(
+            f"LanceDBSearchService initialized. Table '{self.collection_name}' (persist={self._persist} at '{self._uri}'). Model: '{self._embedding_model_name}', Dims: {self._embedding_dims}"
+        )
+
+    def _get_schema(self) -> pa.Schema:
+        if self._embedding_dims is None:
+            raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
+
+        return pa.schema([
+            pa.field("id", pa.string(), nullable=False),
+            pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+            pa.field("text", pa.string()),
+            pa.field("metadata_json", pa.string())
+        ])
+
+    def _open_or_create_table(self):
+        if self._db is None:
+            raise RuntimeError("LanceDB connection not established.")
+
+        table_names = self._db.table_names()
+
+        if self.collection_name in table_names:
+            logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
+            self._table = self._db.open_table(self.collection_name)
+        else:
+            logger.debug(f"Creating new LanceDB table: {self.collection_name} with schema.")
+            schema = self._get_schema()
+            self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
+
+    def __del__(self):
+        if not self._persist and hasattr(self, '_temp_dir_obj') and logger:
+            logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
+            self._temp_dir_obj.cleanup()
+
+    def index(
+        self,
+        documents: Iterable[Indexable],
+        embedder_device: Optional[str] = None,
+        force_reindex: bool = False,
+    ) -> None:
+        indexable_list = list(documents)
+        logger.info(
+            f"Index request for table='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model_name}', force={force_reindex}"
+        )
+
+        if self._table is None or self._db is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        if not indexable_list:
+            logger.warning("No documents provided for indexing. Skipping.")
+            return
+
+        if force_reindex:
+            logger.warning(
+                f"Force reindex requested for table '{self.collection_name}'. Deleting existing table and recreating."
+            )
+            self._db.drop_table(self.collection_name)
+            self._open_or_create_table()
+            logger.info(f"Table '{self.collection_name}' deleted and recreated.")
+
+        data_to_add = []
+        texts_to_embed: List[str] = []
+        original_items_info: List[Dict[str, Any]] = []
+
+        import json
+
+        for item in indexable_list:
+            doc_id = item.get_id()
+            metadata = item.get_metadata().copy()
+            content_obj = item.get_content()
+            content_text = ""
+
+            if isinstance(content_obj, str):
+                content_text = content_obj
+            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                content_text = content_obj.extract_text()
+                if not isinstance(content_text, str): content_text = str(content_obj)
+            else:
+                content_text = str(content_obj)
+
+            try:
+                content_hash = item.get_content_hash()
+                if content_hash: metadata["content_hash"] = content_hash
+            except (AttributeError, NotImplementedError): pass
+            except Exception as e: logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+            # Ensure doc_id is not None - use a fallback if needed
+            if doc_id is None:
+                # Generate a unique ID based on content hash or position in the list
+                try:
+                    doc_id = f"auto_{item.get_content_hash() if hasattr(item, 'get_content_hash') else hash(content_text)}"
+                except:
+                    doc_id = f"auto_{len(texts_to_embed)}"
+
+            texts_to_embed.append(content_text)
+            original_items_info.append({
+                "id": doc_id,
+                "metadata_json": json.dumps(metadata),
+                "text": content_text
+            })
+
+        if not texts_to_embed:
+            logger.warning("No text content to embed. Skipping.")
+            return
+
+        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+        generated_embeddings = self.embedding_model.encode(
+            texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+        )
+
+        for i, item_info in enumerate(original_items_info):
+            data_to_add.append({
+                "id": item_info["id"],
+                "vector": generated_embeddings[i].tolist(),
+                "text": item_info["text"],
+                "metadata_json": item_info["metadata_json"]
+            })
+
+        if not data_to_add:
+            logger.warning("No data prepared for LanceDB. Skipping add.")
+            return
+
+        # Create a PyArrow table with the same schema as the LanceDB table
+        schema = self._get_schema()
+        arrays = [
+            pa.array([item["id"] for item in data_to_add], type=pa.string()),
+            pa.array([item["vector"] for item in data_to_add]),
+            pa.array([item["text"] for item in data_to_add], type=pa.string()),
+            pa.array([item["metadata_json"] for item in data_to_add], type=pa.string()),
+        ]
+        table = pa.Table.from_arrays(arrays, schema=schema)
+
+        logger.info(f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'.")
+        self._table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
+            table,
+        )
+        logger.info(f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}")
+
+    def search(
+        self,
+        query: Any,
+        options: BaseSearchOptions,
+    ) -> List[Dict[str, Any]]:
+        if self._table is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        logger.info(f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}")
+        query_text = ""
+        if isinstance(query, (str, Path)): query_text = str(query)
+        elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+            query_text = query.extract_text()
+            if not query_text or not query_text.strip(): return []
+        else:
+            raise TypeError(f"Unsupported query type: {type(query)}")
+
+        query_vector = self.embedding_model.encode(query_text).tolist()
+
+        lancedb_filter = None
+        if options.filters:
+            if isinstance(options.filters, str):
+                lancedb_filter = options.filters
+            elif isinstance(options.filters, dict):
+                filter_parts = []
+                for k, v in options.filters.items():
+                    if isinstance(v, str):
+                        filter_parts.append(f"{k} = '{v}'")
+                    else:
+                        filter_parts.append(f"{k} = {v}")
+                if filter_parts:
+                    lancedb_filter = " AND ".join(filter_parts)
+                logger.warning(f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions.")
+
+        search_query = self._table.search(query_vector).limit(options.top_k)
+        if lancedb_filter:
+            search_query = search_query.where(lancedb_filter)
+
+        results_df = search_query.to_df()
+        final_results: List[Dict[str, Any]] = []
+        import json
+
+        for _, row in results_df.iterrows():
+            metadata = {}
+            if "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                except json.JSONDecodeError:
+                    logger.warning(f"Failed to parse metadata_json for id {row.get('id')}")
+
+            score = 1 - row["_distance"] if "_distance" in row else 0.0
+
+            final_results.append({
+                "id": row.get("id"),
+                "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+                "score": score,
+                "page_number": metadata.get("page_number"),
+                "pdf_path": metadata.get("pdf_path"),
+                "metadata": metadata,
+            })
+        logger.info(f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'.")
+        return final_results
+
+    def delete_index(self) -> bool:
+        if self._db is None:
+            logger.warning("LanceDB connection not initialized. Cannot delete index.")
+            return False
+        logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
+
+        self._db.drop_table(self.collection_name)
+        self._table = None
+        logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
+        return True
+
+    def index_exists(self) -> bool:
+        if self._db is None:
+            return False
+        exists = self.collection_name in self._db.table_names()
+        if exists:
+            tbl = self._db.open_table(self.collection_name)
+            count = tbl.count_rows()
+            logger.debug(f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}")
+            return count > 0
+
+        logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
+        return False
+
+    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        logger.debug(f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})...")
+
+        select_columns = ["id"]
+        if include_metadata:
+            select_columns.append("metadata_json")
+
+        lancedb_filter = kwargs.get("filters")
+
+        query = self._table.to_lance().scanner(columns=select_columns, filter=lancedb_filter)
+        results_table = query.to_table()
+        results_list = results_table.to_pylist()
+
+        formatted_docs: List[Dict[str, Any]] = []
+        import json
+        for row in results_list:
+            doc_data: Dict[str, Any] = {"id": row.get("id")}
+            if include_metadata and "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                    doc_data["meta"] = metadata
+                except json.JSONDecodeError:
+                    doc_data["meta"] = {}
+            formatted_docs.append(doc_data)
+        logger.info(f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'.")
+        return formatted_docs
+
+    def delete_documents(self, ids: List[str]) -> None:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        if not ids:
+            logger.debug("No document IDs provided for deletion. Skipping.")
+            return
+
+        id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
+        delete_condition = f"id IN ({id_filter_string})"
+        logger.warning(f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}")
+
+        self._table.delete(delete_condition)
+        logger.info(f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}")