natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +29 -40
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +120 -40
- natural_pdf/core/page.py +20 -18
- natural_pdf/core/pdf.py +146 -13
- natural_pdf/elements/base.py +17 -0
- natural_pdf/elements/collections.py +374 -30
- natural_pdf/elements/region.py +45 -14
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +519 -0
- natural_pdf/exporters/hocr_font.py +136 -0
- natural_pdf/exporters/original_pdf.py +127 -0
- natural_pdf/exporters/searchable_pdf.py +2 -12
- natural_pdf/ocr/engine_surya.py +1 -1
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/exporters/original_pdf.py
ADDED
@@ -0,0 +1,127 @@
+"""
+Module for exporting original PDF pages without modification.
+"""
+
+import logging
+import os
+from pathlib import Path
+from typing import TYPE_CHECKING, List, Set, Union
+
+# Lazy import for optional dependency
+try:
+    import pikepdf
+except ImportError:
+    pikepdf = None
+
+if TYPE_CHECKING:
+    from natural_pdf.core.page import Page
+    from natural_pdf.core.pdf import PDF
+    from natural_pdf.elements.collections import PageCollection
+
+logger = logging.getLogger(__name__)
+
+
+def create_original_pdf(
+    source: Union["Page", "PageCollection", "PDF"], output_path: Union[str, Path]
+):
+    """
+    Creates a new PDF file containing only the original, unmodified pages
+    specified by the source object.
+
+    Requires 'pikepdf'. Install with: pip install "natural-pdf[ocr-export]"
+
+    Args:
+        source: The Page, PageCollection, or PDF object indicating which pages to include.
+        output_path: The path to save the resulting PDF file.
+
+    Raises:
+        ImportError: If 'pikepdf' is not installed.
+        ValueError: If the source object is empty, pages are from different PDFs,
+                    or the source PDF path cannot be determined.
+        RuntimeError: If pikepdf fails to open the source or save the output.
+        pikepdf.PasswordError: If the source PDF is password-protected.
+    """
+    if pikepdf is None:
+        raise ImportError(
+            "Saving original PDF pages requires 'pikepdf'. "
+            'Install with: pip install "natural-pdf[ocr-export]"'
+        )
+
+    output_path_str = str(output_path)
+    pages_to_extract: List["Page"] = []
+
+    # Determine the list of pages and the source PDF path
+    if hasattr(source, "pages") and isinstance(source.pages, list):  # PDF or PageCollection
+        if not source.pages:
+            raise ValueError("Cannot save an empty collection/PDF.")
+        pages_to_extract = source.pages
+    elif hasattr(source, "page") and hasattr(source, "number"):  # Single Page object
+        # Check if it's a natural_pdf.core.page.Page or similar duck-typed object
+        if hasattr(source, "pdf") and source.pdf and hasattr(source.pdf, "path"):
+            pages_to_extract = [source]
+        else:
+            raise ValueError("Input Page object does not have a valid PDF reference with a path.")
+    else:
+        raise TypeError(f"Unsupported source type for create_original_pdf: {type(source)}")
+
+    if not pages_to_extract:
+        raise ValueError("No valid pages found in the source object.")
+
+    # Verify all pages come from the same PDF and get path
+    first_page_pdf_path = None
+    if hasattr(pages_to_extract[0], "pdf") and pages_to_extract[0].pdf:
+        first_page_pdf_path = getattr(pages_to_extract[0].pdf, "path", None)
+
+    if not first_page_pdf_path:
+        raise ValueError(
+            "Cannot save original pages: Source PDF path not found for the first page."
+        )
+
+    page_indices_set: Set[int] = set()
+    for page in pages_to_extract:
+        page_pdf_path = getattr(getattr(page, "pdf", None), "path", None)
+        if not page_pdf_path or page_pdf_path != first_page_pdf_path:
+            raise ValueError(
+                "Cannot save original pages: All pages must belong to the same source PDF document."
+            )
+        page_indices_set.add(page.index)  # 0-based index
+
+    sorted_indices = sorted(list(page_indices_set))
+
+    logger.info(
+        f"Extracting original pages {sorted_indices} from '{first_page_pdf_path}' to '{output_path_str}'"
+    )
+
+    try:
+        with pikepdf.Pdf.open(first_page_pdf_path) as source_pikepdf_doc:
+            target_pikepdf_doc = pikepdf.Pdf.new()
+
+            for page_index in sorted_indices:
+                if 0 <= page_index < len(source_pikepdf_doc.pages):
+                    # This correctly appends the pikepdf.Page object
+                    target_pikepdf_doc.pages.append(source_pikepdf_doc.pages[page_index])
+                else:
+                    logger.warning(
+                        f"Page index {page_index} out of bounds for source PDF '{first_page_pdf_path}'. Skipping."
+                    )
+
+            if not target_pikepdf_doc.pages:
+                raise RuntimeError(f"No valid pages found to save from source PDF.")
+
+            target_pikepdf_doc.save(output_path_str)
+            logger.info(
+                f"Successfully saved original pages PDF ({len(target_pikepdf_doc.pages)} pages) to: {output_path_str}"
+            )
+
+    except pikepdf.PasswordError:
+        logger.error(f"Failed to open password-protected source PDF: {first_page_pdf_path}")
+        raise RuntimeError(
+            f"Source PDF '{first_page_pdf_path}' is password-protected."
+        ) from None  # Raise specific error without chaining the generic Exception
+    except Exception as e:
+        logger.error(
+            f"Failed to save original pages PDF to '{output_path_str}': {e}",
+            exc_info=True,
+        )
+        # Re-raise as RuntimeError for consistent API error handling
+        raise RuntimeError(f"Failed to save original pages PDF: {e}") from e
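
For orientation, here is a minimal usage sketch of the new exporter. Only the `create_original_pdf(source, output_path)` signature comes from this diff; the `PDF` loading call and the page slice are assumptions about the surrounding natural-pdf API.

```python
# Hypothetical usage sketch -- only create_original_pdf itself is defined in this diff.
from natural_pdf import PDF  # assumed top-level loader
from natural_pdf.exporters.original_pdf import create_original_pdf

pdf = PDF("report.pdf")        # assumed: open a source PDF by path
first_pages = pdf.pages[:3]    # assumed: slicing yields a PageCollection with a .pages list

# Copies the selected pages, unmodified, into a new file via pikepdf.
# Raises ImportError if pikepdf is missing, ValueError if the pages span
# different source PDFs, RuntimeError if the source is password-protected.
create_original_pdf(first_pages, "report_first_three.pdf")
```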
natural_pdf/exporters/searchable_pdf.py
CHANGED
@@ -22,7 +22,7 @@ except ImportError:
     pikepdf = None  # type: ignore
 
 try:
-    from
+    from natural_pdf.exporters.hocr import HocrTransform
 except ImportError:
     HocrTransform = None  # type: ignore
 
@@ -310,7 +310,7 @@ def create_searchable_pdf(
     """
    Creates a searchable PDF from a natural_pdf.PDF object using OCR results.
 
-    Relies on
+    Relies on pikepdf for saving the PDF.
 
    Args:
        source: The natural_pdf.PDF, PageCollection, or Page object
@@ -318,16 +318,6 @@ def create_searchable_pdf(
        dpi: The resolution (dots per inch) for rendering page images and hOCR.
    """
 
-    # --- Ensure dependencies are loaded (they should be if installed) ---
-    if Image is None or pikepdf is None or HocrTransform is None:
-        # This should ideally not happen if dependencies are in main install,
-        # but serves as a safeguard during development or if install is broken.
-        raise ImportError(
-            "Required dependencies (Pillow, pikepdf, ocrmypdf) are missing. "
-            "Please ensure natural-pdf is installed correctly with all dependencies."
-        )
-    # --- End Safeguard Check ---
-
     # duck type to see if source has .pages, to populate pages =
     if hasattr(source, "pages"):
         pages = source.pages
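
A similarly hedged sketch for the searchable-PDF exporter touched above: the diff only shows fragments of `create_searchable_pdf` (parts of its docstring and the removed safeguard block), so the output-path argument and the OCR step below are assumptions, not confirmed by this diff.

```python
# Sketch only: the full create_searchable_pdf signature is not visible in this diff.
from natural_pdf import PDF  # assumed top-level loader
from natural_pdf.exporters.searchable_pdf import create_searchable_pdf

pdf = PDF("scanned.pdf")   # assumed: open a scanned, image-only PDF
pdf.apply_ocr()            # assumed: OCR results must exist before export

# dpi controls the resolution used to render page images and hOCR (per the docstring above).
create_searchable_pdf(pdf, "scanned_searchable.pdf", dpi=300)
```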
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -59,7 +59,7 @@ class SuryaOCREngine(OCREngine):
 
         # Store languages instance variable during initialization to use here
         langs = (
-            [
+            [self._langs]  # Send all languages together in one list per image
             if hasattr(self, "_langs")
             else [[self.DEFAULT_LANGUAGES[0]]]
         )
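
The Surya change above only alters the shape of the language argument: all configured languages are now sent together in one inner list per image. A schematic of that shape (the variable names and values below are illustrative, not engine code):

```python
# Schematic of the language-list shape after this change; not actual engine code.
configured_langs = ["en", "de"]        # stand-in for self._langs

langs_per_image = [configured_langs]   # -> [["en", "de"]]: one list holding all languages
default_langs = [["en"]]               # unchanged default: first DEFAULT_LANGUAGES entry only
```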
natural_pdf/search/__init__.py
CHANGED
@@ -3,29 +3,46 @@
 import logging
 from typing import Optional
 
-#
-
-from .haystack_search_service import HaystackSearchService
-
-# --- Utils Import ---
-from .haystack_utils import (  # Re-export flag and helper
-    HAS_HAYSTACK_EXTRAS,
-    check_haystack_availability,
-)
-
-# --- Option Imports (for convenience) ---
-# Make options easily available via `from natural_pdf.search import ...`
-from .search_options import SearchOptions  # Alias for TextSearchOptions for simplicity?
+# Import constants
+from .search_options import SearchOptions
 from .search_options import BaseSearchOptions, MultiModalSearchOptions, TextSearchOptions
-
-# --- Protocol Import ---
-# Import the protocol for type hinting
 from .search_service_protocol import Indexable, IndexConfigurationError, SearchServiceProtocol
 
+# Check search extras availability
+LANCEDB_AVAILABLE = False
+SEARCH_DEPENDENCIES_AVAILABLE = False
+
+try:
+    import sentence_transformers
+    import numpy as np
+    # Basic search dependencies are available
+    SEARCH_DEPENDENCIES_AVAILABLE = True
+
+    # Check if LanceDB is available
+    try:
+        import lancedb
+        import pyarrow
+        LANCEDB_AVAILABLE = True
+        from .lancedb_search_service import LanceDBSearchService, DEFAULT_LANCEDB_PERSIST_PATH, DEFAULT_EMBEDDING_MODEL
+    except ImportError:
+        # LanceDB not available, we'll use NumPy fallback
+        LANCEDB_AVAILABLE = False
+        from .numpy_search_service import NumpySearchService, DEFAULT_EMBEDDING_MODEL
+except ImportError:
+    # Basic dependencies missing
+    SEARCH_DEPENDENCIES_AVAILABLE = False
+    LANCEDB_AVAILABLE = False
+
 logger = logging.getLogger(__name__)
 
+def check_search_availability():
+    """Check if required search dependencies are available."""
+    if not SEARCH_DEPENDENCIES_AVAILABLE:
+        raise ImportError(
+            "Search functionality requires 'sentence-transformers' and NumPy. "
+            "Install with: pip install natural-pdf[search] (or pip install sentence-transformers numpy)"
+        )
 
-# Factory Function
 def get_search_service(
     collection_name: str,
     persist: bool = False,
@@ -34,53 +51,49 @@ def get_search_service(
 ) -> SearchServiceProtocol:
     """
     Factory function to get an instance of the configured search service.
-
-
-
-
+
+    Automatically selects the best available implementation:
+    - LanceDB if installed (recommended for both in-memory and persistent)
+    - Numpy fallback for in-memory only
 
     Args:
-        collection_name: The logical name for the index this service instance manages
-                         (used as table_name for LanceDB).
+        collection_name: The logical name for the index/table this service instance manages.
         persist: If True, creates a service instance configured for persistent
-                 storage
-        uri: Override the default path
+                 storage. If False (default), uses InMemory (via temp dir for LanceDB).
+        uri: Override the default path for persistent storage.
         default_embedding_model: Override the default embedding model used by the service.
-        **kwargs: Reserved for future configuration options.
 
     Returns:
-        An instance conforming to the SearchServiceProtocol
+        An instance conforming to the SearchServiceProtocol.
     """
     logger.debug(
-        f"Calling get_search_service factory for
+        f"Calling get_search_service factory for collection '{collection_name}' (persist={persist}, uri={uri})..."
     )
+    check_search_availability()
 
-
-
-
-
+    service_args = {
+        "collection_name": collection_name,
+        "persist": persist,
+    }
     if uri is not None:
         service_args["uri"] = uri
-    if default_embedding_model is not None:
-        service_args["embedding_model"] = default_embedding_model
 
-
+    if default_embedding_model is not None:
+        service_args["embedding_model_name"] = default_embedding_model
 
-
-
-
-
-
-        # Error message remains valid
-        logger.error(
-            f"Failed to instantiate Search Service due to missing dependencies: {e}", exc_info=True
+    # If persistence is requested, LanceDB is required
+    if persist and not LANCEDB_AVAILABLE:
+        raise RuntimeError(
+            "Persistent vector search requires LanceDB. "
+            "Please install: pip install lancedb"
         )
-
-
-
-
-
-
-
-
-
+
+    # Select the appropriate implementation
+    if LANCEDB_AVAILABLE:
+        logger.info(f"Using LanceDB for vector search (collection: {collection_name})")
+        service_instance = LanceDBSearchService(**service_args)
+    else:
+        logger.info(f"Using NumPy fallback for in-memory vector search (collection: {collection_name})")
+        service_instance = NumpySearchService(**service_args)
+
+    return service_instance
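
The factory above picks the backend automatically; a short sketch of how a caller might exercise it. The `get_search_service` arguments are taken from the code above, but the `TextSearchOptions` constructor arguments are assumptions not shown in this diff.

```python
# Sketch based on the get_search_service code above; TextSearchOptions(top_k=...)
# is an assumption about that options class, not confirmed by this diff.
from natural_pdf.search import get_search_service, TextSearchOptions

# In-memory index: LanceDB if installed, otherwise the NumPy fallback.
service = get_search_service(collection_name="my_docs")

# Persistent index: raises RuntimeError when LanceDB is not installed.
persistent = get_search_service(
    collection_name="my_docs",
    persist=True,
    uri="./my_index",  # optional override of the default persist path
)

# After documents have been indexed, search follows the SearchServiceProtocol.
results = service.search("total revenue", options=TextSearchOptions(top_k=5))
```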
natural_pdf/search/lancedb_search_service.py
ADDED
@@ -0,0 +1,325 @@
+import logging
+import shutil
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, Optional
+
+import lancedb
+import pyarrow as pa
+from sentence_transformers import SentenceTransformer
+
+from .search_options import BaseSearchOptions
+from .search_service_protocol import (
+    Indexable,
+    IndexConfigurationError,
+    SearchServiceProtocol,
+)
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+DEFAULT_LANCEDB_PERSIST_PATH = "./natural_pdf_lancedb_index"
+
+
+class LanceDBSearchService(SearchServiceProtocol):
+    """LanceDB-based implementation of the search service protocol."""
+
+    collection_name: str
+
+    def __init__(
+        self,
+        collection_name: str,
+        persist: bool = False,
+        uri: Optional[str] = None,
+        embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+    ):
+        self.collection_name = collection_name
+        self._persist = persist
+        self._uri = uri
+        self._embedding_model_name = embedding_model_name
+        self._embedding_dims: Optional[int] = None
+        self._db = None
+        self._table = None
+
+        self.embedding_model = SentenceTransformer(self._embedding_model_name)
+        test_embedding = self.embedding_model.encode("test")
+        self._embedding_dims = len(test_embedding)
+
+        if self._persist:
+            self._uri = self._uri if self._uri else DEFAULT_LANCEDB_PERSIST_PATH
+            logger.info(f"Initializing Persistent LanceDB client at path: {self._uri}")
+            Path(self._uri).mkdir(parents=True, exist_ok=True)
+        else:
+            self._temp_dir_obj = tempfile.TemporaryDirectory()
+            self._uri = self._temp_dir_obj.name
+            logger.info(f"Initializing In-Memory LanceDB client using temp path: {self._uri}")
+
+        self._db = lancedb.connect(self._uri)
+        self._open_or_create_table()
+        logger.info(
+            f"LanceDBSearchService initialized. Table '{self.collection_name}' (persist={self._persist} at '{self._uri}'). Model: '{self._embedding_model_name}', Dims: {self._embedding_dims}"
+        )
+
+    def _get_schema(self) -> pa.Schema:
+        if self._embedding_dims is None:
+            raise RuntimeError("Embedding dimensions not determined. Cannot create schema.")
+
+        return pa.schema([
+            pa.field("id", pa.string(), nullable=False),
+            pa.field("vector", pa.list_(pa.float32(), list_size=self._embedding_dims)),
+            pa.field("text", pa.string()),
+            pa.field("metadata_json", pa.string())
+        ])
+
+    def _open_or_create_table(self):
+        if self._db is None:
+            raise RuntimeError("LanceDB connection not established.")
+
+        table_names = self._db.table_names()
+
+        if self.collection_name in table_names:
+            logger.debug(f"Opening existing LanceDB table: {self.collection_name}")
+            self._table = self._db.open_table(self.collection_name)
+        else:
+            logger.debug(f"Creating new LanceDB table: {self.collection_name} with schema.")
+            schema = self._get_schema()
+            self._table = self._db.create_table(self.collection_name, schema=schema, mode="create")
+
+    def __del__(self):
+        if not self._persist and hasattr(self, '_temp_dir_obj') and logger:
+            logger.debug(f"Cleaning up temporary directory for in-memory LanceDB: {self._uri}")
+            self._temp_dir_obj.cleanup()
+
+    def index(
+        self,
+        documents: Iterable[Indexable],
+        embedder_device: Optional[str] = None,
+        force_reindex: bool = False,
+    ) -> None:
+        indexable_list = list(documents)
+        logger.info(
+            f"Index request for table='{self.collection_name}', docs={len(indexable_list)}, model='{self._embedding_model_name}', force={force_reindex}"
+        )
+
+        if self._table is None or self._db is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        if not indexable_list:
+            logger.warning("No documents provided for indexing. Skipping.")
+            return
+
+        if force_reindex:
+            logger.warning(
+                f"Force reindex requested for table '{self.collection_name}'. Deleting existing table and recreating."
+            )
+            self._db.drop_table(self.collection_name)
+            self._open_or_create_table()
+            logger.info(f"Table '{self.collection_name}' deleted and recreated.")
+
+        data_to_add = []
+        texts_to_embed: List[str] = []
+        original_items_info: List[Dict[str, Any]] = []
+
+        import json
+
+        for item in indexable_list:
+            doc_id = item.get_id()
+            metadata = item.get_metadata().copy()
+            content_obj = item.get_content()
+            content_text = ""
+
+            if isinstance(content_obj, str):
+                content_text = content_obj
+            elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                content_text = content_obj.extract_text()
+                if not isinstance(content_text, str): content_text = str(content_obj)
+            else:
+                content_text = str(content_obj)
+
+            try:
+                content_hash = item.get_content_hash()
+                if content_hash: metadata["content_hash"] = content_hash
+            except (AttributeError, NotImplementedError): pass
+            except Exception as e: logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+            # Ensure doc_id is not None - use a fallback if needed
+            if doc_id is None:
+                # Generate a unique ID based on content hash or position in the list
+                try:
+                    doc_id = f"auto_{item.get_content_hash() if hasattr(item, 'get_content_hash') else hash(content_text)}"
+                except:
+                    doc_id = f"auto_{len(texts_to_embed)}"
+
+            texts_to_embed.append(content_text)
+            original_items_info.append({
+                "id": doc_id,
+                "metadata_json": json.dumps(metadata),
+                "text": content_text
+            })
+
+        if not texts_to_embed:
+            logger.warning("No text content to embed. Skipping.")
+            return
+
+        logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+        generated_embeddings = self.embedding_model.encode(
+            texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+        )
+
+        for i, item_info in enumerate(original_items_info):
+            data_to_add.append({
+                "id": item_info["id"],
+                "vector": generated_embeddings[i].tolist(),
+                "text": item_info["text"],
+                "metadata_json": item_info["metadata_json"]
+            })
+
+        if not data_to_add:
+            logger.warning("No data prepared for LanceDB. Skipping add.")
+            return
+
+        # Create a PyArrow table with the same schema as the LanceDB table
+        schema = self._get_schema()
+        arrays = [
+            pa.array([item["id"] for item in data_to_add], type=pa.string()),
+            pa.array([item["vector"] for item in data_to_add]),
+            pa.array([item["text"] for item in data_to_add], type=pa.string()),
+            pa.array([item["metadata_json"] for item in data_to_add], type=pa.string()),
+        ]
+        table = pa.Table.from_arrays(arrays, schema=schema)
+
+        logger.info(f"Adding/updating {len(data_to_add)} documents to LanceDB table '{self.collection_name}'.")
+        self._table.merge_insert("id").when_matched_update_all().when_not_matched_insert_all().execute(
+            table,
+        )
+        logger.info(f"Successfully added/updated {len(data_to_add)} documents. Table count: {self._table.count_rows()}")
+
+    def search(
+        self,
+        query: Any,
+        options: BaseSearchOptions,
+    ) -> List[Dict[str, Any]]:
+        if self._table is None:
+            raise RuntimeError(f"LanceDB table '{self.collection_name}' not initialized.")
+
+        logger.info(f"Search request for table='{self.collection_name}', query_type={type(query).__name__}, options={options}")
+        query_text = ""
+        if isinstance(query, (str, Path)): query_text = str(query)
+        elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+            query_text = query.extract_text()
+            if not query_text or not query_text.strip(): return []
+        else:
+            raise TypeError(f"Unsupported query type: {type(query)}")
+
+        query_vector = self.embedding_model.encode(query_text).tolist()
+
+        lancedb_filter = None
+        if options.filters:
+            if isinstance(options.filters, str):
+                lancedb_filter = options.filters
+            elif isinstance(options.filters, dict):
+                filter_parts = []
+                for k, v in options.filters.items():
+                    if isinstance(v, str):
+                        filter_parts.append(f"{k} = '{v}'")
+                    else:
+                        filter_parts.append(f"{k} = {v}")
+                if filter_parts:
+                    lancedb_filter = " AND ".join(filter_parts)
+                logger.warning(f"Filter conversion from dict is basic: {options.filters} -> {lancedb_filter}. For metadata_json, use SQL path expressions.")
+
+        search_query = self._table.search(query_vector).limit(options.top_k)
+        if lancedb_filter:
+            search_query = search_query.where(lancedb_filter)
+
+        results_df = search_query.to_df()
+        final_results: List[Dict[str, Any]] = []
+        import json
+
+        for _, row in results_df.iterrows():
+            metadata = {}
+            if "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                except json.JSONDecodeError:
+                    logger.warning(f"Failed to parse metadata_json for id {row.get('id')}")
+
+            score = 1 - row["_distance"] if "_distance" in row else 0.0
+
+            final_results.append({
+                "id": row.get("id"),
+                "content_snippet": row["text"][:200] if "text" in row and row["text"] else "",
+                "score": score,
+                "page_number": metadata.get("page_number"),
+                "pdf_path": metadata.get("pdf_path"),
+                "metadata": metadata,
+            })
+        logger.info(f"Search returned {len(final_results)} results from LanceDB table '{self.collection_name}'.")
+        return final_results
+
+    def delete_index(self) -> bool:
+        if self._db is None:
+            logger.warning("LanceDB connection not initialized. Cannot delete index.")
+            return False
+        logger.warning(f"Request to delete LanceDB table '{self.collection_name}'.")
+
+        self._db.drop_table(self.collection_name)
+        self._table = None
+        logger.info(f"LanceDB table '{self.collection_name}' deleted successfully.")
+        return True
+
+    def index_exists(self) -> bool:
+        if self._db is None:
+            return False
+        exists = self.collection_name in self._db.table_names()
+        if exists:
+            tbl = self._db.open_table(self.collection_name)
+            count = tbl.count_rows()
+            logger.debug(f"LanceDB table '{self.collection_name}' found with {count} documents. Exists: {count > 0}")
+            return count > 0
+
+        logger.debug(f"LanceDB table '{self.collection_name}' not found in db.table_names().")
+        return False
+
+    def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        logger.debug(f"Listing documents for LanceDB table '{self.collection_name}' (include_metadata={include_metadata})...")
+
+        select_columns = ["id"]
+        if include_metadata:
+            select_columns.append("metadata_json")
+
+        lancedb_filter = kwargs.get("filters")
+
+        query = self._table.to_lance().scanner(columns=select_columns, filter=lancedb_filter)
+        results_table = query.to_table()
+        results_list = results_table.to_pylist()
+
+        formatted_docs: List[Dict[str, Any]] = []
+        import json
+        for row in results_list:
+            doc_data: Dict[str, Any] = {"id": row.get("id")}
+            if include_metadata and "metadata_json" in row and row["metadata_json"]:
+                try:
+                    metadata = json.loads(row["metadata_json"])
+                    doc_data["meta"] = metadata
+                except json.JSONDecodeError:
+                    doc_data["meta"] = {}
+            formatted_docs.append(doc_data)
+        logger.info(f"Retrieved {len(formatted_docs)} documents from LanceDB table '{self.collection_name}'.")
+        return formatted_docs
+
+    def delete_documents(self, ids: List[str]) -> None:
+        if self._table is None:
+            raise RuntimeError("Table not initialized")
+        if not ids:
+            logger.debug("No document IDs provided for deletion. Skipping.")
+            return
+
+        id_filter_string = ", ".join([f"'{doc_id}'" for doc_id in ids])
+        delete_condition = f"id IN ({id_filter_string})"
+        logger.warning(f"Request to delete {len(ids)} documents from LanceDB table '{self.collection_name}' with condition: {delete_condition}")
+
+        self._table.delete(delete_condition)
+        logger.info(f"Successfully requested deletion of {len(ids)} documents. Table count now: {self._table.count_rows()}")