local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
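Most of the entries above amount to a package-level reorganization: the single config.py becomes a config/ package (config_files.py, llm_config.py, search_config.py), and the misspelled utilties package is renamed to utilities. For downstream code the visible effect is the import paths. The sketch below is a hypothetical adjustment, assuming a consumer imported these modules directly; it uses only module paths that appear in the file list, since the public re-exports in local_deep_research/__init__.py are not shown here.

```python
# Hypothetical downstream import adjustment for the 0.1.26 -> 0.2.2 layout change.
#
# 0.1.26 (old paths, removed or renamed in 0.2.2):
#   from local_deep_research import config
#   from local_deep_research.utilties import enums, search_utilities
#
# 0.2.2 (module paths taken from the file list above):
from local_deep_research.config import config_files, llm_config, search_config
from local_deep_research.utilities import enums, search_utilities
```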
local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369 (per-file diff, summarized by hunk):

- @@ -1,46 +1,107 @@: imports are regrouped (adds uuid, concurrent.futures.ProcessPoolExecutor, faiss.IndexFlatL2, InMemoryDocstore, langchain_core Document and BaseLLM, plus "from ...config import search_config" and "from ..search_engine_base import BaseSearchEngine"), and two module-level helpers are added: _get_file_loader(), which picks a LangChain document loader by file extension with TextLoader as the fallback, and _load_document(), which loads a file and stamps source/filename metadata on each document.
- @@ -53,7 +114,7 @@, @@ -63,7 +124,7 @@ and @@ -71,29 +132,29 @@ class LocalEmbeddingManager: whitespace and argument-formatting cleanup in __init__ and its docstring.
- @@ -107,30 +168,38 @@ class LocalEmbeddingManager: the lazy embeddings property reflows its Ollama and SentenceTransformer initialization, logging, and the HuggingFaceEmbeddings fallback.
- @@ -139,86 +208,121 @@ class LocalEmbeddingManager: get_folder_hash() becomes a static method over a canonicalized path, _get_index_path() and _check_folder_modified() take Path arguments, and new _get_all_files() and _get_modified_files() helpers compare file modification times against the last index time and the recorded indexed_files.
- @@ -231,228 +335,225 @@ class LocalEmbeddingManager: the old get_file_loader() method is dropped in favor of the module-level helper; index_folder() now re-indexes only modified files (unless force_reindex is set or the embedding configuration changed), skips hidden files and directories, loads documents in parallel with a ProcessPoolExecutor, builds the FAISS store from an IndexFlatL2 index with an InMemoryDocstore, tracks per-file chunk IDs in indexed_files, and deletes chunks for files that no longer exist; search() converts its folder paths to pathlib.Path.
- @@ -461,74 +562,71 @@ class LocalEmbeddingManager: the remainder of search(), clear_cache() and get_indexed_folders_info() are reformatted, with the index path derived from Path(folder_info["path"]).
- @@ -547,7 +645,7 @@, @@ -565,7 +663,7 @@ and @@ -574,15 +672,19 @@ class LocalSearchEngine(BaseSearchEngine): constructor docstring, folder-validation logging and the default-collection setup are reflowed.
- @@ -591,339 +693,400 @@ class LocalSearchEngine(BaseSearchEngine): _index_folders(), _get_previews(), _get_full_content(), run(), get_collections_info(), reindex_collection() and from_config() are reformatted; the snippet-only check reads search_config.SEARCH_SNIPPETS_ONLY from the new config package, preview results report their folder via Path.as_posix() and collection membership via Path.is_relative_to(), and run() still supports the "collection:" query prefix for restricting a search to one collection.
- @@ -937,5 +1100,5 @@ class LocalSearchEngine(BaseSearchEngine): trailing-comma cleanup in the from_config() return.