local-deep-research 0.3.12__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/filters/base_filter.py +2 -3
  3. local_deep_research/advanced_search_system/filters/cross_engine_filter.py +4 -5
  4. local_deep_research/advanced_search_system/filters/journal_reputation_filter.py +298 -0
  5. local_deep_research/advanced_search_system/findings/repository.py +0 -3
  6. local_deep_research/advanced_search_system/strategies/base_strategy.py +1 -2
  7. local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +14 -18
  8. local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +4 -8
  9. local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +5 -6
  10. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -2
  11. local_deep_research/advanced_search_system/strategies/standard_strategy.py +9 -7
  12. local_deep_research/api/benchmark_functions.py +288 -0
  13. local_deep_research/api/research_functions.py +8 -4
  14. local_deep_research/benchmarks/README.md +162 -0
  15. local_deep_research/benchmarks/__init__.py +51 -0
  16. local_deep_research/benchmarks/benchmark_functions.py +353 -0
  17. local_deep_research/benchmarks/cli/__init__.py +16 -0
  18. local_deep_research/benchmarks/cli/benchmark_commands.py +338 -0
  19. local_deep_research/benchmarks/cli.py +347 -0
  20. local_deep_research/benchmarks/comparison/__init__.py +12 -0
  21. local_deep_research/benchmarks/comparison/evaluator.py +768 -0
  22. local_deep_research/benchmarks/datasets/__init__.py +53 -0
  23. local_deep_research/benchmarks/datasets/base.py +295 -0
  24. local_deep_research/benchmarks/datasets/browsecomp.py +116 -0
  25. local_deep_research/benchmarks/datasets/custom_dataset_template.py +98 -0
  26. local_deep_research/benchmarks/datasets/simpleqa.py +74 -0
  27. local_deep_research/benchmarks/datasets/utils.py +116 -0
  28. local_deep_research/benchmarks/datasets.py +31 -0
  29. local_deep_research/benchmarks/efficiency/__init__.py +14 -0
  30. local_deep_research/benchmarks/efficiency/resource_monitor.py +367 -0
  31. local_deep_research/benchmarks/efficiency/speed_profiler.py +214 -0
  32. local_deep_research/benchmarks/evaluators/__init__.py +18 -0
  33. local_deep_research/benchmarks/evaluators/base.py +74 -0
  34. local_deep_research/benchmarks/evaluators/browsecomp.py +83 -0
  35. local_deep_research/benchmarks/evaluators/composite.py +121 -0
  36. local_deep_research/benchmarks/evaluators/simpleqa.py +271 -0
  37. local_deep_research/benchmarks/graders.py +410 -0
  38. local_deep_research/benchmarks/metrics/README.md +80 -0
  39. local_deep_research/benchmarks/metrics/__init__.py +24 -0
  40. local_deep_research/benchmarks/metrics/calculation.py +385 -0
  41. local_deep_research/benchmarks/metrics/reporting.py +155 -0
  42. local_deep_research/benchmarks/metrics/visualization.py +205 -0
  43. local_deep_research/benchmarks/metrics.py +11 -0
  44. local_deep_research/benchmarks/optimization/__init__.py +32 -0
  45. local_deep_research/benchmarks/optimization/api.py +274 -0
  46. local_deep_research/benchmarks/optimization/metrics.py +20 -0
  47. local_deep_research/benchmarks/optimization/optuna_optimizer.py +1163 -0
  48. local_deep_research/benchmarks/runners.py +434 -0
  49. local_deep_research/benchmarks/templates.py +65 -0
  50. local_deep_research/config/llm_config.py +26 -23
  51. local_deep_research/config/search_config.py +1 -5
  52. local_deep_research/defaults/default_settings.json +108 -7
  53. local_deep_research/search_system.py +16 -8
  54. local_deep_research/utilities/db_utils.py +3 -6
  55. local_deep_research/utilities/es_utils.py +441 -0
  56. local_deep_research/utilities/log_utils.py +36 -0
  57. local_deep_research/utilities/search_utilities.py +8 -9
  58. local_deep_research/web/app.py +7 -9
  59. local_deep_research/web/app_factory.py +9 -12
  60. local_deep_research/web/database/migrations.py +8 -5
  61. local_deep_research/web/database/models.py +20 -0
  62. local_deep_research/web/database/schema_upgrade.py +5 -8
  63. local_deep_research/web/models/database.py +15 -18
  64. local_deep_research/web/routes/benchmark_routes.py +427 -0
  65. local_deep_research/web/routes/research_routes.py +13 -17
  66. local_deep_research/web/routes/settings_routes.py +264 -67
  67. local_deep_research/web/services/research_service.py +47 -57
  68. local_deep_research/web/services/settings_manager.py +1 -4
  69. local_deep_research/web/services/settings_service.py +4 -6
  70. local_deep_research/web/static/css/styles.css +12 -0
  71. local_deep_research/web/static/js/components/logpanel.js +164 -155
  72. local_deep_research/web/static/js/components/research.js +44 -3
  73. local_deep_research/web/static/js/components/settings.js +27 -0
  74. local_deep_research/web/static/js/services/socket.js +47 -0
  75. local_deep_research/web_search_engines/default_search_engines.py +38 -0
  76. local_deep_research/web_search_engines/engines/meta_search_engine.py +100 -33
  77. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +31 -17
  78. local_deep_research/web_search_engines/engines/search_engine_brave.py +8 -3
  79. local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py +343 -0
  80. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +14 -6
  81. local_deep_research/web_search_engines/engines/search_engine_local.py +19 -23
  82. local_deep_research/web_search_engines/engines/search_engine_local_all.py +9 -12
  83. local_deep_research/web_search_engines/engines/search_engine_searxng.py +12 -17
  84. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +8 -4
  85. local_deep_research/web_search_engines/search_engine_base.py +22 -5
  86. local_deep_research/web_search_engines/search_engine_factory.py +32 -11
  87. local_deep_research/web_search_engines/search_engines_config.py +14 -1
  88. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/METADATA +10 -2
  89. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/RECORD +92 -49
  90. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/WHEEL +0 -0
  91. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/entry_points.txt +0 -0
  92. {local_deep_research-0.3.12.dist-info → local_deep_research-0.4.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py
@@ -0,0 +1,343 @@
+ import json
+ import logging
+ from typing import Any, Dict, List, Optional
+
+ from elasticsearch import Elasticsearch
+ from langchain_core.language_models import BaseLLM
+
+ from ...config import search_config
+ from ..search_engine_base import BaseSearchEngine
+
+ logger = logging.getLogger(__name__)
+
+
+ class ElasticsearchSearchEngine(BaseSearchEngine):
+     """Elasticsearch search engine implementation with two-phase approach"""
+
+     def __init__(
+         self,
+         hosts: List[str] = ["http://localhost:9200"],
+         index_name: str = "documents",
+         username: Optional[str] = None,
+         password: Optional[str] = None,
+         api_key: Optional[str] = None,
+         cloud_id: Optional[str] = None,
+         max_results: int = 10,
+         highlight_fields: List[str] = ["content", "title"],
+         search_fields: List[str] = ["content", "title"],
+         filter_query: Optional[Dict[str, Any]] = None,
+         llm: Optional[BaseLLM] = None,
+         max_filtered_results: Optional[int] = None,
+     ):
+         """
+         Initialize the Elasticsearch search engine.
+
+         Args:
+             hosts: List of Elasticsearch hosts
+             index_name: Name of the index to search
+             username: Optional username for authentication
+             password: Optional password for authentication
+             api_key: Optional API key for authentication
+             cloud_id: Optional Elastic Cloud ID
+             max_results: Maximum number of search results
+             highlight_fields: Fields to highlight in search results
+             search_fields: Fields to search in
+             filter_query: Optional filter query in Elasticsearch DSL format
+             llm: Language model for relevance filtering
+             max_filtered_results: Maximum number of results to keep after filtering
+         """
+         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+         super().__init__(
+             llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+         )
+
+         self.index_name = index_name
+         self.highlight_fields = highlight_fields
+         self.search_fields = search_fields
+         self.filter_query = filter_query or {}
+
+         # Initialize the Elasticsearch client
+         es_args = {}
+
+         # Basic authentication
+         if username and password:
+             es_args["basic_auth"] = (username, password)
+
+         # API key authentication
+         if api_key:
+             es_args["api_key"] = api_key
+
+         # Cloud ID for Elastic Cloud
+         if cloud_id:
+             es_args["cloud_id"] = cloud_id
+
+         # Connect to Elasticsearch
+         self.client = Elasticsearch(hosts, **es_args)
+
+         # Verify connection
+         try:
+             info = self.client.info()
+             logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
+             logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
+         except Exception as e:
+             logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
+             raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
+
+     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+         """
+         Get preview information for Elasticsearch documents.
+
+         Args:
+             query: The search query
+
+         Returns:
+             List of preview dictionaries
+         """
+         logger.info(f"Getting document previews from Elasticsearch with query: {query}")
+
+         try:
+             # Build the search query
+             search_query = {
+                 "query": {
+                     "multi_match": {
+                         "query": query,
+                         "fields": self.search_fields,
+                         "type": "best_fields",
+                         "tie_breaker": 0.3,
+                     }
+                 },
+                 "highlight": {
+                     "fields": {field: {} for field in self.highlight_fields},
+                     "pre_tags": ["<em>"],
+                     "post_tags": ["</em>"],
+                 },
+                 "size": self.max_results,
+             }
+
+             # Add filter if provided
+             if self.filter_query:
+                 search_query["query"] = {
+                     "bool": {
+                         "must": search_query["query"],
+                         "filter": self.filter_query
+                     }
+                 }
+
+             # Execute the search
+             response = self.client.search(
+                 index=self.index_name,
+                 body=search_query,
+             )
+
+             # Process the search results
+             hits = response.get("hits", {}).get("hits", [])
+
+             # Format results as previews with basic information
+             previews = []
+             for hit in hits:
+                 source = hit.get("_source", {})
+                 highlight = hit.get("highlight", {})
+
+                 # Extract highlighted snippets or fall back to original content
+                 snippet = ""
+                 for field in self.highlight_fields:
+                     if field in highlight and highlight[field]:
+                         # Join all highlights for this field
+                         field_snippets = " ... ".join(highlight[field])
+                         snippet += field_snippets + " "
+
+                 # If no highlights, use a portion of the content
+                 if not snippet and "content" in source:
+                     content = source.get("content", "")
+                     snippet = content[:250] + "..." if len(content) > 250 else content
+
+                 # Create preview object
+                 preview = {
+                     "id": hit.get("_id", ""),
+                     "title": source.get("title", "Untitled Document"),
+                     "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                     "snippet": snippet.strip(),
+                     "score": hit.get("_score", 0),
+                     "_index": hit.get("_index", self.index_name),
+                 }
+
+                 previews.append(preview)
+
+             logger.info(f"Found {len(previews)} preview results from Elasticsearch")
+             return previews
+
+         except Exception as e:
+             logger.error(f"Error getting Elasticsearch previews: {str(e)}")
+             return []
+
+     def _get_full_content(
+         self, relevant_items: List[Dict[str, Any]]
+     ) -> List[Dict[str, Any]]:
+         """
+         Get full content for the relevant Elasticsearch documents.
+
+         Args:
+             relevant_items: List of relevant preview dictionaries
+
+         Returns:
+             List of result dictionaries with full content
+         """
+         # Check if we should get full content
+         if (
+             hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+             and search_config.SEARCH_SNIPPETS_ONLY
+         ):
+             logger.info("Snippet-only mode, skipping full content retrieval")
+             return relevant_items
+
+         logger.info("Getting full content for relevant Elasticsearch documents")
+
+         results = []
+         for item in relevant_items:
+             # Start with the preview data
+             result = item.copy()
+
+             # Get the document ID
+             doc_id = item.get("id")
+             if not doc_id:
+                 # Skip items without ID
+                 logger.warning(f"Skipping item without ID: {item}")
+                 results.append(result)
+                 continue
+
+             try:
+                 # Fetch the full document
+                 doc_response = self.client.get(
+                     index=self.index_name,
+                     id=doc_id,
+                 )
+
+                 # Get the source document
+                 source = doc_response.get("_source", {})
+
+                 # Add full content to the result
+                 result["content"] = source.get("content", result.get("snippet", ""))
+                 result["full_content"] = source.get("content", "")
+
+                 # Add metadata from source
+                 for key, value in source.items():
+                     if key not in result and key not in ["content"]:
+                         result[key] = value
+
+             except Exception as e:
+                 logger.error(f"Error fetching full content for document {doc_id}: {str(e)}")
+                 # Keep the preview data if we can't get the full content
+
+             results.append(result)
+
+         return results
+
+     def search_by_query_string(self, query_string: str) -> List[Dict[str, Any]]:
+         """
+         Perform a search using Elasticsearch Query String syntax.
+
+         Args:
+             query_string: The query in Elasticsearch Query String syntax
+
+         Returns:
+             List of search results
+         """
+         try:
+             # Build the search query
+             search_query = {
+                 "query": {
+                     "query_string": {
+                         "query": query_string,
+                         "fields": self.search_fields,
+                     }
+                 },
+                 "highlight": {
+                     "fields": {field: {} for field in self.highlight_fields},
+                     "pre_tags": ["<em>"],
+                     "post_tags": ["</em>"],
+                 },
+                 "size": self.max_results,
+             }
+
+             # Execute the search
+             response = self.client.search(
+                 index=self.index_name,
+                 body=search_query,
+             )
+
+             # Process and return the results
+             previews = self._process_es_response(response)
+             return self._get_full_content(previews)
+
+         except Exception as e:
+             logger.error(f"Error in query_string search: {str(e)}")
+             return []
+
+     def search_by_dsl(self, query_dsl: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """
+         Perform a search using Elasticsearch DSL (Query Domain Specific Language).
+
+         Args:
+             query_dsl: The query in Elasticsearch DSL format
+
+         Returns:
+             List of search results
+         """
+         try:
+             # Execute the search with the provided DSL
+             response = self.client.search(
+                 index=self.index_name,
+                 body=query_dsl,
+             )
+
+             # Process and return the results
+             previews = self._process_es_response(response)
+             return self._get_full_content(previews)
+
+         except Exception as e:
+             logger.error(f"Error in DSL search: {str(e)}")
+             return []
+
+     def _process_es_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """
+         Process Elasticsearch response into preview dictionaries.
+
+         Args:
+             response: Elasticsearch response dictionary
+
+         Returns:
+             List of preview dictionaries
+         """
+         hits = response.get("hits", {}).get("hits", [])
+
+         # Format results as previews
+         previews = []
+         for hit in hits:
+             source = hit.get("_source", {})
+             highlight = hit.get("highlight", {})
+
+             # Extract highlighted snippets or fall back to original content
+             snippet = ""
+             for field in self.highlight_fields:
+                 if field in highlight and highlight[field]:
+                     field_snippets = " ... ".join(highlight[field])
+                     snippet += field_snippets + " "
+
+             # If no highlights, use a portion of the content
+             if not snippet and "content" in source:
+                 content = source.get("content", "")
+                 snippet = content[:250] + "..." if len(content) > 250 else content
+
+             # Create preview object
+             preview = {
+                 "id": hit.get("_id", ""),
+                 "title": source.get("title", "Untitled Document"),
+                 "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                 "snippet": snippet.strip(),
+                 "score": hit.get("_score", 0),
+                 "_index": hit.get("_index", self.index_name),
+             }
+
+             previews.append(preview)
+
+         return previews
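
The new engine follows the package's two-phase pattern: `_get_previews()` runs a `multi_match` query and builds lightweight previews, and `_get_full_content()` fetches the full documents unless `SEARCH_SNIPPETS_ONLY` is set. A minimal usage sketch, assuming a local Elasticsearch instance and an index whose documents carry `title`, `content`, and optionally `url` fields (host, index name, and queries below are placeholders):

```python
from local_deep_research.web_search_engines.engines.search_engine_elasticsearch import (
    ElasticsearchSearchEngine,
)

# Placeholder connection details; pass an `llm` to enable relevance filtering.
engine = ElasticsearchSearchEngine(
    hosts=["http://localhost:9200"],
    index_name="documents",
    max_results=5,
)

# run() is inherited from BaseSearchEngine and drives the preview -> full-content flow.
for result in engine.run("vector search with FAISS"):
    print(result["title"], result["link"])

# Engine-specific entry points added in this file:
engine.search_by_query_string('title:"deep research" AND content:benchmark')
engine.search_by_dsl({"query": {"match": {"content": "retrieval"}}, "size": 3})
```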
local_deep_research/web_search_engines/engines/search_engine_google_pse.py
@@ -1,5 +1,4 @@
  import logging
- import os
  import random
  import time
  from typing import Any, Dict, List, Optional
@@ -88,17 +87,26 @@ class GooglePSESearchEngine(BaseSearchEngine):
  # Region/Country setting
  self.region = region
 
- # API key and Search Engine ID
- self.api_key = api_key or os.getenv("GOOGLE_PSE_API_KEY")
- self.search_engine_id = search_engine_id or os.getenv("GOOGLE_PSE_ENGINE_ID")
+ # API key and Search Engine ID - check params, env vars, or database
+ from ...utilities.db_utils import get_db_setting
+
+ self.api_key = api_key
+ if not self.api_key:
+ self.api_key = get_db_setting("search.engine.web.google_pse.api_key")
+
+ self.search_engine_id = search_engine_id
+ if not self.search_engine_id:
+ self.search_engine_id = get_db_setting(
+ "search.engine.web.google_pse.engine_id"
+ )
 
  if not self.api_key:
  raise ValueError(
- "Google API key is required. Set it in the GOOGLE_PSE_API_KEY environment variable."
+ "Google API key is required. Set it in the UI settings, use the api_key parameter, or set the GOOGLE_PSE_API_KEY environment variable."
  )
  if not self.search_engine_id:
  raise ValueError(
- "Google Search Engine ID is required. Set it in the GOOGLE_PSE_ENGINE_ID environment variable."
+ "Google Search Engine ID is required. Set it in the UI settings, use the search_engine_id parameter, or set the GOOGLE_PSE_ENGINE_ID environment variable."
  )
 
  # Validate connection and credentials
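
Credential lookup for Google PSE now checks the explicit constructor arguments first and then the settings database via `get_db_setting`; the updated error messages still point to the environment variables as a further option. A hedged sketch of the two call styles this enables (it assumes the credential parameters default to `None`, as the fallback logic implies; the key values are placeholders):

```python
from local_deep_research.web_search_engines.engines.search_engine_google_pse import (
    GooglePSESearchEngine,
)

# Explicit credentials always win.
engine = GooglePSESearchEngine(
    api_key="your-google-pse-api-key",     # placeholder
    search_engine_id="your-engine-id",     # placeholder
)

# Otherwise the engine reads search.engine.web.google_pse.api_key / .engine_id
# from the settings database (i.e. whatever was saved in the web UI) and raises
# ValueError if no source provides a value.
engine = GooglePSESearchEngine()
```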
local_deep_research/web_search_engines/engines/search_engine_local.py
@@ -1,6 +1,5 @@
  import hashlib
  import json
- import logging
  import os
  import time
  import uuid
@@ -29,16 +28,13 @@ from langchain_community.vectorstores import FAISS
  from langchain_core.documents import Document
  from langchain_core.language_models import BaseLLM
  from langchain_text_splitters import RecursiveCharacterTextSplitter
+ from loguru import logger
 
  from ...config import search_config
  from ...utilities.db_utils import get_db_setting
  from ...utilities.url_utils import normalize_url
  from ..search_engine_base import BaseSearchEngine
 
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
 
  def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
  """Get an appropriate document loader for a file based on its extension"""
@@ -62,8 +58,8 @@ def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
  # Try the text loader as a fallback for unknown extensions
  logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
  return TextLoader(str(file_path), encoding="utf-8")
- except Exception as e:
- logger.error(f"Error creating loader for {file_path}: {e}")
+ except Exception:
+ logger.exception(f"Error creating loader for {file_path}")
  return None
 
 
@@ -94,8 +90,8 @@ def _load_document(file_path: Path) -> List[Document]:
  doc.metadata["source"] = str(file_path)
  doc.metadata["filename"] = file_path.name
 
- except Exception as e:
- logger.error(f"Error loading {file_path}: {e}")
+ except Exception:
+ logger.exception(f"Error loading {file_path}")
  return []
 
  return docs
@@ -197,8 +193,8 @@ class LocalEmbeddingManager:
  model_name=self.embedding_model,
  model_kwargs={"device": self.embedding_device},
  )
- except Exception as e:
- logger.error(f"Error initializing embeddings: {e}")
+ except Exception:
+ logger.exception("Error initializing embeddings")
  logger.warning(
  "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2"
  )
@@ -226,8 +222,8 @@ class LocalEmbeddingManager:
  logger.info(f"Loaded index with {doc_count} document chunks")
 
  return vector_store
- except Exception as e:
- logger.error(f"Error loading vector store: {e}")
+ except Exception:
+ logger.exception("Error loading vector store")
  logger.info("Will create a new vector store")
 
  # Create a new vector store
@@ -241,8 +237,8 @@ class LocalEmbeddingManager:
  try:
  with open(index_metadata_path, "r") as f:
  return json.load(f)
- except Exception as e:
- logger.error(f"Error loading index metadata: {e}")
+ except Exception:
+ logger.exception("Error loading index metadata")
 
  return {}
 
@@ -253,8 +249,8 @@ class LocalEmbeddingManager:
  try:
  with open(index_metadata_path, "w") as f:
  json.dump(self.indexed_folders, f, indent=2)
- except Exception as e:
- logger.error(f"Error saving index metadata: {e}")
+ except Exception:
+ logger.exception("Error saving index metadata")
 
  @staticmethod
  def get_folder_hash(folder_path: Path) -> str:
@@ -397,8 +393,8 @@ class LocalEmbeddingManager:
  normalize_L2=True,
  )
  logger.info(f"Loaded index for {folder_path} from disk")
- except Exception as e:
- logger.error(f"Error loading index for {folder_path}: {e}")
+ except Exception:
+ logger.exception(f"Error loading index for {folder_path}")
  # If loading fails, force reindexing
  force_reindex = True
 
@@ -574,8 +570,8 @@ class LocalEmbeddingManager:
  allow_dangerous_deserialization=True,
  normalize_L2=True,
  )
- except Exception as e:
- logger.error(f"Error loading index for {folder_path}: {e}")
+ except Exception:
+ logger.exception(f"Error loading index for {folder_path}")
  continue
 
  # Search in this folder
@@ -599,8 +595,8 @@ class LocalEmbeddingManager:
  }
 
  all_results.append(result)
- except Exception as e:
- logger.error(f"Error searching in {folder_path}: {e}")
+ except Exception:
+ logger.exception(f"Error searching in {folder_path}")
 
  # Sort by similarity (highest first)
  all_results.sort(key=lambda x: x["similarity"], reverse=True)
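
The recurring edit in this file (and in the engines that follow) swaps manual `logger.error(f"...: {e}")` calls for loguru's `logger.exception(...)`, which records the message at error level and appends the active traceback automatically, so the handler no longer needs to bind the exception. A small standalone illustration of the pattern (the function and path are hypothetical):

```python
from loguru import logger


def load_index(path: str) -> None:
    raise FileNotFoundError(path)


try:
    load_index("/tmp/missing-index")
except Exception:
    # No need for `as e` or string interpolation: loguru attaches the traceback.
    logger.exception("Error loading index for /tmp/missing-index")
```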
local_deep_research/web_search_engines/engines/search_engine_local_all.py
@@ -2,19 +2,16 @@
  Search engine that searches across all local collections
  """
 
- import logging
  from typing import Any, Dict, List, Optional, cast
 
  from langchain_core.language_models import BaseLLM
+ from loguru import logger
 
  from ..search_engine_base import BaseSearchEngine
  from ..search_engine_factory import create_search_engine
  from ..search_engines_config import local_search_engines
  from .search_engine_local import LocalSearchEngine
 
- # Setup logging
- logger = logging.getLogger(__name__)
-
 
  class LocalAllSearchEngine(BaseSearchEngine):
  """
@@ -62,9 +59,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
  "name": engine.name,
  "description": engine.description,
  }
- except Exception as e:
- logger.error(
- f"Error creating search engine for collection '{collection_id}': {e}"
+ except Exception:
+ logger.exception(
+ f"Error creating search engine for collection '{collection_id}'"
  )
  except ImportError:
  logger.warning("No local collections configuration found")
@@ -97,8 +94,8 @@ class LocalAllSearchEngine(BaseSearchEngine):
  preview["collection_description"] = engine_info["description"]
 
  all_previews.extend(previews)
- except Exception as e:
- logger.error(f"Error searching collection '{collection_id}': {e}")
+ except Exception:
+ logger.exception(f"Error searching collection '{collection_id}'")
 
  if not all_previews:
  logger.info(f"No local documents found for query: {query}")
@@ -139,9 +136,9 @@ class LocalAllSearchEngine(BaseSearchEngine):
  try:
  results = engine._get_full_content(items)
  all_results.extend(results)
- except Exception as e:
- logger.error(
- f"Error getting full content from collection '{collection_id}': {e}"
+ except Exception:
+ logger.exception(
+ f"Error getting full content from collection '{collection_id}'"
  )
  # Fall back to returning the items without full content
  all_results.extend(items)
local_deep_research/web_search_engines/engines/search_engine_searxng.py
@@ -1,20 +1,16 @@
  import enum
- import logging
  import os
  import time
  from typing import Any, Dict, List, Optional
 
  import requests
  from langchain_core.language_models import BaseLLM
+ from loguru import logger
 
  from ...config import search_config
  from ..search_engine_base import BaseSearchEngine
  from .full_search import FullSearchResults
 
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
 
  @enum.unique
  class SafeSearchSetting(enum.IntEnum):
@@ -70,9 +66,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
  llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
  )
 
- self.instance_url = instance_url
  # Validate and normalize the instance URL if provided
- self.instance_url = self.instance_url.rstrip("/")
+ self.instance_url = instance_url.rstrip("/")
  logger.info(f"SearXNG initialized with instance URL: {self.instance_url}")
  try:
  # Make sure it's accessible.
@@ -182,8 +177,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
  self.instance_url, headers=initial_headers, timeout=10
  )
  cookies = initial_response.cookies
- except Exception as e:
- logger.warning(f"Failed to get initial cookies: {e}")
+ except Exception:
+ logger.exception("Failed to get initial cookies")
  cookies = None
 
  params = {
@@ -311,15 +306,15 @@ class SearXNGSearchEngine(BaseSearchEngine):
  except ImportError:
  logger.error("BeautifulSoup not available for HTML parsing")
  return []
- except Exception as e:
- logger.error(f"Error parsing HTML results: {str(e)}")
+ except Exception:
+ logger.exception("Error parsing HTML results")
  return []
  else:
  logger.error(f"SearXNG returned status code {response.status_code}")
  return []
 
- except Exception as e:
- logger.error(f"Error getting SearXNG results: {e}")
+ except Exception:
+ logger.exception("Error getting SearXNG results")
  return []
 
  def _get_previews(self, query: str) -> List[Dict[str, Any]]:
@@ -391,8 +386,8 @@ class SearXNGSearchEngine(BaseSearchEngine):
  results_with_content = self.full_search._get_full_content(relevant_items)
  return results_with_content
 
- except Exception as e:
- logger.error(f"Error retrieving full content: {e}")
+ except Exception:
+ logger.exception("Error retrieving full content")
  return relevant_items
 
  def invoke(self, query: str) -> List[Dict[str, Any]]:
@@ -511,7 +506,7 @@ https://searxng.github.io/searxng/admin/installation.html
  results = super().run(query)
  logger.info(f"SearXNG search completed with {len(results)} results")
  return results
- except Exception as e:
- logger.error(f"Error in SearXNG run method: {str(e)}")
+ except Exception:
+ logger.exception("Error in SearXNG run method")
  # Return empty results on error
  return []
local_deep_research/web_search_engines/engines/search_engine_serpapi.py
@@ -1,5 +1,4 @@
  import logging
- import os
  from typing import Any, Dict, List, Optional
 
  from langchain_community.utilities import SerpAPIWrapper
@@ -64,11 +63,16 @@ class SerpAPISearchEngine(BaseSearchEngine):
  "russian": "ru",
  }
 
- # Get API key
- serpapi_api_key = api_key or os.getenv("SERP_API_KEY")
+ # Get API key - check params, env vars, or database
+ from ...utilities.db_utils import get_db_setting
+
+ serpapi_api_key = api_key
+ if not serpapi_api_key:
+ serpapi_api_key = get_db_setting("search.engine.web.serpapi.api_key")
+
  if not serpapi_api_key:
  raise ValueError(
- "SERP_API_KEY not found. Please provide api_key or set the SERP_API_KEY environment variable."
+ "SerpAPI key not found. Please provide api_key parameter, set the SERP_API_KEY environment variable, or set it in the UI settings."
  )
 
  # Get language code
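
SerpAPI gets the same resolution order introduced for Google PSE above: an explicit `api_key` argument wins, otherwise the key is read from the settings database. A brief sketch of inspecting the stored value directly via the `get_db_setting` helper (treating an unset key as falsy is an assumption based on the `if not serpapi_api_key` check in the hunk above):

```python
from local_deep_research.utilities.db_utils import get_db_setting

stored_key = get_db_setting("search.engine.web.serpapi.api_key")
if not stored_key:
    # SerpAPISearchEngine would raise ValueError in this situation.
    print("No SerpAPI key stored; pass api_key=... or set SERP_API_KEY.")
```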