local-deep-research 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (90)
  1. local_deep_research/__version__.py +1 -1
  2. local_deep_research/advanced_search_system/candidate_exploration/progressive_explorer.py +11 -1
  3. local_deep_research/advanced_search_system/questions/browsecomp_question.py +32 -6
  4. local_deep_research/advanced_search_system/strategies/focused_iteration_strategy.py +32 -8
  5. local_deep_research/advanced_search_system/strategies/source_based_strategy.py +2 -0
  6. local_deep_research/api/__init__.py +2 -0
  7. local_deep_research/api/research_functions.py +177 -3
  8. local_deep_research/benchmarks/graders.py +150 -5
  9. local_deep_research/benchmarks/models/__init__.py +19 -0
  10. local_deep_research/benchmarks/models/benchmark_models.py +283 -0
  11. local_deep_research/benchmarks/ui/__init__.py +1 -0
  12. local_deep_research/benchmarks/web_api/__init__.py +6 -0
  13. local_deep_research/benchmarks/web_api/benchmark_routes.py +862 -0
  14. local_deep_research/benchmarks/web_api/benchmark_service.py +920 -0
  15. local_deep_research/config/llm_config.py +106 -21
  16. local_deep_research/defaults/default_settings.json +447 -2
  17. local_deep_research/error_handling/report_generator.py +10 -0
  18. local_deep_research/llm/__init__.py +19 -0
  19. local_deep_research/llm/llm_registry.py +155 -0
  20. local_deep_research/metrics/db_models.py +3 -7
  21. local_deep_research/metrics/search_tracker.py +25 -11
  22. local_deep_research/search_system.py +12 -9
  23. local_deep_research/utilities/log_utils.py +23 -10
  24. local_deep_research/utilities/thread_context.py +99 -0
  25. local_deep_research/web/app_factory.py +32 -8
  26. local_deep_research/web/database/benchmark_schema.py +230 -0
  27. local_deep_research/web/database/convert_research_id_to_string.py +161 -0
  28. local_deep_research/web/database/models.py +55 -1
  29. local_deep_research/web/database/schema_upgrade.py +397 -2
  30. local_deep_research/web/database/uuid_migration.py +265 -0
  31. local_deep_research/web/routes/api_routes.py +62 -31
  32. local_deep_research/web/routes/history_routes.py +13 -6
  33. local_deep_research/web/routes/metrics_routes.py +264 -4
  34. local_deep_research/web/routes/research_routes.py +45 -18
  35. local_deep_research/web/routes/route_registry.py +352 -0
  36. local_deep_research/web/routes/settings_routes.py +382 -22
  37. local_deep_research/web/services/research_service.py +22 -29
  38. local_deep_research/web/services/settings_manager.py +53 -0
  39. local_deep_research/web/services/settings_service.py +2 -0
  40. local_deep_research/web/static/css/styles.css +8 -0
  41. local_deep_research/web/static/js/components/detail.js +7 -14
  42. local_deep_research/web/static/js/components/details.js +8 -10
  43. local_deep_research/web/static/js/components/fallback/ui.js +4 -4
  44. local_deep_research/web/static/js/components/history.js +6 -6
  45. local_deep_research/web/static/js/components/logpanel.js +14 -11
  46. local_deep_research/web/static/js/components/progress.js +51 -46
  47. local_deep_research/web/static/js/components/research.js +250 -89
  48. local_deep_research/web/static/js/components/results.js +5 -7
  49. local_deep_research/web/static/js/components/settings.js +32 -26
  50. local_deep_research/web/static/js/components/settings_sync.js +24 -23
  51. local_deep_research/web/static/js/config/urls.js +285 -0
  52. local_deep_research/web/static/js/main.js +8 -8
  53. local_deep_research/web/static/js/research_form.js +267 -12
  54. local_deep_research/web/static/js/services/api.js +18 -18
  55. local_deep_research/web/static/js/services/keyboard.js +8 -8
  56. local_deep_research/web/static/js/services/socket.js +53 -35
  57. local_deep_research/web/static/js/services/ui.js +1 -1
  58. local_deep_research/web/templates/base.html +4 -1
  59. local_deep_research/web/templates/components/custom_dropdown.html +5 -3
  60. local_deep_research/web/templates/components/mobile_nav.html +3 -3
  61. local_deep_research/web/templates/components/sidebar.html +9 -3
  62. local_deep_research/web/templates/pages/benchmark.html +2697 -0
  63. local_deep_research/web/templates/pages/benchmark_results.html +1274 -0
  64. local_deep_research/web/templates/pages/benchmark_simple.html +453 -0
  65. local_deep_research/web/templates/pages/cost_analytics.html +1 -1
  66. local_deep_research/web/templates/pages/metrics.html +212 -39
  67. local_deep_research/web/templates/pages/research.html +8 -6
  68. local_deep_research/web/templates/pages/star_reviews.html +1 -1
  69. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +14 -1
  70. local_deep_research/web_search_engines/engines/search_engine_brave.py +15 -1
  71. local_deep_research/web_search_engines/engines/search_engine_ddg.py +20 -1
  72. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +26 -2
  73. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +15 -1
  74. local_deep_research/web_search_engines/engines/search_engine_retriever.py +192 -0
  75. local_deep_research/web_search_engines/engines/search_engine_tavily.py +307 -0
  76. local_deep_research/web_search_engines/rate_limiting/__init__.py +14 -0
  77. local_deep_research/web_search_engines/rate_limiting/__main__.py +9 -0
  78. local_deep_research/web_search_engines/rate_limiting/cli.py +209 -0
  79. local_deep_research/web_search_engines/rate_limiting/exceptions.py +21 -0
  80. local_deep_research/web_search_engines/rate_limiting/tracker.py +506 -0
  81. local_deep_research/web_search_engines/retriever_registry.py +108 -0
  82. local_deep_research/web_search_engines/search_engine_base.py +161 -43
  83. local_deep_research/web_search_engines/search_engine_factory.py +14 -0
  84. local_deep_research/web_search_engines/search_engines_config.py +20 -0
  85. local_deep_research-0.6.1.dist-info/METADATA +374 -0
  86. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/RECORD +89 -64
  87. local_deep_research-0.5.9.dist-info/METADATA +0 -420
  88. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/WHEEL +0 -0
  89. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/entry_points.txt +0 -0
  90. {local_deep_research-0.5.9.dist-info → local_deep_research-0.6.1.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ local_deep_research/web_search_engines/engines/search_engine_retriever.py
@@ -0,0 +1,192 @@
+"""
+Search engine implementation that wraps any LangChain retriever.
+This allows using vector stores, databases, or any custom retriever as a search source in LDR.
+"""
+
+from typing import Any, Dict, List
+from langchain.schema import BaseRetriever, Document
+from loguru import logger
+
+from ..search_engine_base import BaseSearchEngine
+
+
+class RetrieverSearchEngine(BaseSearchEngine):
+    """
+    Search engine that uses any LangChain retriever.
+
+    This allows users to plug in any LangChain retriever (vector stores,
+    databases, custom implementations) and use it as a search engine in LDR.
+    """
+
+    def __init__(
+        self,
+        retriever: BaseRetriever,
+        max_results: int = 10,
+        name: str = None,
+        **kwargs,
+    ):
+        """
+        Initialize the retriever-based search engine.
+
+        Args:
+            retriever: Any LangChain BaseRetriever instance
+            max_results: Maximum number of results to return
+            name: Display name for this retriever (defaults to retriever class name)
+            **kwargs: Additional parameters passed to parent
+        """
+        super().__init__(max_results=max_results, **kwargs)
+        self.retriever = retriever
+        self.name = name if name is not None else retriever.__class__.__name__
+
+    def run(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Execute search using the LangChain retriever.
+
+        Args:
+            query: Search query
+
+        Returns:
+            List of search results in LDR format
+        """
+        try:
+            # Use the retriever to get relevant documents
+            docs = self.retriever.invoke(query)
+
+            # Convert LangChain documents to LDR search result format
+            results = []
+            for i, doc in enumerate(docs[: self.max_results]):
+                result = self._convert_document_to_result(doc, i)
+                results.append(result)
+
+            logger.info(
+                f"Retriever '{self.name}' returned {len(results)} results for query: {query}"
+            )
+            return results
+
+        except Exception:
+            logger.exception("Error in retriever search")
+            return []
+
+    def _convert_document_to_result(
+        self, doc: Document, index: int
+    ) -> Dict[str, Any]:
+        """
+        Convert a LangChain Document to LDR search result format.
+
+        Args:
+            doc: LangChain Document
+            index: Result index
+
+        Returns:
+            Search result in LDR format
+        """
+        # Extract metadata
+        metadata = doc.metadata or {}
+
+        # Build the result
+        result = {
+            # Required fields for LDR
+            "title": metadata.get("title", f"Document {index + 1}"),
+            "url": metadata.get(
+                "source",
+                metadata.get("url", f"retriever://{self.name}/doc_{index}"),
+            ),
+            "snippet": doc.page_content[:500] if doc.page_content else "",
+            # Optional fields
+            "full_content": doc.page_content,
+            "author": metadata.get("author", ""),
+            "date": metadata.get("date", ""),
+            # Include all metadata for flexibility
+            "metadata": metadata,
+            # Score if available
+            "score": metadata.get("score", 1.0),
+            # Source information
+            "source": self.name,
+            "retriever_type": self.retriever.__class__.__name__,
+        }
+
+        return result
+
+    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Get preview information from the retriever.
+
+        Args:
+            query: Search query
+
+        Returns:
+            List of preview dictionaries
+        """
+        try:
+            # Use the retriever to get relevant documents
+            docs = self.retriever.invoke(query)
+
+            # Convert to preview format
+            previews = []
+            for i, doc in enumerate(docs[: self.max_results]):
+                preview = self._convert_document_to_result(doc, i)
+                previews.append(preview)
+
+            logger.info(
+                f"Retriever '{self.name}' returned {len(previews)} previews for query: {query}"
+            )
+            return previews
+
+        except Exception:
+            logger.exception("Error getting previews from retriever")
+            return []
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        For retrievers, previews already contain full content.
+
+        Args:
+            relevant_items: List of relevant preview dictionaries
+
+        Returns:
+            Same list with full content (already included)
+        """
+        # For retrievers, the preview already contains the full content
+        # Just ensure the 'full_content' field is present
+        for item in relevant_items:
+            if "full_content" not in item and "snippet" in item:
+                item["full_content"] = item["snippet"]
+        return relevant_items
+
+    async def arun(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Async version of search using the retriever.
+
+        Args:
+            query: Search query
+
+        Returns:
+            List of search results in LDR format
+        """
+        try:
+            # Use async retriever if available
+            if hasattr(self.retriever, "aget_relevant_documents"):
+                docs = await self.retriever.aget_relevant_documents(query)
+            else:
+                # Fall back to sync version
+                logger.debug(
+                    f"Retriever '{self.name}' doesn't support async, using sync version"
+                )
+                return self.run(query)
+
+            # Convert documents to results
+            results = []
+            for i, doc in enumerate(docs[: self.max_results]):
+                result = self._convert_document_to_result(doc, i)
+                results.append(result)
+
+            logger.info(
+                f"Retriever '{self.name}' returned {len(results)} async results for query: {query}"
+            )
+            return results
+
+        except Exception:
+            logger.exception("Error in async retriever search")
+            return []
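RetrieverSearchEngine turns any LangChain retriever into an LDR search source by mapping each Document onto the result dictionary built in _convert_document_to_result. A minimal usage sketch, assuming a toy BaseRetriever subclass; the ToyRetriever class, its canned document, and the "team-notes" name are illustrative, not part of the package:

    from typing import List

    from langchain.schema import BaseRetriever, Document

    from local_deep_research.web_search_engines.engines.search_engine_retriever import (
        RetrieverSearchEngine,
    )


    class ToyRetriever(BaseRetriever):
        """Stands in for a vector store; returns one canned document."""

        def _get_relevant_documents(self, query: str) -> List[Document]:
            return [
                Document(
                    page_content=f"Internal notes mentioning {query}.",
                    metadata={"title": "Team notes", "source": "notes://1"},
                )
            ]


    engine = RetrieverSearchEngine(ToyRetriever(), max_results=5, name="team-notes")
    for hit in engine.run("quarterly roadmap"):
        print(hit["title"], hit["url"], hit["snippet"])

Because _get_full_content treats retriever previews as already complete, sources wired in this way skip the separate page-fetch phase that the web engines use.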
--- /dev/null
+++ local_deep_research/web_search_engines/engines/search_engine_tavily.py
@@ -0,0 +1,307 @@
+import logging
+import os
+from typing import Any, Dict, List, Optional
+
+import requests
+from langchain_core.language_models import BaseLLM
+
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
+from ..rate_limiting import RateLimitError
+
+logger = logging.getLogger(__name__)
+
+
+class TavilySearchEngine(BaseSearchEngine):
+    """Tavily search engine implementation with two-phase approach"""
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        region: str = "US",
+        time_period: str = "y",
+        safe_search: bool = True,
+        search_language: str = "English",
+        api_key: Optional[str] = None,
+        llm: Optional[BaseLLM] = None,
+        include_full_content: bool = True,
+        max_filtered_results: Optional[int] = None,
+        search_depth: str = "basic",
+        include_domains: Optional[List[str]] = None,
+        exclude_domains: Optional[List[str]] = None,
+        **kwargs,
+    ):
+        """
+        Initialize the Tavily search engine.
+
+        Args:
+            max_results: Maximum number of search results
+            region: Region code for search results (not used by Tavily currently)
+            time_period: Time period for search results (not used by Tavily currently)
+            safe_search: Whether to enable safe search (not used by Tavily currently)
+            search_language: Language for search results (not used by Tavily currently)
+            api_key: Tavily API key (can also be set in TAVILY_API_KEY env)
+            llm: Language model for relevance filtering
+            include_full_content: Whether to include full webpage content in results
+            max_filtered_results: Maximum number of results to keep after filtering
+            search_depth: "basic" or "advanced" - controls search quality vs speed
+            include_domains: List of domains to include in search
+            exclude_domains: List of domains to exclude from search
+            **kwargs: Additional parameters (ignored but accepted for compatibility)
+        """
+        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
+        super().__init__(
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
+        )
+        self.include_full_content = include_full_content
+        self.search_depth = search_depth
+        self.include_domains = include_domains or []
+        self.exclude_domains = exclude_domains or []
+
+        # Get API key - check params, database, or env vars
+        from ...utilities.db_utils import get_db_setting
+
+        tavily_api_key = api_key
+        if not tavily_api_key:
+            tavily_api_key = get_db_setting("search.engine.web.tavily.api_key")
+
+        if not tavily_api_key:
+            tavily_api_key = os.environ.get("TAVILY_API_KEY")
+
+        if not tavily_api_key:
+            raise ValueError(
+                "Tavily API key not found. Please provide api_key parameter, "
+                "set it in the UI settings, or set TAVILY_API_KEY environment variable."
+            )
+
+        self.api_key = tavily_api_key
+        self.base_url = "https://api.tavily.com"
+
+        # If full content is requested, initialize FullSearchResults
+        if include_full_content:
+            # Import FullSearchResults only if needed
+            try:
+                from .full_search import FullSearchResults
+
+                # Create a simple wrapper for Tavily API calls
+                class TavilyWrapper:
+                    def __init__(self, parent):
+                        self.parent = parent
+
+                    def run(self, query):
+                        return self.parent._get_previews(query)
+
+                self.full_search = FullSearchResults(
+                    llm=llm,
+                    web_search=TavilyWrapper(self),
+                    language=search_language,
+                    max_results=max_results,
+                    region=region,
+                    time=time_period,
+                    safesearch="moderate" if safe_search else "off",
+                )
+            except ImportError:
+                logger.warning(
+                    "Warning: FullSearchResults not available. Full content retrieval disabled."
+                )
+                self.include_full_content = False
+
+    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Get preview information from Tavily Search.
+
+        Args:
+            query: The search query
+
+        Returns:
+            List of preview dictionaries
+        """
+        logger.info("Getting search results from Tavily")
+
+        try:
+            # Prepare the request payload
+            payload = {
+                "api_key": self.api_key,
+                "query": query[:400],  # Limit query length
+                "search_depth": self.search_depth,
+                "max_results": min(20, self.max_results),  # Tavily has a max limit
+                "include_answer": False,  # We don't need the AI answer
+                "include_images": False,  # We don't need images
+                "include_raw_content": self.include_full_content,  # Get content if requested
+            }
+
+            # Add domain filters if specified
+            if self.include_domains:
+                payload["include_domains"] = self.include_domains
+            if self.exclude_domains:
+                payload["exclude_domains"] = self.exclude_domains
+
+            # Make the API request
+            response = requests.post(
+                f"{self.base_url}/search",
+                json=payload,
+                headers={"Content-Type": "application/json"},
+                timeout=30,
+            )
+
+            # Check for errors
+            if response.status_code == 429:
+                raise RateLimitError(
+                    f"Tavily rate limit hit: {response.status_code} - {response.text}"
+                )
+
+            response.raise_for_status()
+
+            # Parse the response
+            data = response.json()
+            results = data.get("results", [])
+
+            # Format results as previews
+            previews = []
+            for i, result in enumerate(results):
+                preview = {
+                    "id": result.get("url", str(i)),  # Use URL as ID
+                    "title": result.get("title", ""),
+                    "link": result.get("url", ""),
+                    "snippet": result.get("content", ""),  # Tavily calls it "content"
+                    "displayed_link": result.get("url", ""),
+                    "position": i,
+                }
+
+                # Store full Tavily result for later
+                preview["_full_result"] = result
+
+                previews.append(preview)
+
+            # Store the previews for potential full content retrieval
+            self._search_results = previews
+
+            return previews
+
+        except RateLimitError:
+            raise  # Re-raise rate limit errors
+        except requests.exceptions.RequestException as e:
+            error_msg = str(e)
+            logger.exception("Error getting Tavily results")
+
+            # Check for rate limit patterns in error message
+            if any(
+                pattern in error_msg.lower()
+                for pattern in [
+                    "429",
+                    "rate limit",
+                    "quota",
+                    "too many requests",
+                ]
+            ):
+                raise RateLimitError(f"Tavily rate limit hit: {error_msg}")
+
+            return []
+        except Exception:
+            logger.exception("Unexpected error getting Tavily results")
+            return []
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """
+        Get full content for the relevant search results.
+        If include_full_content is True and raw content was retrieved,
+        includes it in the results.
+
+        Args:
+            relevant_items: List of relevant preview dictionaries
+
+        Returns:
+            List of result dictionaries with full content if available
+        """
+        # Check if we should get full content
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
+            logger.info("Snippet-only mode, skipping full content retrieval")
+
+            # Return the relevant items with their full Tavily information
+            results = []
+            for item in relevant_items:
+                # Use the full result if available, otherwise use the preview
+                if "_full_result" in item:
+                    result = item["_full_result"]
+                    # Remove temporary field
+                    if "_full_result" in result:
+                        del result["_full_result"]
+                else:
+                    result = item
+
+                results.append(result)
+
+            return results
+
+        # If full content retrieval is enabled
+        if self.include_full_content and hasattr(self, "full_search"):
+            logger.info("Retrieving full webpage content")
+
+            try:
+                # Use FullSearchResults to get full content
+                results_with_content = self.full_search._get_full_content(
+                    relevant_items
+                )
+
+                return results_with_content
+
+            except Exception:
+                logger.exception("Error retrieving full content")
+                # Fall back to returning the items without full content
+
+        # Return items with their full Tavily information
+        results = []
+        for item in relevant_items:
+            # Use the full result if available, otherwise use the preview
+            if "_full_result" in item:
+                result = item["_full_result"].copy()
+
+                # If Tavily provided raw_content, include it
+                if "raw_content" in result and self.include_full_content:
+                    result["content"] = result.get(
+                        "raw_content", result.get("content", "")
+                    )
+
+                # Remove temporary field
+                if "_full_result" in result:
+                    del result["_full_result"]
+            else:
+                result = item.copy()
+                if "_full_result" in result:
+                    del result["_full_result"]
+
+            results.append(result)
+
+        return results
+
+    def run(self, query: str) -> List[Dict[str, Any]]:
+        """
+        Execute a search using Tavily with the two-phase approach.
+
+        Args:
+            query: The search query
+
+        Returns:
+            List of search results
+        """
+        logger.info("---Execute a search using Tavily---")
+
+        # Use the implementation from the parent class which handles all phases
+        results = super().run(query)
+
+        # Clean up
+        if hasattr(self, "_search_results"):
+            del self._search_results
+
+        return results
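TavilySearchEngine resolves its key from the api_key argument, the settings database, or the TAVILY_API_KEY environment variable, and surfaces HTTP 429 as RateLimitError instead of silently returning nothing. A hedged construction sketch; the printed result keys follow the preview format above and should be treated as an assumption, since the base class may reshape results during filtering:

    import os

    from local_deep_research.web_search_engines.engines.search_engine_tavily import (
        TavilySearchEngine,
    )
    from local_deep_research.web_search_engines.rate_limiting import RateLimitError

    engine = TavilySearchEngine(
        api_key=os.environ["TAVILY_API_KEY"],
        max_results=5,
        search_depth="basic",
        include_domains=["arxiv.org"],  # optional domain filter
        include_full_content=False,  # previews only; skip page fetching
    )

    try:
        for hit in engine.run("adaptive rate limiting"):
            print(hit.get("title"), hit.get("url") or hit.get("link"))
    except RateLimitError:
        print("Tavily throttled the request; back off and retry later.")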
--- /dev/null
+++ local_deep_research/web_search_engines/rate_limiting/__init__.py
@@ -0,0 +1,14 @@
+"""
+Adaptive rate limiting module for search engines.
+"""
+
+from .exceptions import RateLimitError, AdaptiveRetryError, RateLimitConfigError
+from .tracker import AdaptiveRateLimitTracker, get_tracker
+
+__all__ = [
+    "RateLimitError",
+    "AdaptiveRetryError",
+    "RateLimitConfigError",
+    "AdaptiveRateLimitTracker",
+    "get_tracker",
+]
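For callers, the useful contract of this package is its exception types. A small sketch of consuming them; note that the meaning of AdaptiveRetryError is an assumption here, since exceptions.py and tracker.py are not shown in this excerpt:

    from local_deep_research.web_search_engines.rate_limiting import (
        AdaptiveRetryError,
        RateLimitError,
    )


    def search_or_empty(engine, query: str) -> list:
        """Run a search, treating throttling as an empty result set."""
        try:
            return engine.run(query)
        except RateLimitError:
            # A single request was throttled upstream (e.g. HTTP 429).
            return []
        except AdaptiveRetryError:
            # Assumption: raised once the adaptive retry budget is exhausted.
            return []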
--- /dev/null
+++ local_deep_research/web_search_engines/rate_limiting/__main__.py
@@ -0,0 +1,9 @@
+"""
+Entry point for rate limiting CLI module.
+Allows running: python -m local_deep_research.web_search_engines.rate_limiting
+"""
+
+from .cli import main
+
+if __name__ == "__main__":
+    main()