PyPI - local-deep-research - Versions diffs - 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl - Mend

local-deep-research 0.1.26py3-none-any.whl → 0.2.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (140) hide show

local_deep_research/web_search_engines/search_engine_base.py CHANGED Viewed

@@ -1,123 +1,158 @@
+import json
+import logging
 from abc import ABC, abstractmethod
-from typing import Dict, List, Any, Optional
-from langchain_core.language_models import BaseLLM
 from datetime import datetime
-import json
-from local_deep_research.utilties.search_utilities import remove_think_tags
+from typing import Any, Dict, List, Optional
+from langchain_core.language_models import BaseLLM
+from ..config import search_config
-import logging
 logger = logging.getLogger(__name__)
 class BaseSearchEngine(ABC):
     """
     Abstract base class for search engines with two-phase retrieval capability.
     Handles common parameters and implements the two-phase search approach.
     """
-    def __init__(self,
-                llm: Optional[BaseLLM] = None,
-                max_filtered_results: Optional[int] = None,
-                max_results: Optional[int] = 10,  # Default value if not provided
-                **kwargs):
+    def __init__(
+        self,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        max_results: Optional[int] = 10,  # Default value if not provided
+        **kwargs,
+    ):
         """
         Initialize the search engine with common parameters.
         Args:
             llm: Optional language model for relevance filtering
             max_filtered_results: Maximum number of results to keep after filtering
             max_results: Maximum number of search results to return
             **kwargs: Additional engine-specific parameters
         """
-        if max_filtered_results == None: max_filtered_results = 5
-        self.llm = llm  # LLM for relevance filtering
-        self.max_filtered_results = max_filtered_results  # Limit filtered results
-        # Ensure max_results is never None and is a positive integer
+        if max_filtered_results is None:
+            max_filtered_results = 5
         if max_results is None:
-            self.max_results = 25  # Default if None
-        else:
-            self.max_results = max(1, int(max_results))
+            max_results = 10
+        self.llm = llm  # LLM for relevance filtering
+        self._max_filtered_results = int(max_filtered_results)  # Ensure it's an integer
+        self._max_results = max(1, int(max_results))  # Ensure it's a positive integer
+    @property
+    def max_filtered_results(self) -> int:
+        """Get the maximum number of filtered results."""
+        return self._max_filtered_results
+    @max_filtered_results.setter
+    def max_filtered_results(self, value: int) -> None:
+        """Set the maximum number of filtered results."""
+        if value is None:
+            value = 5
+            logger.warning("Setting max_filtered_results to 5")
+        self._max_filtered_results = int(value)
+    @property
+    def max_results(self) -> int:
+        """Get the maximum number of search results."""
+        return self._max_results
+    @max_results.setter
+    def max_results(self, value: int) -> None:
+        """Set the maximum number of search results."""
+        if value is None:
+            value = 10
+        self._max_results = max(1, int(value))
     def run(self, query: str) -> List[Dict[str, Any]]:
         """
         Run the search engine with a given query, retrieving and filtering results.
-        This implements a two-phase retrieval approach:
+        This implements a two-phase retrieval approach:
         1. Get preview information for many results
         2. Filter the previews for relevance
         3. Get full content for only the relevant results
         Args:
             query: The search query
         Returns:
             List of search results with full content (if available)
         """
         # Ensure we're measuring time correctly for citation tracking
         # Step 1: Get preview information for items
         previews = self._get_previews(query)
         if not previews:
-            logger.info(f"Search engine {self.__class__.__name__} returned no preview results for query: {query}")
+            logger.info(
+                f"Search engine {self.__class__.__name__} returned no preview results for query: {query}"
+            )
             return []
         # Step 2: Filter previews for relevance with LLM
         filtered_items = self._filter_for_relevance(previews, query)
         if not filtered_items:
-            logger.info(f"All preview results were filtered out as irrelevant for query: {query}")
+            logger.info(
+                f"All preview results were filtered out as irrelevant for query: {query}"
+            )
             # Do not fall back to previews, return empty list instead
             return []
         # Step 3: Get full content for filtered items
         # Import config inside the method to avoid circular import
-        from local_deep_research import config
-        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Returning snippet-only results as per config")
             results = filtered_items
         else:
             results = self._get_full_content(filtered_items)
         return results
     def invoke(self, query: str) -> List[Dict[str, Any]]:
         """Compatibility method for LangChain tools"""
         return self.run(query)
-    def _filter_for_relevance(self, previews: List[Dict[str, Any]], query: str) -> List[Dict[str, Any]]:
+    def _filter_for_relevance(
+        self, previews: List[Dict[str, Any]], query: str
+    ) -> List[Dict[str, Any]]:
         """
-        Filter search results for relevance to the query using an LLM.
-        Checks config.SKIP_RELEVANCE_FILTER to determine whether to perform filtering.
+        Filter search results by relevance to the query using the LLM.
         Args:
-            previews: List of search result dictionaries with preview information
+            previews: List of preview dictionaries
             query: The original search query
         Returns:
-            Filtered list of the most relevant search results
+            Filtered list of preview dictionaries
         """
-        # Import config inside the method to avoid circular import
-        from local_deep_research import config
-        # Skip filtering if configured to do so or if no LLM is available
-        if hasattr(config, 'SKIP_RELEVANCE_FILTER') and config.SKIP_RELEVANCE_FILTER:
-            # Return all previews up to max_filtered_results if no filtering is performed
-            limit = self.max_filtered_results or 5
-            return previews[:limit]
-        # Default implementation uses LLM if available
-        if not self.llm or not previews:
-            # If no LLM available, return all previews as relevant
-            if self.max_filtered_results and len(previews) > self.max_filtered_results:
-                return previews[:self.max_filtered_results]
+        # If no LLM or too few previews, return all
+        if not self.llm or len(previews) <= 1:
             return previews
-        now = datetime.now()
-        current_time = now.strftime("%Y-%m-%d")
+        # Create a simple context for LLM
+        preview_context = []
+        for i, preview in enumerate(previews):
+            title = preview.get("title", "Untitled").strip()
+            snippet = preview.get("snippet", "").strip()
+            # Clean up snippet if too long
+            if len(snippet) > 300:
+                snippet = snippet[:300] + "..."
+            preview_context.append(f"[{i}] Title: {title}\nSnippet: {snippet}")
+        # Set a reasonable limit on context length
+        current_date = datetime.now().strftime("%Y-%m-%d")
         prompt = f"""Analyze these search results and provide a ranked list of the most relevant ones.
 IMPORTANT: Evaluate and rank based on these criteria (in order of importance):
-1. Timeliness - current/recent information as of {current_time}
+1. Timeliness - current/recent information as of {current_date}
 2. Direct relevance to query: "{query}"
 3. Source reliability (prefer official sources, established websites)
 4. Factual accuracy (cross-reference major claims)
@@ -130,67 +165,107 @@ Include ONLY indices that meet ALL criteria, with the most relevant first.
 Example response: [4, 0, 2]
 Respond with ONLY the JSON array, no other text."""
         try:
             # Get LLM's evaluation
             response = self.llm.invoke(prompt)
-            # Extract JSON array from response
-            response_text = remove_think_tags(response.content)
-            # Clean up response to handle potential formatting issues
+            # Log the raw response for debugging
+            logger.info(f"Raw LLM response for relevance filtering: {response}")
+            # Handle different response formats
+            response_text = ""
+            if hasattr(response, "content"):
+                response_text = response.content
+            else:
+                response_text = str(response)
+            # Clean up response
             response_text = response_text.strip()
-            # Find the first occurrence of '[' and the last occurrence of ']'
-            start_idx = response_text.find('[')
-            end_idx = response_text.rfind(']')
+            logger.debug(f"Cleaned response text: {response_text}")
+            # Find JSON array in response
+            start_idx = response_text.find("[")
+            end_idx = response_text.rfind("]")
             if start_idx >= 0 and end_idx > start_idx:
-                array_text = response_text[start_idx:end_idx+1]
-                ranked_indices = json.loads(array_text)
-                # Return the results in ranked order
-                ranked_results = []
-                for idx in ranked_indices:
-                    if idx < len(previews):
-                        ranked_results.append(previews[idx])
-                # Limit to max_filtered_results if specified
-                if self.max_filtered_results and len(ranked_results) > self.max_filtered_results:
-                    logger.info(f"Limiting filtered results to top {self.max_filtered_results}")
-                    return ranked_results[:self.max_filtered_results]
-                return ranked_results
+                array_text = response_text[start_idx : end_idx + 1]
+                try:
+                    ranked_indices = json.loads(array_text)
+                    # Validate that ranked_indices is a list of integers
+                    if not isinstance(ranked_indices, list):
+                        logger.warning(
+                            "LLM response is not a list, returning empty results"
+                        )
+                        return []
+                    if not all(isinstance(idx, int) for idx in ranked_indices):
+                        logger.warning(
+                            "LLM response contains non-integer indices, returning empty results"
+                        )
+                        return []
+                    # Return the results in ranked order
+                    ranked_results = []
+                    for idx in ranked_indices:
+                        if idx < len(previews):
+                            ranked_results.append(previews[idx])
+                        else:
+                            logger.warning(f"Index {idx} out of range, skipping")
+                    # Limit to max_filtered_results if specified
+                    if (
+                        self.max_filtered_results
+                        and len(ranked_results) > self.max_filtered_results
+                    ):
+                        logger.info(
+                            f"Limiting filtered results to top {self.max_filtered_results}"
+                        )
+                        return ranked_results[: self.max_filtered_results]
+                    return ranked_results
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Failed to parse JSON from LLM response: {e}")
+                    logger.debug(f"Problematic JSON text: {array_text}")
+                    return []
             else:
-                logger.info("Could not find JSON array in response, returning no previews")
-                return []
+                logger.warning(
+                    "Could not find JSON array in response, returning original previews"
+                )
+                logger.debug(f"Response text without JSON array: {response_text}")
+                return previews[: min(5, len(previews))]
         except Exception as e:
-            logger.info(f"Relevance filtering error: {e}")
-            # Fall back to returning all previews (or top N) on error
-            return[]
+            logger.error(f"Relevance filtering error: {e}", exc_info=True)
+            # Fall back to returning top results on error
+            return previews[: min(5, len(previews))]
     @abstractmethod
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information (titles, summaries) for initial search results.
         Args:
             query: The search query
         Returns:
             List of preview dictionaries with at least 'id', 'title', and 'snippet' keys
         """
         pass
     @abstractmethod
-    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant items.
         Args:
             relevant_items: List of relevant preview dictionaries
         Returns:
             List of result dictionaries with full content
         """
-        pass
+        pass

local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl

local-deep-research 0.1.26py3-none-any.whl → 0.2.2py3-none-any.whl