PyPI - local-deep-research - Versions diffs - 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (140) hide show

local_deep_research/web_search_engines/engines/full_search.py CHANGED Viewed

@@ -1,13 +1,17 @@
+import json
+import logging
+import os
+from datetime import datetime
+from typing import Dict, List
 import justext
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_community.document_transformers import BeautifulSoupTransformer
 from langchain_core.language_models import BaseLLM
-from typing import List, Dict
-import json, os
-from .utilties.search_utilities import remove_think_tags
-from datetime import datetime
-from local_deep_research import config
-import logging
+from ...config.search_config import QUALITY_CHECK_DDG_URLS
+from ...utilities.search_utilities import remove_think_tags
 logger = logging.getLogger(__name__)
@@ -15,14 +19,13 @@ class FullSearchResults:
     def __init__(
         self,
         llm: BaseLLM,  # Add LLM parameter
-        web_search: list,
+        web_search: list,
         output_format: str = "list",
         language: str = "English",
         max_results: int = 10,
         region: str = "wt-wt",
         time: str = "y",
-        safesearch: str = "Moderate"
+        safesearch: str = "Moderate",
     ):
         self.llm = llm
         self.output_format = output_format
@@ -31,10 +34,9 @@ class FullSearchResults:
         self.region = region
         self.time = time
         self.safesearch = safesearch
-        self.web_search =web_search
+        self.web_search = web_search
         os.environ["USER_AGENT"] = "Local Deep Research/1.0"
         self.bs_transformer = BeautifulSoupTransformer()
         self.tags_to_extract = ["p", "div", "span"]
@@ -54,7 +56,7 @@ class FullSearchResults:
             {results}
             Return a JSON array of indices (0-based) for sources that meet ALL criteria.
-            ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
+            ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
             Example response: \n[0, 2, 4]\n\n"""
         try:
@@ -66,7 +68,7 @@ class FullSearchResults:
             return [r for i, r in enumerate(results) if i in good_indices]
         except Exception as e:
             logger.error(f"URL filtering error: {e}")
-            return []
+            return []
     def remove_boilerplate(self, html: str) -> str:
         if not html or not html.strip():
@@ -77,13 +79,13 @@ class FullSearchResults:
     def run(self, query: str):
         nr_full_text = 0
-        # Step 1: Get search results
+        # Step 1: Get search results
         search_results = self.web_search.invoke(query)
         if not isinstance(search_results, list):
             raise ValueError("Expected the search results in list format.")
         # Step 2: Filter URLs using LLM
-        if config.QUALITY_CHECK_DDG_URLS:
+        if QUALITY_CHECK_DDG_URLS:
             filtered_results = self.check_urls(search_results, query)
         else:
             filtered_results = search_results
@@ -126,4 +128,4 @@ class FullSearchResults:
         return self.run(query)
     def __call__(self, query: str):
-        return self.invoke(query)
+        return self.invoke(query)

local_deep_research/web_search_engines/engines/meta_search_engine.py CHANGED Viewed

@@ -1,12 +1,13 @@
 import logging
 import os
-from typing import Dict, List, Any, Optional
+from typing import Any, Dict, List, Optional
-from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
-from local_deep_research.web_search_engines.search_engines_config import SEARCH_ENGINES
-from local_deep_research.web_search_engines.search_engine_factory import create_search_engine
-from local_deep_research.web_search_engines.engines.search_engine_wikipedia import WikipediaSearchEngine
-from local_deep_research import config
+from ...config import search_config
+from ...web.services.socket_service import emit_socket_event
+from ..search_engine_base import BaseSearchEngine
+from ..search_engine_factory import create_search_engine
+from ..search_engines_config import SEARCH_ENGINES
+from .search_engine_wikipedia import WikipediaSearchEngine
 # Setup logging
 logging.basicConfig(level=logging.INFO)
@@ -18,17 +19,20 @@ class MetaSearchEngine(BaseSearchEngine):
     LLM-powered meta search engine that intelligently selects and uses
     the appropriate search engines based on query analysis
     """
-    def __init__(self,
-                 llm,
-                 max_results: int = 10,
-                 use_api_key_services: bool = True,
-                 max_engines_to_try: int = 3,
-                 max_filtered_results: Optional[int] = None,
-                 **kwargs):
+    def __init__(
+        self,
+        llm,
+        max_results: int = 10,
+        use_api_key_services: bool = True,
+        max_engines_to_try: int = 3,
+        max_filtered_results: Optional[int] = None,
+        engine_selection_callback=None,
+        **kwargs,
+    ):
         """
         Initialize the meta search engine.
         Args:
             llm: Language model instance for query classification and relevance filtering
             max_results: Maximum number of search results to return
@@ -37,247 +41,294 @@ class MetaSearchEngine(BaseSearchEngine):
             max_filtered_results: Maximum number of results to keep after filtering
             **kwargs: Additional parameters (ignored but accepted for compatibility)
         """
-        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-        super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-        self.max_results = max_results
+        # Initialize the BaseSearchEngine with the LLM, max_filtered_results, and max_results
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
         self.use_api_key_services = use_api_key_services
         self.max_engines_to_try = max_engines_to_try
         # Cache for engine instances
         self.engine_cache = {}
         # Get available engines (excluding 'meta' and 'auto')
         self.available_engines = self._get_available_engines()
-        logger.info(f"Meta Search Engine initialized with {len(self.available_engines)} available engines: {', '.join(self.available_engines)}")
+        logger.info(
+            f"Meta Search Engine initialized with {len(self.available_engines)} available engines: {', '.join(self.available_engines)}"
+        )
         # Create a fallback engine in case everything else fails
         self.fallback_engine = WikipediaSearchEngine(
-            max_results=max_results,
+            max_results=self.max_results,
             llm=llm,
-            max_filtered_results=max_filtered_results
+            max_filtered_results=max_filtered_results,
         )
     def _get_available_engines(self) -> List[str]:
         """Get list of available engines, excluding 'meta' and 'auto'"""
         # Filter out 'meta' and 'auto' and check API key availability
         available = []
-        for name, config in SEARCH_ENGINES.items():
+        for name, config_ in SEARCH_ENGINES.items():
             if name in ["meta", "auto"]:
                 continue
-            if config.get("requires_api_key", False) and not self.use_api_key_services:
+            if config_.get("requires_api_key", False) and not self.use_api_key_services:
                 continue
-            if config.get("requires_api_key", False):
-                api_key_env = config.get("api_key_env")
+            if config_.get("requires_api_key", False):
+                api_key_env = config_.get("api_key_env")
                 api_key = os.getenv(api_key_env) if api_key_env else None
                 if not api_key:
                     continue
             available.append(name)
         # Make sure we have at least one engine available
         if not available and "wikipedia" in SEARCH_ENGINES:
             available.append("wikipedia")
         return available
     def analyze_query(self, query: str) -> List[str]:
         """
-        Use the LLM to analyze the query and return a ranked list of
-        recommended search engines to try
+        Analyze the query to determine the best search engines to use.
+        Args:
+            query: The search query
+        Returns:
+            List of search engine names sorted by suitability
         """
-        if not self.available_engines:
-            logger.warning("No search engines available")
-            return []
-        engine_descriptions = []
-        for name in self.available_engines:
-            logger.info(f"Processing search engine: {name}")
-            try:
-                description = f"- {name.upper()}: Good for {', '.join(SEARCH_ENGINES[name]['strengths'][:3])}. " \
-                            f"Weaknesses: {', '.join(SEARCH_ENGINES[name]['weaknesses'][:2])}. " \
-                            f"Reliability: {SEARCH_ENGINES[name]['reliability']*100:.0f}%"
-                engine_descriptions.append(description)
-            except KeyError as e:
-                logger.error(f"Missing key for engine {name}: {e}")
-                # Add a basic description for engines with missing configuration
-                engine_descriptions.append(f"- {name.upper()}: General purpose search engine.")
-            except Exception as e:
-                logger.error(f"Error processing engine {name}: {e}")
-                engine_descriptions.append(f"- {name.upper()}: General purpose search engine.")
+        try:
+            # Check if the LLM is available to help select engines
+            if not self.llm:
+                logger.warning(
+                    "No LLM available for query analysis, using default engines"
+                )
+                # Return engines sorted by reliability
+                return sorted(
+                    self.available_engines,
+                    key=lambda x: SEARCH_ENGINES.get(x, {}).get("reliability", 0),
+                    reverse=True,
+                )
-        engine_descriptions = "\n".join(engine_descriptions)
-        prompt = f"""Analyze this search query and rank the available search engines in order of most to least appropriate for answering it.
-Query: "{query}"
+            # Create a prompt that outlines the available search engines and their strengths
+            engines_info = []
+            for engine_name in self.available_engines:
+                try:
+                    if engine_name in SEARCH_ENGINES:
+                        strengths = SEARCH_ENGINES[engine_name].get(
+                            "strengths", "General search"
+                        )
+                        weaknesses = SEARCH_ENGINES[engine_name].get(
+                            "weaknesses", "None specified"
+                        )
+                        description = SEARCH_ENGINES[engine_name].get(
+                            "description", engine_name
+                        )
+                        engines_info.append(
+                            f"- {engine_name}: {description}\n  Strengths: {strengths}\n  Weaknesses: {weaknesses}"
+                        )
+                except KeyError as e:
+                    logger.error(f"Missing key for engine {engine_name}: {str(e)}")
-Available search engines:
-{engine_descriptions}
+            prompt = f"""You are a search query analyst. Consider this search query:
-Consider:
-1. The nature of the query (factual, academic, product-related, news, etc.)
-2. The strengths and weaknesses of each engine
-3. The reliability of each engine
+QUERY: {query}
-Return ONLY a comma-separated list of search engine names in your recommended order. Example: "wikipedia,arxiv,duckduckgo"
-Do not include any engines that are not listed above. Only return the comma-separated list, nothing else."""
+I have these search engines available:
+{chr(10).join(engines_info)}
-        # Get response from LLM
-        try:
+Determine which search engines would be most appropriate for answering this query.
+First analyze the nature of the query (factual, scientific, code-related, etc.)
+Then select the 1-3 most appropriate search engines for this type of query.
+Output ONLY a comma-separated list of the search engine names in order of most appropriate to least appropriate.
+Example output: wikipedia,arxiv,github"""
+            # Get analysis from LLM
             response = self.llm.invoke(prompt)
-            content = response.content.strip()
-            # Parse the response into a list of engine names
-            engine_names = [name.strip().lower() for name in content.split(',')]
-            # Filter out any invalid engine names
-            valid_engines = [name for name in engine_names if name in self.available_engines]
+            # Handle different response formats
+            if hasattr(response, "content"):
+                content = response.content.strip()
+            else:
+                content = str(response).strip()
+            # Extract engine names
+            valid_engines = []
+            for engine_name in content.split(","):
+                cleaned_name = engine_name.strip().lower()
+                if cleaned_name in self.available_engines:
+                    valid_engines.append(cleaned_name)
             # If no valid engines were returned, use default order based on reliability
             if not valid_engines:
                 valid_engines = sorted(
-                    self.available_engines,
-                    key=lambda x: SEARCH_ENGINES[x]["reliability"],
-                    reverse=True
+                    self.available_engines,
+                    key=lambda x: SEARCH_ENGINES.get(x, {}).get("reliability", 0),
+                    reverse=True,
                 )
             return valid_engines
         except Exception as e:
             logger.error(f"Error analyzing query with LLM: {str(e)}")
             # Fall back to reliability-based ordering
             return sorted(
-                self.available_engines,
-                key=lambda x: SEARCH_ENGINES[x]["reliability"],
-                reverse=True
+                self.available_engines,
+                key=lambda x: SEARCH_ENGINES.get(x, {}).get("reliability", 0),
+                reverse=True,
             )
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information by selecting the best search engine for this query.
         Args:
             query: The search query
         Returns:
             List of preview dictionaries
         """
         # Get ranked list of engines for this query
         ranked_engines = self.analyze_query(query)
         if not ranked_engines:
-            logger.warning("No suitable search engines found for query, using fallback engine")
+            logger.warning(
+                "No suitable search engines found for query, using fallback engine"
+            )
             return self.fallback_engine._get_previews(query)
         # Limit the number of engines to try
-        engines_to_try = ranked_engines[:self.max_engines_to_try]
-        logger.info(f"Search plan created. Will try these engines in order: {', '.join(engines_to_try)}")
+        engines_to_try = ranked_engines[: self.max_engines_to_try]
+        logger.info(
+            f"SEARCH_PLAN: Will try these engines in order: {', '.join(engines_to_try)}"
+        )
         all_errors = []
         # Try each engine in order
         for engine_name in engines_to_try:
             logger.info(f"Trying search engine: {engine_name}")
             # Get or create the engine instance
             engine = self._get_engine_instance(engine_name)
             if not engine:
                 logger.warning(f"Failed to initialize {engine_name}, skipping")
                 all_errors.append(f"Failed to initialize {engine_name}")
                 continue
             try:
                 # Get previews from this engine
                 previews = engine._get_previews(query)
                 # If search was successful, return results
                 if previews and len(previews) > 0:
-                    logger.info(f"Successfully got {len(previews)} preview results from {engine_name}")
+                    logger.info(f"ENGINE_SELECTED: {engine_name}")
+                    logger.info(
+                        f"Successfully got {len(previews)} preview results from {engine_name}"
+                    )
                     # Store selected engine for later use
                     self._selected_engine = engine
                     self._selected_engine_name = engine_name
+                    # Emit a socket event to inform about the selected engine
+                    try:
+                        emit_socket_event(
+                            "search_engine_selected",
+                            {"engine": engine_name, "result_count": len(previews)},
+                        )
+                    except Exception as socket_error:
+                        logger.error(
+                            f"Socket emit error (non-critical): {str(socket_error)}"
+                        )
                     return previews
                 logger.info(f"{engine_name} returned no previews")
                 all_errors.append(f"{engine_name} returned no previews")
             except Exception as e:
                 error_msg = f"Error getting previews from {engine_name}: {str(e)}"
                 logger.error(error_msg)
                 all_errors.append(error_msg)
         # If we reach here, all engines failed, use fallback
-        logger.warning(f"All engines failed or returned no preview results: {', '.join(all_errors)}")
+        logger.warning(
+            f"All engines failed or returned no preview results: {', '.join(all_errors)}"
+        )
         logger.info("Using fallback Wikipedia engine for previews")
         self._selected_engine = self.fallback_engine
         self._selected_engine_name = "wikipedia"
         return self.fallback_engine._get_previews(query)
-    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content using the engine that provided the previews.
         Args:
             relevant_items: List of relevant preview dictionaries
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should get full content
-        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
         logger.info("Getting full content for relevant items")
         # Use the selected engine to get full content
-        if hasattr(self, '_selected_engine'):
+        if hasattr(self, "_selected_engine"):
             try:
                 logger.info(f"Using {self._selected_engine_name} to get full content")
                 return self._selected_engine._get_full_content(relevant_items)
             except Exception as e:
-                logger.error(f"Error getting full content from {self._selected_engine_name}: {str(e)}")
+                logger.error(
+                    f"Error getting full content from {self._selected_engine_name}: {str(e)}"
+                )
                 # Fall back to returning relevant items without full content
                 return relevant_items
         else:
-            logger.warning("No engine was selected during preview phase, returning relevant items as-is")
+            logger.warning(
+                "No engine was selected during preview phase, returning relevant items as-is"
+            )
             return relevant_items
     def _get_engine_instance(self, engine_name: str) -> Optional[BaseSearchEngine]:
         """Get or create an instance of the specified search engine"""
         # Return cached instance if available
         if engine_name in self.engine_cache:
             return self.engine_cache[engine_name]
         # Create a new instance
         engine = None
         try:
             # Only pass parameters that all engines accept
-            common_params = {
-                "llm": self.llm,
-                "max_results": self.max_results
-            }
+            common_params = {"llm": self.llm, "max_results": self.max_results}
             # Add max_filtered_results if specified
             if self.max_filtered_results is not None:
                 common_params["max_filtered_results"] = self.max_filtered_results
-            engine = create_search_engine(
-                engine_name,
-                **common_params
-            )
+            engine = create_search_engine(engine_name, **common_params)
         except Exception as e:
             logger.error(f"Error creating engine instance for {engine_name}: {str(e)}")
             return None
         if engine:
             # Cache the instance
             self.engine_cache[engine_name] = engine
         return engine
     def invoke(self, query: str) -> List[Dict[str, Any]]:
         """Compatibility method for LangChain tools"""
-        return self.run(query)
+        return self.run(query)

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl

local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl