PyPI - local-deep-research - Versions diffs - 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl - Mend

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

local_deep_research/web_search_engines/engines/search_engine_elasticsearch.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import json
 import logging
 from typing import Any, Dict, List, Optional
@@ -48,40 +47,48 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.index_name = index_name
         self.highlight_fields = highlight_fields
         self.search_fields = search_fields
         self.filter_query = filter_query or {}
         # Initialize the Elasticsearch client
         es_args = {}
         # Basic authentication
         if username and password:
             es_args["basic_auth"] = (username, password)
         # API key authentication
         if api_key:
             es_args["api_key"] = api_key
         # Cloud ID for Elastic Cloud
         if cloud_id:
             es_args["cloud_id"] = cloud_id
         # Connect to Elasticsearch
         self.client = Elasticsearch(hosts, **es_args)
         # Verify connection
         try:
             info = self.client.info()
-            logger.info(f"Connected to Elasticsearch cluster: {info.get('cluster_name')}")
-            logger.info(f"Elasticsearch version: {info.get('version', {}).get('number')}")
+            logger.info(
+                f"Connected to Elasticsearch cluster: {info.get('cluster_name')}"
+            )
+            logger.info(
+                f"Elasticsearch version: {info.get('version', {}).get('number')}"
+            )
         except Exception as e:
             logger.error(f"Failed to connect to Elasticsearch: {str(e)}")
-            raise ConnectionError(f"Could not connect to Elasticsearch: {str(e)}")
+            raise ConnectionError(
+                f"Could not connect to Elasticsearch: {str(e)}"
+            )
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
@@ -93,7 +100,9 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
         Returns:
             List of preview dictionaries
         """
-        logger.info(f"Getting document previews from Elasticsearch with query: {query}")
+        logger.info(
+            f"Getting document previews from Elasticsearch with query: {query}"
+        )
         try:
             # Build the search query
@@ -113,31 +122,31 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
                 },
                 "size": self.max_results,
             }
             # Add filter if provided
             if self.filter_query:
                 search_query["query"] = {
                     "bool": {
                         "must": search_query["query"],
-                        "filter": self.filter_query
+                        "filter": self.filter_query,
                     }
                 }
             # Execute the search
             response = self.client.search(
                 index=self.index_name,
                 body=search_query,
             )
             # Process the search results
             hits = response.get("hits", {}).get("hits", [])
             # Format results as previews with basic information
             previews = []
             for hit in hits:
                 source = hit.get("_source", {})
                 highlight = hit.get("highlight", {})
                 # Extract highlighted snippets or fall back to original content
                 snippet = ""
                 for field in self.highlight_fields:
@@ -145,25 +154,30 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
                         # Join all highlights for this field
                         field_snippets = " ... ".join(highlight[field])
                         snippet += field_snippets + " "
                 # If no highlights, use a portion of the content
                 if not snippet and "content" in source:
                     content = source.get("content", "")
-                    snippet = content[:250] + "..." if len(content) > 250 else content
+                    snippet = (
+                        content[:250] + "..." if len(content) > 250 else content
+                    )
                 # Create preview object
                 preview = {
                     "id": hit.get("_id", ""),
                     "title": source.get("title", "Untitled Document"),
-                    "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                    "link": source.get("url", "")
+                    or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
                     "snippet": snippet.strip(),
                     "score": hit.get("_score", 0),
                     "_index": hit.get("_index", self.index_name),
                 }
                 previews.append(preview)
-            logger.info(f"Found {len(previews)} preview results from Elasticsearch")
+            logger.info(
+                f"Found {len(previews)} preview results from Elasticsearch"
+            )
             return previews
         except Exception as e:
@@ -196,7 +210,7 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
         for item in relevant_items:
             # Start with the preview data
             result = item.copy()
             # Get the document ID
             doc_id = item.get("id")
             if not doc_id:
@@ -204,30 +218,34 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
                 logger.warning(f"Skipping item without ID: {item}")
                 results.append(result)
                 continue
             try:
                 # Fetch the full document
                 doc_response = self.client.get(
                     index=self.index_name,
                     id=doc_id,
                 )
                 # Get the source document
                 source = doc_response.get("_source", {})
                 # Add full content to the result
-                result["content"] = source.get("content", result.get("snippet", ""))
+                result["content"] = source.get(
+                    "content", result.get("snippet", "")
+                )
                 result["full_content"] = source.get("content", "")
                 # Add metadata from source
                 for key, value in source.items():
                     if key not in result and key not in ["content"]:
                         result[key] = value
             except Exception as e:
-                logger.error(f"Error fetching full content for document {doc_id}: {str(e)}")
+                logger.error(
+                    f"Error fetching full content for document {doc_id}: {str(e)}"
+                )
                 # Keep the preview data if we can't get the full content
             results.append(result)
         return results
@@ -235,10 +253,10 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
     def search_by_query_string(self, query_string: str) -> List[Dict[str, Any]]:
         """
         Perform a search using Elasticsearch Query String syntax.
         Args:
             query_string: The query in Elasticsearch Query String syntax
         Returns:
             List of search results
         """
@@ -258,28 +276,28 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
                 },
                 "size": self.max_results,
             }
             # Execute the search
             response = self.client.search(
                 index=self.index_name,
                 body=search_query,
             )
             # Process and return the results
             previews = self._process_es_response(response)
             return self._get_full_content(previews)
         except Exception as e:
             logger.error(f"Error in query_string search: {str(e)}")
             return []
     def search_by_dsl(self, query_dsl: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Perform a search using Elasticsearch DSL (Query Domain Specific Language).
         Args:
             query_dsl: The query in Elasticsearch DSL format
         Returns:
             List of search results
         """
@@ -289,55 +307,60 @@ class ElasticsearchSearchEngine(BaseSearchEngine):
                 index=self.index_name,
                 body=query_dsl,
             )
             # Process and return the results
             previews = self._process_es_response(response)
             return self._get_full_content(previews)
         except Exception as e:
             logger.error(f"Error in DSL search: {str(e)}")
             return []
-    def _process_es_response(self, response: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def _process_es_response(
+        self, response: Dict[str, Any]
+    ) -> List[Dict[str, Any]]:
         """
         Process Elasticsearch response into preview dictionaries.
         Args:
             response: Elasticsearch response dictionary
         Returns:
             List of preview dictionaries
         """
         hits = response.get("hits", {}).get("hits", [])
         # Format results as previews
         previews = []
         for hit in hits:
             source = hit.get("_source", {})
             highlight = hit.get("highlight", {})
             # Extract highlighted snippets or fall back to original content
             snippet = ""
             for field in self.highlight_fields:
                 if field in highlight and highlight[field]:
                     field_snippets = " ... ".join(highlight[field])
                     snippet += field_snippets + " "
             # If no highlights, use a portion of the content
             if not snippet and "content" in source:
                 content = source.get("content", "")
-                snippet = content[:250] + "..." if len(content) > 250 else content
+                snippet = (
+                    content[:250] + "..." if len(content) > 250 else content
+                )
             # Create preview object
             preview = {
                 "id": hit.get("_id", ""),
                 "title": source.get("title", "Untitled Document"),
-                "link": source.get("url", "") or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
+                "link": source.get("url", "")
+                or f"elasticsearch://{self.index_name}/{hit.get('_id', '')}",
                 "snippet": snippet.strip(),
                 "score": hit.get("_score", 0),
                 "_index": hit.get("_index", self.index_name),
             }
             previews.append(preview)
-        return previews
+        return previews

local_deep_research/web_search_engines/engines/search_engine_github.py CHANGED Viewed

@@ -46,7 +46,9 @@ class GitHubSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.api_key = api_key or os.getenv("GITHUB_API_KEY")
         self.search_type = search_type
@@ -224,7 +226,9 @@ class GitHubSearchEngine(BaseSearchEngine):
                 # If no results, try to provide more guidance
                 if not results:
-                    logger.warning("No results found. Consider these search tips:")
+                    logger.warning(
+                        "No results found. Consider these search tips:"
+                    )
                     logger.warning("1. Use shorter, more specific queries")
                     logger.warning(
                         "2. For repositories, try adding 'stars:>100' or 'language:python'"
@@ -255,7 +259,8 @@ class GitHubSearchEngine(BaseSearchEngine):
         try:
             # Get README
             response = requests.get(
-                f"{self.api_base}/repos/{repo_full_name}/readme", headers=self.headers
+                f"{self.api_base}/repos/{repo_full_name}/readme",
+                headers=self.headers,
             )
             # Check for rate limiting
@@ -267,7 +272,9 @@ class GitHubSearchEngine(BaseSearchEngine):
                 encoding = data.get("encoding", "")
                 if encoding == "base64" and content:
-                    return base64.b64decode(content).decode("utf-8", errors="replace")
+                    return base64.b64decode(content).decode(
+                        "utf-8", errors="replace"
+                    )
                 return content
             else:
                 logger.warning(
@@ -312,7 +319,9 @@ class GitHubSearchEngine(BaseSearchEngine):
             if response.status_code == 200:
                 issues = response.json()
-                logger.info(f"Got {len(issues)} recent issues for {repo_full_name}")
+                logger.info(
+                    f"Got {len(issues)} recent issues for {repo_full_name}"
+                )
             else:
                 logger.warning(
                     f"Could not get issues for {repo_full_name}: {response.status_code}"
@@ -346,17 +355,23 @@ class GitHubSearchEngine(BaseSearchEngine):
                 encoding = data.get("encoding", "")
                 if encoding == "base64" and content:
-                    return base64.b64decode(content).decode("utf-8", errors="replace")
+                    return base64.b64decode(content).decode(
+                        "utf-8", errors="replace"
+                    )
                 return content
             else:
-                logger.warning(f"Could not get file content: {response.status_code}")
+                logger.warning(
+                    f"Could not get file content: {response.status_code}"
+                )
                 return ""
         except Exception as e:
             logger.error(f"Error getting file content: {e}")
             return ""
-    def _format_repository_preview(self, repo: Dict[str, Any]) -> Dict[str, Any]:
+    def _format_repository_preview(
+        self, repo: Dict[str, Any]
+    ) -> Dict[str, Any]:
         """Format repository search result as preview"""
         return {
             "id": str(repo.get("id", "")),
@@ -393,7 +408,9 @@ class GitHubSearchEngine(BaseSearchEngine):
     def _format_issue_preview(self, issue: Dict[str, Any]) -> Dict[str, Any]:
         """Format issue search result as preview"""
         repo = (
-            issue.get("repository", {}) if "repository" in issue else {"full_name": ""}
+            issue.get("repository", {})
+            if "repository" in issue
+            else {"full_name": ""}
         )
         return {
             "id": f"issue_{issue.get('number', '')}",
@@ -503,7 +520,9 @@ class GitHubSearchEngine(BaseSearchEngine):
                     keywords[:5]
                 )  # Add up to 5 keywords
-            logger.info(f"Using specialized contribution query: {specialized_query}")
+            logger.info(
+                f"Using specialized contribution query: {specialized_query}"
+            )
             # Perform GitHub search with specialized query
             results = self._search_github(specialized_query)
@@ -560,7 +579,9 @@ class GitHubSearchEngine(BaseSearchEngine):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-        logger.info(f"Getting full content for {len(relevant_items)} GitHub results")
+        logger.info(
+            f"Getting full content for {len(relevant_items)} GitHub results"
+        )
         results = []
         for item in relevant_items:
@@ -610,7 +631,10 @@ class GitHubSearchEngine(BaseSearchEngine):
                     f"Public repositories: {item.get('public_repos', 0)}\n"
                 )
-                if item.get("snippet") and item.get("snippet") != "No bio provided":
+                if (
+                    item.get("snippet")
+                    and item.get("snippet") != "No bio provided"
+                ):
                     profile_summary += f"\nBio: {item.get('snippet')}\n"
                 result["full_content"] = profile_summary
@@ -620,7 +644,9 @@ class GitHubSearchEngine(BaseSearchEngine):
         return results
-    def search_repository(self, repo_owner: str, repo_name: str) -> Dict[str, Any]:
+    def search_repository(
+        self, repo_owner: str, repo_name: str
+    ) -> Dict[str, Any]:
         """
         Get detailed information about a specific repository.
@@ -672,7 +698,10 @@ class GitHubSearchEngine(BaseSearchEngine):
             return {}
     def search_code(
-        self, query: str, language: Optional[str] = None, user: Optional[str] = None
+        self,
+        query: str,
+        language: Optional[str] = None,
+        user: Optional[str] = None,
     ) -> List[Dict[str, Any]]:
         """
         Search for code with more specific parameters.
@@ -769,7 +798,9 @@ class GitHubSearchEngine(BaseSearchEngine):
                 results = data.get("items", [])
                 # Format results
-                previews = [self._format_issue_preview(result) for result in results]
+                previews = [
+                    self._format_issue_preview(result) for result in results
+                ]
                 # For issues, we don't need to get full content
                 return previews

local_deep_research/web_search_engines/engines/search_engine_google_pse.py CHANGED Viewed

@@ -51,7 +51,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.include_full_content = include_full_content
@@ -61,7 +63,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
         # Rate limiting - keep track of last request time
         self.last_request_time = 0
-        self.min_request_interval = 0.5  # Minimum time between requests in seconds
+        self.min_request_interval = (
+            0.5  # Minimum time between requests in seconds
+        )
         # Language code mapping
         language_code_mapping = {
@@ -92,7 +96,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
         self.api_key = api_key
         if not self.api_key:
-            self.api_key = get_db_setting("search.engine.web.google_pse.api_key")
+            self.api_key = get_db_setting(
+                "search.engine.web.google_pse.api_key"
+            )
         self.search_engine_id = search_engine_id
         if not self.search_engine_id:
@@ -187,7 +193,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
                 # Add jitter to retries after the first attempt
                 if attempt > 0:
                     jitter = random.uniform(0.5, 1.5)
-                    sleep_time = self.retry_delay * (2 ** (attempt - 1)) * jitter
+                    sleep_time = (
+                        self.retry_delay * (2 ** (attempt - 1)) * jitter
+                    )
                     logger.info(
                         "Retry attempt %s / %s for query '%s'. Waiting %s s",
                         attempt + 1,
@@ -272,7 +280,7 @@ class GooglePSESearchEngine(BaseSearchEngine):
                         {
                             "title": title,
                             "snippet": snippet,
-                            "url": url,
+                            "link": url,
                             "source": "Google Programmable Search",
                         }
                     )
@@ -296,7 +304,9 @@ class GooglePSESearchEngine(BaseSearchEngine):
                 logger.error("Error getting search results: %s", str(e))
                 break
-        logger.info("Retrieved %s search results for query: '%s'", len(results), query)
+        logger.info(
+            "Retrieved %s search results for query: '%s'", len(results), query
+        )
         return results
     def _get_full_content(

local_deep_research/web_search_engines/engines/search_engine_guardian.py CHANGED Viewed

@@ -48,7 +48,9 @@ class GuardianSearchEngine(BaseSearchEngine):
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
         super().__init__(
-            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+            llm=llm,
+            max_filtered_results=max_filtered_results,
+            max_results=max_results,
         )
         self.api_key = api_key or os.getenv("GUARDIAN_API_KEY")
         self.optimize_queries = optimize_queries
@@ -204,15 +206,19 @@ ONE WORD ONLY:"""
                 logger.info(
                     "Query classified as HISTORICAL - extending search timeframe"
                 )
-                ten_years_ago = (datetime.now() - timedelta(days=3650)).strftime(
-                    "%Y-%m-%d"
-                )
+                ten_years_ago = (
+                    datetime.now() - timedelta(days=3650)
+                ).strftime("%Y-%m-%d")
                 self.from_date = ten_years_ago
             elif "CURRENT" in answer:
                 # For current events, focus on recent content
-                logger.info("Query classified as CURRENT - focusing on recent content")
-                recent = (datetime.now() - timedelta(days=60)).strftime("%Y-%m-%d")
+                logger.info(
+                    "Query classified as CURRENT - focusing on recent content"
+                )
+                recent = (datetime.now() - timedelta(days=60)).strftime(
+                    "%Y-%m-%d"
+                )
                 self.from_date = recent
                 self.order_by = "newest"  # Prioritize newest for current events
@@ -246,7 +252,9 @@ ONE WORD ONLY:"""
             # Strategy 1: Expand to 6 months
             logger.info("Strategy 1: Expanding time range to 6 months")
-            six_months_ago = (datetime.now() - timedelta(days=180)).strftime("%Y-%m-%d")
+            six_months_ago = (datetime.now() - timedelta(days=180)).strftime(
+                "%Y-%m-%d"
+            )
             self.from_date = six_months_ago
             articles1 = self._get_all_data(query)
@@ -256,7 +264,9 @@ ONE WORD ONLY:"""
             # Strategy 2: Expand to all time and try relevance order
             if len(articles) < 3:
-                logger.info("Strategy 2: Expanding to all time with relevance ordering")
+                logger.info(
+                    "Strategy 2: Expanding to all time with relevance ordering"
+                )
                 self.from_date = "2000-01-01"  # Effectively "all time"
                 self.order_by = "relevance"
@@ -315,12 +325,15 @@ ONE WORD ONLY:"""
             # Always request all fields for simplicity
             # Ensure max_results is an integer to avoid comparison errors
             page_size = min(
-                int(self.max_results) if self.max_results is not None else 10, 50
+                int(self.max_results) if self.max_results is not None else 10,
+                50,
             )
             # Log full parameters for debugging
             logger.info(f"Guardian API search query: '{query}'")
-            logger.info(f"Guardian API date range: {self.from_date} to {self.to_date}")
+            logger.info(
+                f"Guardian API date range: {self.from_date} to {self.to_date}"
+            )
             params = {
                 "q": query,
@@ -363,7 +376,9 @@ ONE WORD ONLY:"""
                 # Format the article with all fields
                 result = {
                     "id": article.get("id", ""),
-                    "title": fields.get("headline", article.get("webTitle", "")),
+                    "title": fields.get(
+                        "headline", article.get("webTitle", "")
+                    ),
                     "link": article.get("webUrl", ""),
                     "snippet": fields.get("trailText", ""),
                     "publication_date": article.get("webPublicationDate", ""),
@@ -399,7 +414,9 @@ ONE WORD ONLY:"""
         Returns:
             List of preview dictionaries
         """
-        logger.info(f"Getting articles from The Guardian API for query: {query}")
+        logger.info(
+            f"Getting articles from The Guardian API for query: {query}"
+        )
         # Step 1: Optimize the query using LLM
         optimized_query = self._optimize_query_for_guardian(query)
@@ -471,7 +488,10 @@ ONE WORD ONLY:"""
             article_id = item.get("id", "")
             # Get the full article from our cache
-            if hasattr(self, "_full_articles") and article_id in self._full_articles:
+            if (
+                hasattr(self, "_full_articles")
+                and article_id in self._full_articles
+            ):
                 results.append(self._full_articles[article_id])
             else:
                 # If not found (shouldn't happen), just use the preview
@@ -502,7 +522,9 @@ ONE WORD ONLY:"""
             # If no results, try one more time with a simplified query
             if not previews:
-                simple_query = " ".join([w for w in query.split() if len(w) > 3][:3])
+                simple_query = " ".join(
+                    [w for w in query.split() if len(w) > 3][:3]
+                )
                 logger.warning(
                     f"No Guardian articles found, trying simplified query: {simple_query}"
                 )
@@ -518,7 +540,9 @@ ONE WORD ONLY:"""
             # If still no results after all attempts, return empty list
             if not previews:
-                logger.warning("No Guardian articles found after multiple attempts")
+                logger.warning(
+                    "No Guardian articles found after multiple attempts"
+                )
                 return []
             # Filter for relevance if we have an LLM

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl

local-deep-research 0.4.4__py3-none-any.whl → 0.5.2__py3-none-any.whl