local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
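Several of the renames above change import paths: the misspelled `utilties` package becomes `utilities`, the top-level `config.py` becomes `config/config_files.py` inside a new `config` package, and `defaults/llm_config.py` moves to `config/llm_config.py` alongside a new `config/search_config.py`. A minimal sketch of the resulting module paths (paths are taken from the file list above; no specific symbols inside the modules are assumed):

```python
# Module paths after the 0.1.26 -> 0.2.2 restructuring; each "was" comment
# gives the old location. Purely illustrative of the renames listed above.
import local_deep_research.utilities.enums          # was local_deep_research.utilties.enums
import local_deep_research.utilities.llm_utils      # was local_deep_research.utilties.llm_utils
import local_deep_research.config.config_files      # was local_deep_research.config (config.py)
import local_deep_research.config.llm_config        # was local_deep_research.defaults.llm_config
import local_deep_research.config.search_config     # new module in 0.2.2
```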
local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py

@@ -1,48 +1,50 @@
-import requests
 import logging
-import json
-from typing import Dict, List, Any, Optional, Tuple, Union
-from langchain_core.language_models import BaseLLM
-import time
 import re
-
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+from langchain_core.language_models import BaseLLM
 from requests.adapters import HTTPAdapter
 from urllib3.util import Retry

-from
-from
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine

 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class SemanticScholarSearchEngine(BaseSearchEngine):
     """
     Semantic Scholar search engine implementation with two-phase approach.
     Provides efficient access to scientific literature across all fields.
     """
-
-    def __init__(
-        [17 lines truncated in the source rendering]
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        api_key: Optional[str] = None,
+        year_range: Optional[Tuple[int, int]] = None,
+        get_abstracts: bool = True,
+        get_references: bool = False,
+        get_citations: bool = False,
+        get_embeddings: bool = False,
+        get_tldr: bool = True,
+        citation_limit: int = 10,
+        reference_limit: int = 10,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        optimize_queries: bool = True,
+        max_retries: int = 5,
+        retry_backoff_factor: float = 1.0,
+        fields_of_study: Optional[List[str]] = None,
+        publication_types: Optional[List[str]] = None,
+    ):
         """
         Initialize the Semantic Scholar search engine.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: Semantic Scholar API key for higher rate limits (optional)
@@ -63,8 +65,10 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
             publication_types: List of publication types to filter results
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
-        [line truncated in the source rendering]
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
+
         self.api_key = api_key
         self.year_range = year_range
         self.get_abstracts = get_abstracts
@@ -79,71 +83,76 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
         self.retry_backoff_factor = retry_backoff_factor
         self.fields_of_study = fields_of_study
         self.publication_types = publication_types
-
+
         # Base API URLs
         self.base_url = "https://api.semanticscholar.org/graph/v1"
         self.paper_search_url = f"{self.base_url}/paper/search"
         self.paper_details_url = f"{self.base_url}/paper"
-
+
         # Create a session with retry capabilities
         self.session = self._create_session()
-
+
         # Rate limiting
         self.rate_limit_wait = 1.0  # Default 1 second between requests
         self.last_request_time = 0
-
+
     def _create_session(self) -> requests.Session:
         """Create and configure a requests session with retry capabilities"""
         session = requests.Session()
-
+
         # Configure automatic retries with exponential backoff
         retry_strategy = Retry(
             total=self.max_retries,
             backoff_factor=self.retry_backoff_factor,
             status_forcelist=[429, 500, 502, 503, 504],
-            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"}
+            allowed_methods={"HEAD", "GET", "POST", "OPTIONS"},
         )
-
+
         adapter = HTTPAdapter(max_retries=retry_strategy)
         session.mount("https://", adapter)
-
+
         # Set up headers
         headers = {"Accept": "application/json"}
         if self.api_key:
             headers["x-api-key"] = self.api_key
-
+
         session.headers.update(headers)
-
+
         return session
-
+
     def _respect_rate_limit(self):
         """Apply rate limiting between requests"""
         current_time = time.time()
        elapsed = current_time - self.last_request_time
-
+
        if elapsed < self.rate_limit_wait:
            wait_time = self.rate_limit_wait - elapsed
-            logger.debug(
+            logger.debug("Rate limiting: waiting %.2f s", wait_time)
            time.sleep(wait_time)
-
+
        self.last_request_time = time.time()
-
-    def _make_request(
-        [line truncated in the source rendering]
+
+    def _make_request(
+        self,
+        url: str,
+        params: Optional[Dict] = None,
+        data: Optional[Dict] = None,
+        method: str = "GET",
+    ) -> Dict:
         """
         Make a request to the Semantic Scholar API.
-
+
         Args:
             url: API endpoint URL
             params: Query parameters
             data: JSON data for POST requests
             method: HTTP method (GET or POST)
-
+
         Returns:
             API response as dictionary
         """
         self._respect_rate_limit()
-
+
         try:
             if method.upper() == "GET":
                 response = self.session.get(url, params=params, timeout=30)
@@ -151,34 +160,34 @@ class SemanticScholarSearchEngine(BaseSearchEngine):
                 response = self.session.post(url, params=params, json=data, timeout=30)
             else:
                 raise ValueError(f"Unsupported HTTP method: {method}")
-
+
             # Handle rate limiting manually if retry strategy fails
             if response.status_code == 429:
                 logger.warning("Rate limit exceeded, waiting and retrying...")
                 time.sleep(2.0)  # Wait longer on rate limit
                 self.rate_limit_wait *= 1.5  # Increase wait time for future requests
                 return self._make_request(url, params, data, method)  # Retry
-
+
             response.raise_for_status()
             return response.json()
         except requests.RequestException as e:
             logger.error(f"API request failed: {e}")
             return {}
-
+
     def _optimize_query(self, query: str) -> str:
         """
         Optimize a natural language query for Semantic Scholar search.
         If LLM is available, uses it to extract key terms and concepts.
-
+
         Args:
             query: Natural language query
-
+
         Returns:
             Optimized query string
         """
         if not self.llm or not self.optimize_queries:
             return query
-
+
         try:
             prompt = f"""Transform this natural language question into an optimized academic search query.

@@ -198,113 +207,122 @@ EXAMPLE TRANSFORMATIONS:

 Return ONLY the optimized search query with no explanation.
 """
-
+
             response = self.llm.invoke(prompt)
             optimized_query = response.content.strip()
-
+
             # Clean up the query - remove any explanations
-            lines = optimized_query.split(
+            lines = optimized_query.split("\n")
             optimized_query = lines[0].strip()
-
+
             # Safety check - if query looks too much like an explanation, use original
             if len(optimized_query.split()) > 15 or ":" in optimized_query:
-                logger.warning(
+                logger.warning(
+                    "Query optimization result looks too verbose, using original"
+                )
                 return query
-
+
             logger.info(f"Original query: '{query}'")
             logger.info(f"Optimized for search: '{optimized_query}'")
-
+
             return optimized_query
         except Exception as e:
             logger.error(f"Error optimizing query: {e}")
             return query  # Fall back to original query on error
-
+
     def _direct_search(self, query: str) -> List[Dict[str, Any]]:
         """
         Make a direct search request to the Semantic Scholar API.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of paper dictionaries
         """
         try:
             # Configure fields to retrieve
             fields = [
-                "paperId",
-                "externalIds",
-                "url",
-                "title",
-                "abstract",
-                "venue",
-                "year",
-                "authors"
+                "paperId",
+                "externalIds",
+                "url",
+                "title",
+                "abstract",
+                "venue",
+                "year",
+                "authors",
             ]
-
+
             if self.get_tldr:
                 fields.append("tldr")
-
+
             params = {
                 "query": query,
                 "limit": min(self.max_results, 100),  # API limit is 100 per request
-                "fields": ",".join(fields)
+                "fields": ",".join(fields),
             }
-
+
             # Add year filter if specified
             if self.year_range:
                 start_year, end_year = self.year_range
                 params["year"] = f"{start_year}-{end_year}"
-
+
             # Add fields of study filter if specified
             if self.fields_of_study:
                 params["fieldsOfStudy"] = ",".join(self.fields_of_study)
-
+
             # Add publication types filter if specified
             if self.publication_types:
                 params["publicationTypes"] = ",".join(self.publication_types)
-
+
             response = self._make_request(self.paper_search_url, params)
-
+
             if "data" in response:
                 papers = response["data"]
-                logger.info(
+                logger.info(
+                    f"Found {len(papers)} papers with direct search for query: '{query}'"
+                )
                 return papers
             else:
-                logger.warning(
+                logger.warning(
+                    f"No data in response for direct search query: '{query}'"
+                )
                 return []
-
+
         except Exception as e:
             logger.error(f"Error in direct search: {e}")
             return []
-
+
     def _adaptive_search(self, query: str) -> Tuple[List[Dict[str, Any]], str]:
         """
         Perform an adaptive search that adjusts based on result volume.
         Uses LLM to generate better fallback queries when available.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Tuple of (list of paper results, search strategy used)
         """
         # Start with a standard search
         papers = self._direct_search(query)
         strategy = "standard"
-
+
         # If no results, try different variations
         if not papers:
             # Try removing quotes to broaden search
             if '"' in query:
-                unquoted_query = query.replace('"',
-                logger.info(
+                unquoted_query = query.replace('"', "")
+                logger.info(
+                    "No results with quoted terms, trying without quotes: %s",
+                    unquoted_query,
+                )
                 papers = self._direct_search(unquoted_query)
-
+
                 if papers:
                     strategy = "unquoted"
                     return papers, strategy
-
+
             # If LLM is available, use it to generate better fallback queries
             if self.llm:
                 try:
@@ -325,99 +343,109 @@ Format each query on a new line with no numbering or explanation. Keep each quer
 """
                     # Get the LLM's response
                     response = self.llm.invoke(prompt)
-
+
                     # Extract the alternative queries
                     alt_queries = []
-                    if hasattr(
+                    if hasattr(
+                        response, "content"
+                    ):  # Handle various LLM response formats
                         content = response.content
-                        alt_queries = [
+                        alt_queries = [
+                            q.strip() for q in content.strip().split("\n") if q.strip()
+                        ]
                     elif isinstance(response, str):
-                        alt_queries = [
-                        [line truncated in the source rendering]
+                        alt_queries = [
+                            q.strip() for q in response.strip().split("\n") if q.strip()
+                        ]
+
                     # Try each alternative query
                     for alt_query in alt_queries[:3]:  # Limit to first 3 alternatives
-                        logger.info(
+                        logger.info("Trying LLM-suggested query: %s", alt_query)
                         alt_papers = self._direct_search(alt_query)
-
+
                         if alt_papers:
-                            logger.info(
+                            logger.info(
+                                "Found %s papers using LLM-suggested query: %s",
+                                len(alt_papers),
+                                alt_query,
+                            )
                             strategy = "llm_alternative"
                             return alt_papers, strategy
                 except Exception as e:
-                    logger.error(
+                    logger.error("Error using LLM for query refinement: %s", e)
                     # Fall through to simpler strategies
-
+
             # Fallback: Try with the longest words (likely specific terms)
-            words = re.findall(r
+            words = re.findall(r"\w+", query)
             longer_words = [word for word in words if len(word) > 6]
             if longer_words:
                 # Use up to 3 of the longest words
                 longer_words = sorted(longer_words, key=len, reverse=True)[:3]
-                key_terms_query =
-                logger.info(
+                key_terms_query = " ".join(longer_words)
+                logger.info("Trying with key terms: %s", key_terms_query)
                 papers = self._direct_search(key_terms_query)
-
+
                 if papers:
                     strategy = "key_terms"
                     return papers, strategy
-
+
             # Final fallback: Try with just the longest word
             if words:
                 longest_word = max(words, key=len)
                 if len(longest_word) > 5:  # Only use if it's reasonably long
-                    logger.info(
+                    logger.info("Trying with single key term: %s", longest_word)
                     papers = self._direct_search(longest_word)
-
+
                     if papers:
                         strategy = "single_term"
                         return papers, strategy
-
+
         return papers, strategy
-
+
     def _get_paper_details(self, paper_id: str) -> Dict[str, Any]:
         """
         Get detailed information about a specific paper.
-
+
         Args:
             paper_id: Semantic Scholar Paper ID
-
+
         Returns:
             Dictionary with paper details
         """
         try:
             # Construct fields parameter
             fields = [
-                "paperId",
-                "externalIds",
-                "corpusId",
-                "url",
-                "title",
-                "abstract",
-                "venue",
-                "year",
-                "authors",
-                "fieldsOfStudy"
+                "paperId",
+                "externalIds",
+                "corpusId",
+                "url",
+                "title",
+                "abstract",
+                "venue",
+                "year",
+                "authors",
+                "fieldsOfStudy",
             ]
-
+
             if self.get_tldr:
                 fields.append("tldr")
-
+
             if self.get_embeddings:
                 fields.append("embedding")
-
+
             # Add citation and reference fields if requested
             if self.get_citations:
                 fields.append(f"citations.limit({self.citation_limit})")
-
+
             if self.get_references:
                 fields.append(f"references.limit({self.reference_limit})")
-
+
             # Make the request
             url = f"{self.paper_details_url}/{paper_id}"
             params = {"fields": ",".join(fields)}
-
+
             return self._make_request(url, params)
-
+
         except Exception as e:
             logger.error(f"Error getting paper details for {paper_id}: {e}")
             return {}
@@ -425,25 +453,25 @@ Format each query on a new line with no numbering or explanation. Keep each quer
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for Semantic Scholar papers.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting Semantic Scholar previews for query: {query}")
-
+
         # Optimize the query if LLM is available
         optimized_query = self._optimize_query(query)
-
+
         # Use the adaptive search approach
         papers, strategy = self._adaptive_search(optimized_query)
-
+
         if not papers:
-            logger.warning(
+            logger.warning("No Semantic Scholar results found")
             return []
-
+
         # Format as previews
         previews = []
         for paper in papers:
@@ -451,28 +479,34 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                 # Format authors - ensure we have a valid list with string values
                 authors = []
                 if "authors" in paper and paper["authors"]:
-                    authors = [
-                    [line truncated in the source rendering]
+                    authors = [
+                        author.get("name", "")
+                        for author in paper["authors"]
+                        if author and author.get("name")
+                    ]
+
                 # Ensure we have valid strings for all fields
                 paper_id = paper.get("paperId", "")
                 title = paper.get("title", "")
                 url = paper.get("url", "")
-
+
                 # Handle abstract safely, ensuring we always have a string
                 abstract = paper.get("abstract")
                 snippet = ""
                 if abstract:
-                    snippet =
-                    [line truncated in the source rendering]
+                    snippet = (
+                        abstract[:250] + "..." if len(abstract) > 250 else abstract
+                    )
+
                 venue = paper.get("venue", "")
                 year = paper.get("year")
                 external_ids = paper.get("externalIds", {})
-
+
                 # Handle TLDR safely
                 tldr_text = ""
                 if paper.get("tldr") and isinstance(paper.get("tldr"), dict):
                     tldr_text = paper.get("tldr", {}).get("text", "")
-
+
                 # Create preview with basic information, ensuring no None values
                 preview = {
                     "id": paper_id if paper_id else "",
@@ -486,76 +520,85 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                     "source": "Semantic Scholar",
                     "_paper_id": paper_id if paper_id else "",
                     "_search_strategy": strategy,
-                    "tldr": tldr_text
+                    "tldr": tldr_text,
                 }
-
+
                 # Store the full paper object for later reference
                 preview["_full_paper"] = paper
-
+
                 previews.append(preview)
             except Exception as e:
                 logger.error(f"Error processing paper preview: {e}")
                 # Continue with the next paper
-
+
         # Sort by year (newer first) if available
         previews = sorted(
             previews,
             key=lambda p: p.get("year", 0) if p.get("year") is not None else 0,
-            reverse=True
+            reverse=True,
+        )
+
+        logger.info(
+            f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}"
         )
-
-        logger.info(f"Found {len(previews)} Semantic Scholar previews using strategy: {strategy}")
         return previews
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant Semantic Scholar papers.
         Gets additional details like citations, references, and full metadata.
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
-        logger.info(
-        [line truncated in the source rendering]
+
+        logger.info(
+            f"Getting content for {len(relevant_items)} Semantic Scholar papers"
+        )
+
         results = []
         for item in relevant_items:
             result = item.copy()
             paper_id = item.get("_paper_id", "")
-
+
             # Skip if no paper ID
             if not paper_id:
                 results.append(result)
                 continue
-
+
             # Get paper details if citations or references are requested
             if self.get_citations or self.get_references or self.get_embeddings:
                 paper_details = self._get_paper_details(paper_id)
-
+
                 if paper_details:
                     # Add citation information
                     if self.get_citations and "citations" in paper_details:
                         result["citations"] = paper_details["citations"]
-
+
                     # Add reference information
                     if self.get_references and "references" in paper_details:
                         result["references"] = paper_details["references"]
-
+
                     # Add embedding if available
                     if self.get_embeddings and "embedding" in paper_details:
                         result["embedding"] = paper_details["embedding"]
-
+
                    # Add fields of study
                    if "fieldsOfStudy" in paper_details:
                        result["fields_of_study"] = paper_details["fieldsOfStudy"]
-
+
            # Remove temporary fields
            if "_paper_id" in result:
                del result["_paper_id"]
@@ -563,7 +606,7 @@ Format each query on a new line with no numbering or explanation. Keep each quer
                del result["_search_strategy"]
            if "_full_paper" in result:
                del result["_full_paper"]
-
+
            results.append(result)
-
-        return results
+
+        return results