local-deep-research 0.1.26__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +96 -84
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +72 -44
- local_deep_research/search_system.py +147 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1592 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +211 -159
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/METADATA +177 -97
- local_deep_research-0.2.0.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.0.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_pubmed.py

@@ -1,38 +1,41 @@
-import requests
 import logging
+import re
+import time
 import xml.etree.ElementTree as ET
-from typing import Dict, List,
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
 from langchain_core.language_models import BaseLLM
-import time
-import re
-from datetime import datetime
 
-from
-from
+from ...config import search_config
+from ..search_engine_base import BaseSearchEngine
 
 # Setup logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+
 class PubMedSearchEngine(BaseSearchEngine):
     """
     PubMed search engine implementation with two-phase approach and adaptive search.
     Provides efficient access to biomedical literature while minimizing API usage.
     """
-
-    def __init__(
-
-
-
-
-
-
-
-
-
+
+    def __init__(
+        self,
+        max_results: int = 10,
+        api_key: Optional[str] = None,
+        days_limit: Optional[int] = None,
+        get_abstracts: bool = True,
+        get_full_text: bool = False,
+        full_text_limit: int = 3,
+        llm: Optional[BaseLLM] = None,
+        max_filtered_results: Optional[int] = None,
+        optimize_queries: bool = True,
+    ):
         """
         Initialize the PubMed search engine.
-
+
         Args:
             max_results: Maximum number of search results
             api_key: NCBI API key for higher rate limits (optional)
@@ -45,32 +48,34 @@ class PubMedSearchEngine(BaseSearchEngine):
             optimize_queries: Whether to optimize natural language queries for PubMed
         """
         # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
-        super().__init__(
-
+        super().__init__(
+            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
+        )
+        self.max_results = max(self.max_results, 25)
         self.api_key = api_key
         self.days_limit = days_limit
         self.get_abstracts = get_abstracts
         self.get_full_text = get_full_text
         self.full_text_limit = full_text_limit
         self.optimize_queries = optimize_queries
-
+
         # Base API URLs
         self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
         self.search_url = f"{self.base_url}/esearch.fcgi"
         self.summary_url = f"{self.base_url}/esummary.fcgi"
         self.fetch_url = f"{self.base_url}/efetch.fcgi"
         self.link_url = f"{self.base_url}/elink.fcgi"
-
+
         # PMC base URL for full text
         self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
-
+
     def _get_result_count(self, query: str) -> int:
         """
         Get the total number of results for a query without retrieving the results themselves.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Total number of matching results
         """
@@ -80,69 +85,70 @@ class PubMedSearchEngine(BaseSearchEngine):
                 "db": "pubmed",
                 "term": query,
                 "retmode": "json",
-                "retmax": 0 # Don't need actual results, just the count
+                "retmax": 0,  # Don't need actual results, just the count
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute search request
             response = requests.get(self.search_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             count = int(data["esearchresult"]["count"])
-
-            logger.info(
+
+            logger.info("Query '%s' has %s total results in PubMed", query, count)
             return count
-
+
         except Exception as e:
             logger.error(f"Error getting result count: {e}")
             return 0
-
+
     def _extract_core_terms(self, query: str) -> str:
         """
         Extract core terms from a complex query for volume estimation.
-
+
         Args:
             query: PubMed query string
-
+
         Returns:
             Simplified query with core terms
         """
         # Remove field specifications and operators
-        simplified = re.sub(r
-        simplified = re.sub(r
-
+        simplified = re.sub(r"\[\w+\]", "", query)  # Remove [Field] tags
+        simplified = re.sub(r"\b(AND|OR|NOT)\b", "", simplified)  # Remove operators
+
         # Remove quotes and parentheses
-        simplified = simplified.replace('"',
-
+        simplified = simplified.replace('"', "").replace("(", "").replace(")", "")
+
         # Split by whitespace and join terms with 4+ chars (likely meaningful)
         terms = [term for term in simplified.split() if len(term) >= 4]
-
+
         # Join with AND to create a basic search
         return " ".join(terms[:5])  # Limit to top 5 terms
-
+
     def _expand_time_window(self, time_filter: str) -> str:
         """
         Expand a time window to get more results.
-
+
         Args:
             time_filter: Current time filter
-
+
         Returns:
             Expanded time filter
         """
         # Parse current time window
         import re
+
         match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
         if not match:
             return '"last 10 years"[pdat]'
-
+
         amount, unit = int(match.group(1)), match.group(2)
-
+
         # Expand based on current unit
         if unit == "months" or unit == "month":
             if amount < 6:
@@ -158,24 +164,24 @@ class PubMedSearchEngine(BaseSearchEngine):
                 return '"last 5 years"[pdat]'
             else:
                 return '"last 10 years"[pdat]'
-
+
         return '"last 10 years"[pdat]'
-
+
     def _optimize_query_for_pubmed(self, query: str) -> str:
         """
         Optimize a natural language query for PubMed search.
         Uses LLM to transform questions into effective keyword-based queries.
-
+
         Args:
             query: Natural language query
-
+
         Returns:
             Optimized query string for PubMed
         """
         if not self.llm or not self.optimize_queries:
             # Return original query if no LLM available or optimization disabled
             return query
-
+
         try:
             # Prompt for query optimization
             prompt = f"""Transform this natural language question into an optimized PubMed search query.
@@ -200,138 +206,194 @@ EXAMPLE QUERIES:
 
 Return ONLY the search query without any explanations.
 """
-
+
             # Get response from LLM
             response = self.llm.invoke(prompt)
             raw_response = response.content.strip()
-
+
             # Clean up the query - extract only the actual query and remove any explanations
             # First check if there are multiple lines and take the first non-empty line
-            lines = raw_response.split(
+            lines = raw_response.split("\n")
             cleaned_lines = [line.strip() for line in lines if line.strip()]
-
+
             if cleaned_lines:
                 optimized_query = cleaned_lines[0]
-
+
                 # Remove any quotes that wrap the entire query
                 if optimized_query.startswith('"') and optimized_query.endswith('"'):
                     optimized_query = optimized_query[1:-1]
-
+
                 # Remove any explanation phrases that might be at the beginning
-                explanation_starters = [
+                explanation_starters = [
+                    "here is",
+                    "here's",
+                    "this query",
+                    "the following",
+                ]
                 for starter in explanation_starters:
                     if optimized_query.lower().startswith(starter):
                         # Find the actual query part - typically after a colon
-                        colon_pos = optimized_query.find(
+                        colon_pos = optimized_query.find(":")
                         if colon_pos > 0:
-                            optimized_query = optimized_query[colon_pos + 1:].strip()
-
+                            optimized_query = optimized_query[colon_pos + 1 :].strip()
+
                 # Check if the query still seems to contain explanations
-                if
+                if (
+                    len(optimized_query) > 200
+                    or "this query will" in optimized_query.lower()
+                ):
                     # It's probably still an explanation - try to extract just the query part
                     # Look for common patterns in the explanation like parentheses
-                    pattern = r
+                    pattern = r"\([^)]+\)\s+AND\s+"
                     import re
+
                     matches = re.findall(pattern, optimized_query)
                     if matches:
                         # Extract just the query syntax parts
                         query_parts = []
-                        for part in re.split(r
-                            if
+                        for part in re.split(r"\.\s+", optimized_query):
+                            if (
+                                "(" in part
+                                and ")" in part
+                                and ("AND" in part or "OR" in part)
+                            ):
                                 query_parts.append(part)
                         if query_parts:
-                            optimized_query =
+                            optimized_query = " ".join(query_parts)
             else:
                 # Fall back to original query if cleaning fails
                 logger.warning("Failed to extract a clean query from LLM response")
                 optimized_query = query
-
+
             # Final safety check - if query looks too much like an explanation, use original
             if len(optimized_query.split()) > 30:
                 logger.warning("Query too verbose, falling back to simpler form")
                 # Create a simple query from the original
-                words = [
-
-
-
+                words = [
+                    w
+                    for w in query.split()
+                    if len(w) > 3
+                    and w.lower()
+                    not in (
+                        "what",
+                        "are",
+                        "the",
+                        "and",
+                        "for",
+                        "with",
+                        "from",
+                        "have",
+                        "been",
+                        "recent",
+                    )
+                ]
+                optimized_query = " AND ".join(words[:3])
+
+            # Safety check for invalid or overly complex MeSH terms
             # This helps prevent errors with non-existent or complex MeSH terms
             import re
+
             mesh_terms = re.findall(r'"[^"]+"[Mesh]', optimized_query)
-            known_valid_mesh = [
-
-
-
+            known_valid_mesh = [
+                "Vaccines",
+                "COVID-19",
+                "Influenza",
+                "Infectious Disease Medicine",
+                "Communicable Diseases",
+                "RNA, Messenger",
+                "Vaccination",
+                "Immunization",
+            ]
+
             # Replace potentially problematic MeSH terms with Title/Abstract searches
             for term in mesh_terms:
-                term_name = term.split('"')[
+                term_name = term.split('"')[
+                    1
+                ]  # Extract term name without quotes and [Mesh]
                 if not any(valid in term_name for valid in known_valid_mesh):
                     # Replace with Title/Abstract search
                     replacement = f"{term_name.lower()}[Title/Abstract]"
                     optimized_query = optimized_query.replace(term, replacement)
-
+
             # Simplify the query if still no results are found
             self._simplify_query_cache = optimized_query
-
+
             # Log original and optimized queries
-            logger.info(
+            logger.info("Original query: '%s'", query)
             logger.info(f"Optimized for PubMed: '{optimized_query}'")
-
+
             return optimized_query
-
+
         except Exception as e:
             logger.error(f"Error optimizing query: {e}")
             return query  # Fall back to original query on error
-
+
     def _simplify_query(self, query: str) -> str:
         """
         Simplify a PubMed query that returned no results.
         Progressively removes elements to get a more basic query.
-
+
         Args:
             query: The original query that returned no results
-
+
         Returns:
             Simplified query
         """
         logger.info(f"Simplifying query: {query}")
-
+
         # Attempt different simplification strategies
-
+
         # 1. Remove any MeSH terms and replace with Title/Abstract
         import re
-
-
+
+        simplified = re.sub(
+            r'"[^"]+"[Mesh]',
+            lambda m: m.group(0).split('"')[1].lower() + "[Title/Abstract]",
+            query,
+        )
+
         # 2. If that doesn't work, focus on just mRNA and vaccines - the core concepts
         if simplified == query:  # No changes were made
-            simplified =
-
+            simplified = '(mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]'
+
         logger.info(f"Simplified query: {simplified}")
         return simplified
-
+
     def _is_historical_focused(self, query: str) -> bool:
         """
         Determine if a query is specifically focused on historical/older information using LLM.
         Default assumption is that queries should prioritize recent information unless
         explicitly asking for historical content.
-
+
         Args:
             query: The search query
-
+
         Returns:
             Boolean indicating if the query is focused on historical information
         """
         if not self.llm:
             # Fall back to basic keyword check if no LLM available
-            historical_terms = [
-
+            historical_terms = [
+                "history",
+                "historical",
+                "early",
+                "initial",
+                "first",
+                "original",
+                "before",
+                "prior to",
+                "origins",
+                "evolution",
+                "development",
+            ]
             historical_years = [str(year) for year in range(1900, 2020)]
-
+
             query_lower = query.lower()
             has_historical_term = any(term in query_lower for term in historical_terms)
             has_past_year = any(year in query for year in historical_years)
-
+
             return has_historical_term or has_past_year
-
+
         try:
             # Use LLM to determine if the query is focused on historical information
             prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
@@ -343,40 +405,51 @@ Answer ONLY "no" if the query is asking about recent, current, or new informatio
 
 The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
 """
-
+
             response = self.llm.invoke(prompt)
             answer = response.content.strip().lower()
-
+
             # Log the determination
             logger.info(f"Historical focus determination for query: '{query}'")
             logger.info(f"LLM determined historical focus: {answer}")
-
+
             return "yes" in answer
-
+
         except Exception as e:
             logger.error(f"Error determining historical focus: {e}")
             # Fall back to basic keyword check
-            historical_terms = [
-
+            historical_terms = [
+                "history",
+                "historical",
+                "early",
+                "initial",
+                "first",
+                "original",
+                "before",
+                "prior to",
+                "origins",
+                "evolution",
+                "development",
+            ]
             return any(term in query.lower() for term in historical_terms)
-
+
     def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
         """
         Perform an adaptive search that adjusts based on topic volume and whether
         the query focuses on historical information.
-
+
         Args:
             query: The search query (already optimized)
-
+
         Returns:
             Tuple of (list of PMIDs, search strategy used)
         """
         # Estimate topic volume
         estimated_volume = self._get_result_count(query)
-
+
         # Determine if the query is focused on historical information
         is_historical_focused = self._is_historical_focused(query)
-
+
         if is_historical_focused:
             # User wants historical information - no date filtering
             time_filter = None
@@ -397,44 +470,52 @@ The default assumption should be that medical and scientific queries want RECENT
             # Rare topic - still use recency but with wider range
             time_filter = '"last 10 years"[pdat]'
             strategy = "rare_topic"
-
+
         # Run search based on strategy
         if time_filter:
             # Try with adaptive time filter
             query_with_time = f"({query}) AND {time_filter}"
-            logger.info(
+            logger.info(
+                f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
+            )
             results = self._search_pubmed(query_with_time)
-
+
             # If too few results, gradually expand time window
-            if len(results) < 5 and
-                logger.info(
+            if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
+                logger.info(
+                    f"Insufficient results ({len(results)}), expanding time window"
+                )
                 expanded_time = self._expand_time_window(time_filter)
                 query_with_expanded_time = f"({query}) AND {expanded_time}"
                 expanded_results = self._search_pubmed(query_with_expanded_time)
-
+
                 if len(expanded_results) > len(results):
-                    logger.info(
+                    logger.info(
+                        f"Expanded time window yielded {len(expanded_results)} results"
+                    )
                     return expanded_results, f"{strategy}_expanded"
-
+
             # If still no results, try without time filter
             if not results:
-                logger.info(
+                logger.info(
+                    "No results with time filter, trying without time restrictions"
+                )
                 results = self._search_pubmed(query)
                 strategy = "no_time_filter"
         else:
             # Historical query - run without time filter
-            logger.info(
+            logger.info("Using historical search strategy without date filtering")
             results = self._search_pubmed(query)
-
+
         return results, strategy
-
+
     def _search_pubmed(self, query: str) -> List[str]:
         """
         Search PubMed and return a list of article IDs.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of PubMed IDs matching the query
         """
@@ -445,76 +526,76 @@ The default assumption should be that medical and scientific queries want RECENT
                 "term": query,
                 "retmode": "json",
                 "retmax": self.max_results,
-                "usehistory": "y"
+                "usehistory": "y",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Add date restriction if specified
             if self.days_limit:
                 params["reldate"] = self.days_limit
                 params["datetype"] = "pdat"  # Publication date
-
+
             # Execute search request
             response = requests.get(self.search_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             id_list = data["esearchresult"]["idlist"]
-
+
             logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
             return id_list
-
+
         except Exception as e:
             logger.error(f"Error searching PubMed: {e}")
             return []
-
+
     def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
         """
         Get summaries for a list of PubMed article IDs.
-
+
         Args:
             id_list: List of PubMed IDs
-
+
         Returns:
             List of article summary dictionaries
         """
         if not id_list:
             return []
-
+
         try:
             # Prepare parameters
             params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "json",
-                "rettype": "summary"
+                "rettype": "summary",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.summary_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
             summaries = []
-
+
             for pmid in id_list:
                 if pmid in data["result"]:
                     article = data["result"][pmid]
-
+
                     # Extract authors (if available)
                     authors = []
                     if "authors" in article:
                         authors = [author["name"] for author in article["authors"]]
-
+
                     # Create summary dictionary
                     summary = {
                         "id": pmid,
@@ -524,73 +605,73 @@ The default assumption should be that medical and scientific queries want RECENT
                         "authors": authors,
                         "journal": article.get("fulljournalname", ""),
                         "doi": article.get("doi", ""),
-                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
+                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                     }
-
+
                     summaries.append(summary)
-
+
             return summaries
-
+
         except Exception as e:
             logger.error(f"Error getting article summaries: {e}")
             return []
-
+
     def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
         """
         Get abstracts for a list of PubMed article IDs.
-
+
         Args:
             id_list: List of PubMed IDs
-
+
         Returns:
             Dictionary mapping PubMed IDs to their abstracts
         """
         if not id_list:
             return {}
-
+
         try:
             # Prepare parameters
             params = {
                 "db": "pubmed",
                 "id": ",".join(id_list),
                 "retmode": "xml",
-                "rettype": "abstract"
+                "rettype": "abstract",
            }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.fetch_url, params=params)
             response.raise_for_status()
-
+
             # Parse XML response
             root = ET.fromstring(response.text)
-
+
             # Extract abstracts
             abstracts = {}
-
+
             for article in root.findall(".//PubmedArticle"):
                 pmid_elem = article.find(".//PMID")
                 pmid = pmid_elem.text if pmid_elem is not None else None
-
+
                 if pmid is None:
                     continue
-
+
                 # Find abstract text
                 abstract_text = ""
                 abstract_elem = article.find(".//AbstractText")
-
+
                 if abstract_elem is not None:
                     abstract_text = abstract_elem.text or ""
-
+
                 # Some abstracts are split into multiple sections
                 for section in article.findall(".//AbstractText"):
                     # Get section label if it exists
                     label = section.get("Label")
                     section_text = section.text or ""
-
+
                     if label and section_text:
                         if abstract_text:
                             abstract_text += f"\n\n{label}: {section_text}"
@@ -601,30 +682,30 @@ The default assumption should be that medical and scientific queries want RECENT
                             abstract_text += f"\n\n{section_text}"
                         else:
                             abstract_text = section_text
-
+
                 # Store in dictionary
                 if pmid and abstract_text:
                     abstracts[pmid] = abstract_text
-
+
             return abstracts
-
+
         except Exception as e:
             logger.error(f"Error getting article abstracts: {e}")
             return {}
-
+
     def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
         """
         Find PMC IDs for the given PubMed IDs (for full-text access).
-
+
         Args:
             pmid_list: List of PubMed IDs
-
+
         Returns:
             Dictionary mapping PubMed IDs to their PMC IDs (if available)
         """
         if not pmid_list or not self.get_full_text:
             return {}
-
+
         try:
             # Prepare parameters
             params = {
@@ -632,89 +713,84 @@ The default assumption should be that medical and scientific queries want RECENT
                 "db": "pmc",
                 "linkname": "pubmed_pmc",
                 "id": ",".join(pmid_list),
-                "retmode": "json"
+                "retmode": "json",
             }
-
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.link_url, params=params)
             response.raise_for_status()
-
+
             # Parse response
             data = response.json()
-
+
             # Map PubMed IDs to PMC IDs
             pmid_to_pmcid = {}
-
+
             for linkset in data.get("linksets", []):
                 pmid = linkset.get("ids", [None])[0]
-
+
                 if not pmid:
                     continue
-
+
                 for link in linkset.get("linksetdbs", []):
                     if link.get("linkname") == "pubmed_pmc":
                         pmcids = link.get("links", [])
                         if pmcids:
                             pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
-
+
             logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
             return pmid_to_pmcid
-
+
         except Exception as e:
             logger.error(f"Error finding PMC IDs: {e}")
             return {}
-
+
     def _get_pmc_full_text(self, pmcid: str) -> str:
         """
         Get full text for a PMC article.
-
+
         Args:
             pmcid: PMC ID of the article
-
+
         Returns:
             Full text content or empty string if not available
         """
         try:
             # Prepare parameters
-            params = {
-
-                "id": pmcid,
-                "retmode": "xml",
-                "rettype": "full"
-            }
-
+            params = {"db": "pmc", "id": pmcid, "retmode": "xml", "rettype": "full"}
+
             # Add API key if available
             if self.api_key:
                 params["api_key"] = self.api_key
-
+
             # Execute request
             response = requests.get(self.fetch_url, params=params)
             response.raise_for_status()
-
+
             # Parse XML response
             root = ET.fromstring(response.text)
-
+
             # Extract full text
             full_text = []
-
+
             # Extract article title
             title_elem = root.find(".//article-title")
             if title_elem is not None and title_elem.text:
                 full_text.append(f"# {title_elem.text}")
-
+
             # Extract abstract
             abstract_paras = root.findall(".//abstract//p")
             if abstract_paras:
                 full_text.append("\n## Abstract\n")
                 for p in abstract_paras:
-                    text =
+                    text = "".join(p.itertext())
                     if text:
                         full_text.append(text)
-
+
             # Extract body content
             body = root.find(".//body")
             if body is not None:
@@ -723,37 +799,37 @@ The default assumption should be that medical and scientific queries want RECENT
                     title = section.find(".//title")
                     if title is not None and title.text:
                         full_text.append(f"\n## {title.text}\n")
-
+
                     # Get paragraphs
                     for p in section.findall(".//p"):
-                        text =
+                        text = "".join(p.itertext())
                         if text:
                             full_text.append(text)
-
+
             return "\n\n".join(full_text)
-
+
         except Exception as e:
             logger.error(f"Error getting PMC full text: {e}")
             return ""
-
+
     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
         """
         Get preview information for PubMed articles.
-
+
         Args:
             query: The search query
-
+
         Returns:
             List of preview dictionaries
         """
         logger.info(f"Getting PubMed previews for query: {query}")
-
+
         # Optimize the query for PubMed if LLM is available
         optimized_query = self._optimize_query_for_pubmed(query)
-
+
         # Perform adaptive search
         pmid_list, strategy = self._adaptive_search(optimized_query)
-
+
         # If no results, try a simplified query
         if not pmid_list:
             logger.warning(f"No PubMed results found using strategy: {strategy}")
@@ -763,17 +839,17 @@ The default assumption should be that medical and scientific queries want RECENT
             pmid_list, strategy = self._adaptive_search(simplified_query)
             if pmid_list:
                 logger.info(f"Simplified query found {len(pmid_list)} results")
-
+
         if not pmid_list:
-            logger.warning(
+            logger.warning("No PubMed results found after query simplification")
             return []
-
+
         # Get article summaries
         summaries = self._get_article_summaries(pmid_list)
-
+
         # Rate limit compliance (NCBI allows 10 requests per second with an API key, 3 without)
         time.sleep(0.1 if self.api_key else 0.33)
-
+
         # Format as previews
         previews = []
         for summary in summaries:
@@ -782,7 +858,7 @@ The default assumption should be that medical and scientific queries want RECENT
             if len(authors_text) > 100:
                 # Truncate long author lists
                 authors_text = authors_text[:97] + "..."
-
+
             # Create preview with basic information
             preview = {
                 "id": summary["id"],
@@ -795,73 +871,81 @@ The default assumption should be that medical and scientific queries want RECENT
                 "doi": summary.get("doi", ""),
                 "source": "PubMed",
                 "_pmid": summary["id"],  # Store PMID for later use
-                "_search_strategy": strategy # Store search strategy for analytics
+                "_search_strategy": strategy,  # Store search strategy for analytics
             }
-
+
             previews.append(preview)
-
+
         logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
         return previews
-
-    def _get_full_content(
+
+    def _get_full_content(
+        self, relevant_items: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
         """
         Get full content for the relevant PubMed articles.
         Efficiently manages which content to retrieve (abstracts and/or full text).
-
+
         Args:
             relevant_items: List of relevant preview dictionaries
-
+
         Returns:
             List of result dictionaries with full content
         """
         # Check if we should add full content
-        if
+        if (
+            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
+            and search_config.SEARCH_SNIPPETS_ONLY
+        ):
             logger.info("Snippet-only mode, skipping full content retrieval")
             return relevant_items
-
+
         logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
-
+
         # Collect all PMIDs for relevant items
         pmids = []
         for item in relevant_items:
             if "_pmid" in item:
                 pmids.append(item["_pmid"])
-
+
         # Get abstracts if requested and PMIDs exist
         abstracts = {}
         if self.get_abstracts and pmids:
             abstracts = self._get_article_abstracts(pmids)
-
+
         # Find PMC IDs for full-text retrieval (if enabled)
         pmid_to_pmcid = {}
         if self.get_full_text and pmids:
             pmid_to_pmcid = self._find_pmc_ids(pmids)
-
+
         # Add content to results
         results = []
         for item in relevant_items:
            result = item.copy()
            pmid = item.get("_pmid", "")
-
+
            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]
-
+
                # Use abstract as content if no full text
                if pmid not in pmid_to_pmcid:
                    result["full_content"] = abstracts[pmid]
                    result["content"] = abstracts[pmid]
                    result["content_type"] = "abstract"
-
+
            # Add full text for a limited number of top articles
-            if (
-
-
-
+            if (
+                pmid in pmid_to_pmcid
+                and self.get_full_text
+                and len([r for r in results if r.get("content_type") == "full_text"])
+                < self.full_text_limit
+            ):
+
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)
-
+
                if full_text:
                    result["full_content"] = full_text
                    result["content"] = full_text
@@ -872,120 +956,128 @@ The default assumption should be that medical and scientific queries want RECENT
                     result["full_content"] = abstracts[pmid]
                     result["content"] = abstracts[pmid]
                     result["content_type"] = "abstract"
-
+
             # Remove temporary fields
             if "_pmid" in result:
                 del result["_pmid"]
             if "_search_strategy" in result:
                 del result["_search_strategy"]
-
+
             results.append(result)
-
+
         return results
-
-    def search_by_author(
+
+    def search_by_author(
+        self, author_name: str, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for articles by a specific author.
-
+
         Args:
             author_name: Name of the author
             max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of articles by the author
         """
         original_max_results = self.max_results
-
+
         try:
             if max_results:
                 self.max_results = max_results
-
+
             query = f"{author_name}[Author]"
             return self.run(query)
-
+
         finally:
             # Restore original value
             self.max_results = original_max_results
-
-    def search_by_journal(
+
+    def search_by_journal(
+        self, journal_name: str, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for articles in a specific journal.
-
+
         Args:
             journal_name: Name of the journal
             max_results: Maximum number of results (defaults to self.max_results)
-
+
         Returns:
             List of articles from the journal
         """
         original_max_results = self.max_results
-
+
         try:
             if max_results:
                 self.max_results = max_results
-
+
             query = f"{journal_name}[Journal]"
             return self.run(query)
-
+
         finally:
             # Restore original value
             self.max_results = original_max_results
-
-    def search_recent(
+
+    def search_recent(
+        self, query: str, days: int = 30, max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
         """
         Search for recent articles matching the query.
-
+
         Args:
             query: The search query
             days: Number of days to look back
             max_results: Maximum number of results (defaults to self.max_results)
-
+
        Returns:
            List of recent articles matching the query
        """
        original_max_results = self.max_results
        original_days_limit = self.days_limit
-
+
        try:
            if max_results:
                self.max_results = max_results
-
+
            # Set days limit for this search
            self.days_limit = days
-
+
            return self.run(query)
-
+
        finally:
            # Restore original values
            self.max_results = original_max_results
            self.days_limit = original_days_limit
-
-    def advanced_search(
+
+    def advanced_search(
+        self, terms: Dict[str, str], max_results: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
        """
        Perform an advanced search with field-specific terms.
-
+
        Args:
            terms: Dictionary mapping fields to search terms
                Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
            max_results: Maximum number of results (defaults to self.max_results)
-
+
        Returns:
            List of articles matching the advanced query
        """
        original_max_results = self.max_results
-
+
        try:
            if max_results:
                self.max_results = max_results
-
+
            # Build advanced query string
            query_parts = []
            for field, term in terms.items():
                query_parts.append(f"{term}[{field}]")
-
+
            query = " AND ".join(query_parts)
            return self.run(query)
-
+
        finally:
            # Restore original value
-            self.max_results = original_max_results
+            self.max_results = original_max_results