local-deep-research 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. local_deep_research/__init__.py +24 -0
  2. local_deep_research/citation_handler.py +113 -0
  3. local_deep_research/config.py +166 -0
  4. local_deep_research/defaults/__init__.py +44 -0
  5. local_deep_research/defaults/llm_config.py +269 -0
  6. local_deep_research/defaults/local_collections.toml +47 -0
  7. local_deep_research/defaults/main.toml +57 -0
  8. local_deep_research/defaults/search_engines.toml +244 -0
  9. local_deep_research/local_collections.py +141 -0
  10. local_deep_research/main.py +113 -0
  11. local_deep_research/report_generator.py +206 -0
  12. local_deep_research/search_system.py +241 -0
  13. local_deep_research/utilties/__init__.py +0 -0
  14. local_deep_research/utilties/enums.py +9 -0
  15. local_deep_research/utilties/llm_utils.py +116 -0
  16. local_deep_research/utilties/search_utilities.py +115 -0
  17. local_deep_research/utilties/setup_utils.py +6 -0
  18. local_deep_research/web/__init__.py +2 -0
  19. local_deep_research/web/app.py +1209 -0
  20. local_deep_research/web/static/css/styles.css +1008 -0
  21. local_deep_research/web/static/js/app.js +2078 -0
  22. local_deep_research/web/templates/api_keys_config.html +82 -0
  23. local_deep_research/web/templates/collections_config.html +90 -0
  24. local_deep_research/web/templates/index.html +312 -0
  25. local_deep_research/web/templates/llm_config.html +120 -0
  26. local_deep_research/web/templates/main_config.html +89 -0
  27. local_deep_research/web/templates/search_engines_config.html +154 -0
  28. local_deep_research/web/templates/settings.html +519 -0
  29. local_deep_research/web/templates/settings_dashboard.html +207 -0
  30. local_deep_research/web_search_engines/__init__.py +0 -0
  31. local_deep_research/web_search_engines/engines/__init__.py +0 -0
  32. local_deep_research/web_search_engines/engines/full_search.py +128 -0
  33. local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
  34. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
  35. local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
  36. local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
  37. local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
  38. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
  39. local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
  40. local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
  41. local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
  42. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
  43. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
  44. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
  45. local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
  46. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
  47. local_deep_research/web_search_engines/full_search.py +254 -0
  48. local_deep_research/web_search_engines/search_engine_base.py +197 -0
  49. local_deep_research/web_search_engines/search_engine_factory.py +233 -0
  50. local_deep_research/web_search_engines/search_engines_config.py +54 -0
  51. local_deep_research-0.1.0.dist-info/LICENSE +21 -0
  52. local_deep_research-0.1.0.dist-info/METADATA +328 -0
  53. local_deep_research-0.1.0.dist-info/RECORD +56 -0
  54. local_deep_research-0.1.0.dist-info/WHEEL +5 -0
  55. local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
  56. local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,992 @@
1
+ import requests
2
+ import logging
3
+ import xml.etree.ElementTree as ET
4
+ from typing import Dict, List, Any, Optional, Tuple
5
+ from langchain_core.language_models import BaseLLM
6
+ import time
7
+ import re
8
+ from datetime import datetime
9
+
10
+ from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
11
+ from local_deep_research import config
12
+
13
# Setup logging
# NOTE(review): configuring the root logger at import time affects the whole
# host application; libraries conventionally leave configuration to the caller
# and only create their own module logger — confirm this is intended.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
+ class PubMedSearchEngine(BaseSearchEngine):
18
+ """
19
+ PubMed search engine implementation with two-phase approach and adaptive search.
20
+ Provides efficient access to biomedical literature while minimizing API usage.
21
+ """
22
+
23
def __init__(self,
             max_results: int = 10,
             api_key: Optional[str] = None,
             days_limit: Optional[int] = None,
             get_abstracts: bool = True,
             get_full_text: bool = False,
             full_text_limit: int = 3,
             llm: Optional[BaseLLM] = None,
             max_filtered_results: Optional[int] = None,
             optimize_queries: bool = True):
    """
    Create a PubMed search engine instance.

    Args:
        max_results: Maximum number of search results
        api_key: NCBI API key for higher rate limits (optional)
        days_limit: Limit results to N days (optional)
        get_abstracts: Whether to fetch abstracts for all results
        get_full_text: Whether to fetch full text content (when available in PMC)
        full_text_limit: Max number of full-text articles to retrieve
        llm: Language model for relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries for PubMed
    """
    # Relevance filtering is handled by the base class
    super().__init__(llm=llm, max_filtered_results=max_filtered_results)

    # Search behaviour
    self.max_results = max_results
    self.api_key = api_key
    self.days_limit = days_limit
    self.optimize_queries = optimize_queries

    # Content retrieval behaviour
    self.get_abstracts = get_abstracts
    self.get_full_text = get_full_text
    self.full_text_limit = full_text_limit

    # NCBI E-utilities endpoints
    self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    self.search_url = f"{self.base_url}/esearch.fcgi"
    self.summary_url = f"{self.base_url}/esummary.fcgi"
    self.fetch_url = f"{self.base_url}/efetch.fcgi"
    self.link_url = f"{self.base_url}/elink.fcgi"

    # PMC base URL for full text
    self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
67
+
68
def _get_result_count(self, query: str) -> int:
    """
    Get the total number of results for a query without retrieving the
    results themselves (retmax=0 keeps the response tiny).

    Args:
        query: The search query

    Returns:
        Total number of matching results, or 0 on any error.
    """
    try:
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 0  # Don't need actual results, just the count
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.search_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        count = int(data["esearchresult"]["count"])

        logger.info(f"Query '{query}' has {count} total results in PubMed")
        return count

    except Exception as e:
        logger.error(f"Error getting result count: {e}")
        return 0
105
+
106
+ def _extract_core_terms(self, query: str) -> str:
107
+ """
108
+ Extract core terms from a complex query for volume estimation.
109
+
110
+ Args:
111
+ query: PubMed query string
112
+
113
+ Returns:
114
+ Simplified query with core terms
115
+ """
116
+ # Remove field specifications and operators
117
+ simplified = re.sub(r'\[\w+\]', '', query) # Remove [Field] tags
118
+ simplified = re.sub(r'\b(AND|OR|NOT)\b', '', simplified) # Remove operators
119
+
120
+ # Remove quotes and parentheses
121
+ simplified = simplified.replace('"', '').replace('(', '').replace(')', '')
122
+
123
+ # Split by whitespace and join terms with 4+ chars (likely meaningful)
124
+ terms = [term for term in simplified.split() if len(term) >= 4]
125
+
126
+ # Join with AND to create a basic search
127
+ return " ".join(terms[:5]) # Limit to top 5 terms
128
+
129
+ def _expand_time_window(self, time_filter: str) -> str:
130
+ """
131
+ Expand a time window to get more results.
132
+
133
+ Args:
134
+ time_filter: Current time filter
135
+
136
+ Returns:
137
+ Expanded time filter
138
+ """
139
+ # Parse current time window
140
+ import re
141
+ match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
142
+ if not match:
143
+ return '"last 10 years"[pdat]'
144
+
145
+ amount, unit = int(match.group(1)), match.group(2)
146
+
147
+ # Expand based on current unit
148
+ if unit == "months" or unit == "month":
149
+ if amount < 6:
150
+ return '"last 6 months"[pdat]'
151
+ elif amount < 12:
152
+ return '"last 1 year"[pdat]'
153
+ else:
154
+ return '"last 2 years"[pdat]'
155
+ elif unit == "years" or unit == "year":
156
+ if amount < 2:
157
+ return '"last 2 years"[pdat]'
158
+ elif amount < 5:
159
+ return '"last 5 years"[pdat]'
160
+ else:
161
+ return '"last 10 years"[pdat]'
162
+
163
+ return '"last 10 years"[pdat]'
164
+
165
+ def _optimize_query_for_pubmed(self, query: str) -> str:
166
+ """
167
+ Optimize a natural language query for PubMed search.
168
+ Uses LLM to transform questions into effective keyword-based queries.
169
+
170
+ Args:
171
+ query: Natural language query
172
+
173
+ Returns:
174
+ Optimized query string for PubMed
175
+ """
176
+ if not self.llm or not self.optimize_queries:
177
+ # Return original query if no LLM available or optimization disabled
178
+ return query
179
+
180
+ try:
181
+ # Prompt for query optimization
182
+ prompt = f"""Transform this natural language question into an optimized PubMed search query.
183
+
184
+ Original query: "{query}"
185
+
186
+ CRITICAL RULES:
187
+ 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
188
+ 2. DO NOT wrap the entire query in quotes
189
+ 3. DO NOT include ANY date restrictions or year filters
190
+ 4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
191
+ 5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
192
+ 6. KEEP IT SIMPLE - use 2-3 main concepts maximum
193
+ 7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
194
+ 8. Use wildcards for variations: vaccin*[Title/Abstract]
195
+
196
+ EXAMPLE QUERIES:
197
+ ✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
198
+ ✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
199
+ ✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
200
+ ✗ BAD: "Here's a query to find articles about vaccines..."
201
+
202
+ Return ONLY the search query without any explanations.
203
+ """
204
+
205
+ # Get response from LLM
206
+ response = self.llm.invoke(prompt)
207
+ raw_response = response.content.strip()
208
+
209
+ # Clean up the query - extract only the actual query and remove any explanations
210
+ # First check if there are multiple lines and take the first non-empty line
211
+ lines = raw_response.split('\n')
212
+ cleaned_lines = [line.strip() for line in lines if line.strip()]
213
+
214
+ if cleaned_lines:
215
+ optimized_query = cleaned_lines[0]
216
+
217
+ # Remove any quotes that wrap the entire query
218
+ if optimized_query.startswith('"') and optimized_query.endswith('"'):
219
+ optimized_query = optimized_query[1:-1]
220
+
221
+ # Remove any explanation phrases that might be at the beginning
222
+ explanation_starters = ["here is", "here's", "this query", "the following"]
223
+ for starter in explanation_starters:
224
+ if optimized_query.lower().startswith(starter):
225
+ # Find the actual query part - typically after a colon
226
+ colon_pos = optimized_query.find(':')
227
+ if colon_pos > 0:
228
+ optimized_query = optimized_query[colon_pos + 1:].strip()
229
+
230
+ # Check if the query still seems to contain explanations
231
+ if len(optimized_query) > 200 or "this query will" in optimized_query.lower():
232
+ # It's probably still an explanation - try to extract just the query part
233
+ # Look for common patterns in the explanation like parentheses
234
+ pattern = r'\([^)]+\)\s+AND\s+'
235
+ import re
236
+ matches = re.findall(pattern, optimized_query)
237
+ if matches:
238
+ # Extract just the query syntax parts
239
+ query_parts = []
240
+ for part in re.split(r'\.\s+', optimized_query):
241
+ if '(' in part and ')' in part and ('AND' in part or 'OR' in part):
242
+ query_parts.append(part)
243
+ if query_parts:
244
+ optimized_query = ' '.join(query_parts)
245
+ else:
246
+ # Fall back to original query if cleaning fails
247
+ logger.warning("Failed to extract a clean query from LLM response")
248
+ optimized_query = query
249
+
250
+ # Final safety check - if query looks too much like an explanation, use original
251
+ if len(optimized_query.split()) > 30:
252
+ logger.warning("Query too verbose, falling back to simpler form")
253
+ # Create a simple query from the original
254
+ words = [w for w in query.split() if len(w) > 3 and w.lower() not in ('what', 'are', 'the', 'and', 'for', 'with', 'from', 'have', 'been', 'recent')]
255
+ optimized_query = ' AND '.join(words[:3])
256
+
257
+ # Safety check for invalid or overly complex MeSH terms
258
+ # This helps prevent errors with non-existent or complex MeSH terms
259
+ import re
260
+ mesh_terms = re.findall(r'"[^"]+"[Mesh]', optimized_query)
261
+ known_valid_mesh = ["Vaccines", "COVID-19", "Influenza", "Infectious Disease Medicine",
262
+ "Communicable Diseases", "RNA, Messenger", "Vaccination",
263
+ "Immunization"]
264
+
265
+ # Replace potentially problematic MeSH terms with Title/Abstract searches
266
+ for term in mesh_terms:
267
+ term_name = term.split('"')[1] # Extract term name without quotes and [Mesh]
268
+ if not any(valid in term_name for valid in known_valid_mesh):
269
+ # Replace with Title/Abstract search
270
+ replacement = f"{term_name.lower()}[Title/Abstract]"
271
+ optimized_query = optimized_query.replace(term, replacement)
272
+
273
+ # Simplify the query if still no results are found
274
+ self._simplify_query_cache = optimized_query
275
+
276
+ # Log original and optimized queries
277
+ logger.info(f"Original query: '{query}'")
278
+ logger.info(f"Optimized for PubMed: '{optimized_query}'")
279
+
280
+ return optimized_query
281
+
282
+ except Exception as e:
283
+ logger.error(f"Error optimizing query: {e}")
284
+ return query # Fall back to original query on error
285
+
286
+ def _simplify_query(self, query: str) -> str:
287
+ """
288
+ Simplify a PubMed query that returned no results.
289
+ Progressively removes elements to get a more basic query.
290
+
291
+ Args:
292
+ query: The original query that returned no results
293
+
294
+ Returns:
295
+ Simplified query
296
+ """
297
+ logger.info(f"Simplifying query: {query}")
298
+
299
+ # Attempt different simplification strategies
300
+
301
+ # 1. Remove any MeSH terms and replace with Title/Abstract
302
+ import re
303
+ simplified = re.sub(r'"[^"]+"[Mesh]', lambda m: m.group(0).split('"')[1].lower() + "[Title/Abstract]", query)
304
+
305
+ # 2. If that doesn't work, focus on just mRNA and vaccines - the core concepts
306
+ if simplified == query: # No changes were made
307
+ simplified = "(mRNA[Title/Abstract] OR \"messenger RNA\"[Title/Abstract]) AND vaccin*[Title/Abstract]"
308
+
309
+ logger.info(f"Simplified query: {simplified}")
310
+ return simplified
311
+
312
+ def _is_historical_focused(self, query: str) -> bool:
313
+ """
314
+ Determine if a query is specifically focused on historical/older information using LLM.
315
+ Default assumption is that queries should prioritize recent information unless
316
+ explicitly asking for historical content.
317
+
318
+ Args:
319
+ query: The search query
320
+
321
+ Returns:
322
+ Boolean indicating if the query is focused on historical information
323
+ """
324
+ if not self.llm:
325
+ # Fall back to basic keyword check if no LLM available
326
+ historical_terms = ["history", "historical", "early", "initial", "first", "original",
327
+ "before", "prior to", "origins", "evolution", "development"]
328
+ historical_years = [str(year) for year in range(1900, 2020)]
329
+
330
+ query_lower = query.lower()
331
+ has_historical_term = any(term in query_lower for term in historical_terms)
332
+ has_past_year = any(year in query for year in historical_years)
333
+
334
+ return has_historical_term or has_past_year
335
+
336
+ try:
337
+ # Use LLM to determine if the query is focused on historical information
338
+ prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
339
+
340
+ Query: "{query}"
341
+
342
+ Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
343
+ Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
344
+
345
+ The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
346
+ """
347
+
348
+ response = self.llm.invoke(prompt)
349
+ answer = response.content.strip().lower()
350
+
351
+ # Log the determination
352
+ logger.info(f"Historical focus determination for query: '{query}'")
353
+ logger.info(f"LLM determined historical focus: {answer}")
354
+
355
+ return "yes" in answer
356
+
357
+ except Exception as e:
358
+ logger.error(f"Error determining historical focus: {e}")
359
+ # Fall back to basic keyword check
360
+ historical_terms = ["history", "historical", "early", "initial", "first", "original",
361
+ "before", "prior to", "origins", "evolution", "development"]
362
+ return any(term in query.lower() for term in historical_terms)
363
+
364
+ def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
365
+ """
366
+ Perform an adaptive search that adjusts based on topic volume and whether
367
+ the query focuses on historical information.
368
+
369
+ Args:
370
+ query: The search query (already optimized)
371
+
372
+ Returns:
373
+ Tuple of (list of PMIDs, search strategy used)
374
+ """
375
+ # Estimate topic volume
376
+ estimated_volume = self._get_result_count(query)
377
+
378
+ # Determine if the query is focused on historical information
379
+ is_historical_focused = self._is_historical_focused(query)
380
+
381
+ if is_historical_focused:
382
+ # User wants historical information - no date filtering
383
+ time_filter = None
384
+ strategy = "historical_focus"
385
+ elif estimated_volume > 5000:
386
+ # Very common topic - use tighter recency filter
387
+ time_filter = '"last 1 year"[pdat]'
388
+ strategy = "high_volume"
389
+ elif estimated_volume > 1000:
390
+ # Common topic
391
+ time_filter = '"last 3 years"[pdat]'
392
+ strategy = "common_topic"
393
+ elif estimated_volume > 100:
394
+ # Moderate volume
395
+ time_filter = '"last 5 years"[pdat]'
396
+ strategy = "moderate_volume"
397
+ else:
398
+ # Rare topic - still use recency but with wider range
399
+ time_filter = '"last 10 years"[pdat]'
400
+ strategy = "rare_topic"
401
+
402
+ # Run search based on strategy
403
+ if time_filter:
404
+ # Try with adaptive time filter
405
+ query_with_time = f"({query}) AND {time_filter}"
406
+ logger.info(f"Using adaptive search strategy: {strategy} with filter: {time_filter}")
407
+ results = self._search_pubmed(query_with_time)
408
+
409
+ # If too few results, gradually expand time window
410
+ if len(results) < 5 and not '"last 10 years"[pdat]' in time_filter:
411
+ logger.info(f"Insufficient results ({len(results)}), expanding time window")
412
+ expanded_time = self._expand_time_window(time_filter)
413
+ query_with_expanded_time = f"({query}) AND {expanded_time}"
414
+ expanded_results = self._search_pubmed(query_with_expanded_time)
415
+
416
+ if len(expanded_results) > len(results):
417
+ logger.info(f"Expanded time window yielded {len(expanded_results)} results")
418
+ return expanded_results, f"{strategy}_expanded"
419
+
420
+ # If still no results, try without time filter
421
+ if not results:
422
+ logger.info("No results with time filter, trying without time restrictions")
423
+ results = self._search_pubmed(query)
424
+ strategy = "no_time_filter"
425
+ else:
426
+ # Historical query - run without time filter
427
+ logger.info(f"Using historical search strategy without date filtering")
428
+ results = self._search_pubmed(query)
429
+
430
+ return results, strategy
431
+
432
def _search_pubmed(self, query: str) -> List[str]:
    """
    Search PubMed and return a list of article IDs.

    Args:
        query: The search query

    Returns:
        List of PubMed IDs matching the query (empty on error)
    """
    try:
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": self.max_results,
            "usehistory": "y"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # Add date restriction if specified
        if self.days_limit:
            params["reldate"] = self.days_limit
            params["datetype"] = "pdat"  # Publication date

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.search_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        id_list = data["esearchresult"]["idlist"]

        logger.info(f"PubMed search for '{query}' found {len(id_list)} results")
        return id_list

    except Exception as e:
        logger.error(f"Error searching PubMed: {e}")
        return []
475
+
476
def _get_article_summaries(self, id_list: List[str]) -> List[Dict[str, Any]]:
    """
    Get summaries for a list of PubMed article IDs via ESummary.

    Args:
        id_list: List of PubMed IDs

    Returns:
        List of article summary dictionaries (empty on error or empty input)
    """
    if not id_list:
        return []

    try:
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "json",
            "rettype": "summary"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.summary_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()
        summaries = []

        # Preserve the requested order; skip IDs absent from the response
        for pmid in id_list:
            if pmid in data["result"]:
                article = data["result"][pmid]

                # Extract authors (if available)
                authors = []
                if "authors" in article:
                    authors = [author["name"] for author in article["authors"]]

                summaries.append({
                    "id": pmid,
                    "title": article.get("title", ""),
                    "pubdate": article.get("pubdate", ""),
                    "source": article.get("source", ""),
                    "authors": authors,
                    "journal": article.get("fulljournalname", ""),
                    "doi": article.get("doi", ""),
                    "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
                })

        return summaries

    except Exception as e:
        logger.error(f"Error getting article summaries: {e}")
        return []
538
+
539
def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
    """
    Get abstracts for a list of PubMed article IDs via EFetch (XML).

    BUG FIX: the original implementation seeded the abstract with the
    first <AbstractText> node and then appended EVERY <AbstractText>
    node again in a loop, so the first section always appeared twice.
    The abstract is now assembled from the section loop alone.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their abstracts (articles with
        no abstract text are omitted; empty dict on error).
    """
    if not id_list:
        return {}

    try:
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "abstract"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.fetch_url, params=params, timeout=30)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)
        abstracts = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            # Assemble the abstract from its sections; structured
            # abstracts carry a Label attribute per section.
            abstract_text = ""
            for section in article.findall(".//AbstractText"):
                label = section.get("Label")
                section_text = section.text or ""
                if not section_text:
                    continue
                piece = f"{label}: {section_text}" if label else section_text
                abstract_text = f"{abstract_text}\n\n{piece}" if abstract_text else piece

            if abstract_text:
                abstracts[pmid] = abstract_text

        return abstracts

    except Exception as e:
        logger.error(f"Error getting article abstracts: {e}")
        return {}
615
+
616
def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
    """
    Find PMC IDs for the given PubMed IDs (for full-text access) via ELink.

    Args:
        pmid_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their PMC IDs (if available);
        empty when full-text retrieval is disabled or on error.
    """
    if not pmid_list or not self.get_full_text:
        return {}

    try:
        params = {
            "dbfrom": "pubmed",
            "db": "pmc",
            "linkname": "pubmed_pmc",
            "id": ",".join(pmid_list),
            "retmode": "json"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.link_url, params=params, timeout=30)
        response.raise_for_status()

        data = response.json()

        # Map PubMed IDs to PMC IDs
        pmid_to_pmcid = {}

        for linkset in data.get("linksets", []):
            pmid = linkset.get("ids", [None])[0]

            if not pmid:
                continue

            for link in linkset.get("linksetdbs", []):
                if link.get("linkname") == "pubmed_pmc":
                    pmcids = link.get("links", [])
                    if pmcids:
                        # Keep the first linked PMC record for this PMID
                        pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"

        logger.info(f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access")
        return pmid_to_pmcid

    except Exception as e:
        logger.error(f"Error finding PMC IDs: {e}")
        return {}
671
+
672
def _get_pmc_full_text(self, pmcid: str) -> str:
    """
    Get full text for a PMC article via EFetch and flatten it to a
    Markdown-like string (title heading, abstract, body sections).

    Args:
        pmcid: PMC ID of the article (e.g. "PMC1234567")

    Returns:
        Full text content, or an empty string on error / when unavailable.
    """
    try:
        params = {
            "db": "pmc",
            "id": pmcid,
            "retmode": "xml",
            "rettype": "full"
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        # FIX: bounded timeout so a stalled NCBI endpoint cannot hang the
        # caller forever (requests has no default timeout).
        response = requests.get(self.fetch_url, params=params, timeout=30)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)
        full_text = []

        # Article title becomes the top-level heading
        title_elem = root.find(".//article-title")
        if title_elem is not None and title_elem.text:
            full_text.append(f"# {title_elem.text}")

        # Abstract paragraphs (itertext flattens inline markup)
        abstract_paras = root.findall(".//abstract//p")
        if abstract_paras:
            full_text.append("\n## Abstract\n")
            for p in abstract_paras:
                text = ''.join(p.itertext())
                if text:
                    full_text.append(text)

        # Body sections with their titles
        body = root.find(".//body")
        if body is not None:
            for section in body.findall(".//sec"):
                title = section.find(".//title")
                if title is not None and title.text:
                    full_text.append(f"\n## {title.text}\n")

                for p in section.findall(".//p"):
                    text = ''.join(p.itertext())
                    if text:
                        full_text.append(text)

        return "\n\n".join(full_text)

    except Exception as e:
        logger.error(f"Error getting PMC full text: {e}")
        return ""
739
+
740
+ def _get_previews(self, query: str) -> List[Dict[str, Any]]:
741
+ """
742
+ Get preview information for PubMed articles.
743
+
744
+ Args:
745
+ query: The search query
746
+
747
+ Returns:
748
+ List of preview dictionaries
749
+ """
750
+ logger.info(f"Getting PubMed previews for query: {query}")
751
+
752
+ # Optimize the query for PubMed if LLM is available
753
+ optimized_query = self._optimize_query_for_pubmed(query)
754
+
755
+ # Perform adaptive search
756
+ pmid_list, strategy = self._adaptive_search(optimized_query)
757
+
758
+ # If no results, try a simplified query
759
+ if not pmid_list:
760
+ logger.warning(f"No PubMed results found using strategy: {strategy}")
761
+ simplified_query = self._simplify_query(optimized_query)
762
+ if simplified_query != optimized_query:
763
+ logger.info(f"Trying with simplified query: {simplified_query}")
764
+ pmid_list, strategy = self._adaptive_search(simplified_query)
765
+ if pmid_list:
766
+ logger.info(f"Simplified query found {len(pmid_list)} results")
767
+
768
+ if not pmid_list:
769
+ logger.warning(f"No PubMed results found after query simplification")
770
+ return []
771
+
772
+ # Get article summaries
773
+ summaries = self._get_article_summaries(pmid_list)
774
+
775
+ # Rate limit compliance (NCBI allows 10 requests per second with an API key, 3 without)
776
+ time.sleep(0.1 if self.api_key else 0.33)
777
+
778
+ # Format as previews
779
+ previews = []
780
+ for summary in summaries:
781
+ # Authors formatting
782
+ authors_text = ", ".join(summary.get("authors", []))
783
+ if len(authors_text) > 100:
784
+ # Truncate long author lists
785
+ authors_text = authors_text[:97] + "..."
786
+
787
+ # Create preview with basic information
788
+ preview = {
789
+ "id": summary["id"],
790
+ "title": summary["title"],
791
+ "link": summary["link"],
792
+ "snippet": f"{authors_text}. {summary.get('journal', '')}. {summary.get('pubdate', '')}",
793
+ "authors": summary.get("authors", []),
794
+ "journal": summary.get("journal", ""),
795
+ "pubdate": summary.get("pubdate", ""),
796
+ "doi": summary.get("doi", ""),
797
+ "source": "PubMed",
798
+ "_pmid": summary["id"], # Store PMID for later use
799
+ "_search_strategy": strategy # Store search strategy for analytics
800
+ }
801
+
802
+ previews.append(preview)
803
+
804
+ logger.info(f"Found {len(previews)} PubMed previews using strategy: {strategy}")
805
+ return previews
806
+
807
+ def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
808
+ """
809
+ Get full content for the relevant PubMed articles.
810
+ Efficiently manages which content to retrieve (abstracts and/or full text).
811
+
812
+ Args:
813
+ relevant_items: List of relevant preview dictionaries
814
+
815
+ Returns:
816
+ List of result dictionaries with full content
817
+ """
818
+ # Check if we should add full content
819
+ if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
820
+ logger.info("Snippet-only mode, skipping full content retrieval")
821
+ return relevant_items
822
+
823
+ logger.info(f"Getting content for {len(relevant_items)} PubMed articles")
824
+
825
+ # Collect all PMIDs for relevant items
826
+ pmids = []
827
+ for item in relevant_items:
828
+ if "_pmid" in item:
829
+ pmids.append(item["_pmid"])
830
+
831
+ # Get abstracts if requested and PMIDs exist
832
+ abstracts = {}
833
+ if self.get_abstracts and pmids:
834
+ abstracts = self._get_article_abstracts(pmids)
835
+
836
+ # Find PMC IDs for full-text retrieval (if enabled)
837
+ pmid_to_pmcid = {}
838
+ if self.get_full_text and pmids:
839
+ pmid_to_pmcid = self._find_pmc_ids(pmids)
840
+
841
+ # Add content to results
842
+ results = []
843
+ for item in relevant_items:
844
+ result = item.copy()
845
+ pmid = item.get("_pmid", "")
846
+
847
+ # Add abstract if available
848
+ if pmid in abstracts:
849
+ result["abstract"] = abstracts[pmid]
850
+
851
+ # Use abstract as content if no full text
852
+ if pmid not in pmid_to_pmcid:
853
+ result["full_content"] = abstracts[pmid]
854
+ result["content"] = abstracts[pmid]
855
+ result["content_type"] = "abstract"
856
+
857
+ # Add full text for a limited number of top articles
858
+ if (pmid in pmid_to_pmcid and
859
+ self.get_full_text and
860
+ len([r for r in results if r.get("content_type") == "full_text"]) < self.full_text_limit):
861
+
862
+ # Get full text content
863
+ pmcid = pmid_to_pmcid[pmid]
864
+ full_text = self._get_pmc_full_text(pmcid)
865
+
866
+ if full_text:
867
+ result["full_content"] = full_text
868
+ result["content"] = full_text
869
+ result["content_type"] = "full_text"
870
+ result["pmcid"] = pmcid
871
+ elif pmid in abstracts:
872
+ # Fall back to abstract if full text retrieval fails
873
+ result["full_content"] = abstracts[pmid]
874
+ result["content"] = abstracts[pmid]
875
+ result["content_type"] = "abstract"
876
+
877
+ # Remove temporary fields
878
+ if "_pmid" in result:
879
+ del result["_pmid"]
880
+ if "_search_strategy" in result:
881
+ del result["_search_strategy"]
882
+
883
+ results.append(result)
884
+
885
+ return results
886
+
887
+ def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
888
+ """
889
+ Search for articles by a specific author.
890
+
891
+ Args:
892
+ author_name: Name of the author
893
+ max_results: Maximum number of results (defaults to self.max_results)
894
+
895
+ Returns:
896
+ List of articles by the author
897
+ """
898
+ original_max_results = self.max_results
899
+
900
+ try:
901
+ if max_results:
902
+ self.max_results = max_results
903
+
904
+ query = f"{author_name}[Author]"
905
+ return self.run(query)
906
+
907
+ finally:
908
+ # Restore original value
909
+ self.max_results = original_max_results
910
+
911
+ def search_by_journal(self, journal_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
912
+ """
913
+ Search for articles in a specific journal.
914
+
915
+ Args:
916
+ journal_name: Name of the journal
917
+ max_results: Maximum number of results (defaults to self.max_results)
918
+
919
+ Returns:
920
+ List of articles from the journal
921
+ """
922
+ original_max_results = self.max_results
923
+
924
+ try:
925
+ if max_results:
926
+ self.max_results = max_results
927
+
928
+ query = f"{journal_name}[Journal]"
929
+ return self.run(query)
930
+
931
+ finally:
932
+ # Restore original value
933
+ self.max_results = original_max_results
934
+
935
+ def search_recent(self, query: str, days: int = 30, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
936
+ """
937
+ Search for recent articles matching the query.
938
+
939
+ Args:
940
+ query: The search query
941
+ days: Number of days to look back
942
+ max_results: Maximum number of results (defaults to self.max_results)
943
+
944
+ Returns:
945
+ List of recent articles matching the query
946
+ """
947
+ original_max_results = self.max_results
948
+ original_days_limit = self.days_limit
949
+
950
+ try:
951
+ if max_results:
952
+ self.max_results = max_results
953
+
954
+ # Set days limit for this search
955
+ self.days_limit = days
956
+
957
+ return self.run(query)
958
+
959
+ finally:
960
+ # Restore original values
961
+ self.max_results = original_max_results
962
+ self.days_limit = original_days_limit
963
+
964
+ def advanced_search(self, terms: Dict[str, str], max_results: Optional[int] = None) -> List[Dict[str, Any]]:
965
+ """
966
+ Perform an advanced search with field-specific terms.
967
+
968
+ Args:
969
+ terms: Dictionary mapping fields to search terms
970
+ Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
971
+ max_results: Maximum number of results (defaults to self.max_results)
972
+
973
+ Returns:
974
+ List of articles matching the advanced query
975
+ """
976
+ original_max_results = self.max_results
977
+
978
+ try:
979
+ if max_results:
980
+ self.max_results = max_results
981
+
982
+ # Build advanced query string
983
+ query_parts = []
984
+ for field, term in terms.items():
985
+ query_parts.append(f"{term}[{field}]")
986
+
987
+ query = " AND ".join(query_parts)
988
+ return self.run(query)
989
+
990
+ finally:
991
+ # Restore original value
992
+ self.max_results = original_max_results