local-deep-research 0.1.13__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. local_deep_research/config.py +8 -8
  2. local_deep_research/defaults/search_engines.toml +39 -18
  3. local_deep_research/search_system.py +15 -9
  4. local_deep_research/utilties/enums.py +4 -4
  5. local_deep_research/web/app.py +3 -2
  6. local_deep_research/web_search_engines/engines/search_engine_arxiv.py +3 -5
  7. local_deep_research/web_search_engines/engines/search_engine_brave.py +3 -5
  8. local_deep_research/web_search_engines/engines/search_engine_ddg.py +4 -3
  9. local_deep_research/web_search_engines/engines/search_engine_github.py +2 -4
  10. local_deep_research/web_search_engines/engines/search_engine_google_pse.py +2 -4
  11. local_deep_research/web_search_engines/engines/search_engine_guardian.py +323 -78
  12. local_deep_research/web_search_engines/engines/search_engine_local_all.py +3 -5
  13. local_deep_research/web_search_engines/engines/search_engine_pubmed.py +3 -4
  14. local_deep_research/web_search_engines/engines/search_engine_searxng.py +3 -2
  15. local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +1128 -0
  16. local_deep_research/web_search_engines/engines/search_engine_serpapi.py +2 -4
  17. local_deep_research/web_search_engines/engines/search_engine_wayback.py +2 -4
  18. local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +2 -4
  19. local_deep_research/web_search_engines/search_engine_base.py +12 -4
  20. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/METADATA +1 -1
  21. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/RECORD +25 -25
  22. local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +0 -623
  23. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/WHEEL +0 -0
  24. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/entry_points.txt +0 -0
  25. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/licenses/LICENSE +0 -0
  26. {local_deep_research-0.1.13.dist-info → local_deep_research-0.1.14.dist-info}/top_level.txt +0 -0
@@ -1,623 +0,0 @@
- from typing import Dict, List, Any, Optional
- from langchain_core.language_models import BaseLLM
- import requests
- import logging
- import re
- import time
- from datetime import datetime
- import xml.etree.ElementTree as ET
- from urllib.parse import quote
-
- from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
- from local_deep_research import config
-
- # Setup logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- class MedRxivSearchEngine(BaseSearchEngine):
-     """medRxiv search engine implementation with two-phase approach"""
-
-     def __init__(self,
-                  max_results: int = 10,
-                  sort_by: str = "relevance_score",
-                  sort_order: str = "desc",
-                  include_full_text: bool = False,
-                  download_dir: Optional[str] = None,
-                  max_full_text: int = 1,
-                  llm: Optional[BaseLLM] = None,
-                  max_filtered_results: Optional[int] = None,
-                  days_limit: Optional[int] = None,
-                  optimize_queries: bool = True):
-         """
-         Initialize the medRxiv search engine.
-
-         Args:
-             max_results: Maximum number of search results
-             sort_by: Sorting criteria ('relevance_score', 'date', or 'date_posted')
-             sort_order: Sort order ('desc' or 'asc')
-             include_full_text: Whether to include full paper content in results (downloads PDF)
-             download_dir: Directory to download PDFs to (if include_full_text is True)
-             max_full_text: Maximum number of PDFs to download and process (default: 1)
-             llm: Language model for relevance filtering
-             max_filtered_results: Maximum number of results to keep after filtering
-             days_limit: Limit results to papers from the last N days
-         """
-         # Initialize the BaseSearchEngine with the LLM and max_filtered_results
-         super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
-         self.max_results = max_results
-         self.sort_by = sort_by
-         self.sort_order = sort_order
-         self.include_full_text = include_full_text
-         self.download_dir = download_dir
-         self.max_full_text = max_full_text
-         self.days_limit = days_limit
-         self.optimize_queries = optimize_queries
-
-         # medRxiv API endpoints
-         self.search_base_url = "https://api.biorxiv.org/covid19/{}/{{}}".format(
-             "desc" if sort_order.lower() == "desc" else "asc"
-         )
-         self.medrxiv_api_url = "https://api.biorxiv.org/details/medrxiv/{}/"
-
-         # medRxiv base URL for papers
-         self.medrxiv_base_url = "https://www.medrxiv.org/content/"
-
-     def _optimize_query_for_medrxiv(self, query: str) -> str:
-         """
-         Optimize a natural language query for medRxiv search.
-         Uses LLM to transform questions into effective keyword-based queries.
-
-         Args:
-             query: Natural language query
-
-         Returns:
-             Optimized query string for medRxiv
-         """
-         if not self.llm or not self.optimize_queries:
-             # Return original query if no LLM available or optimization disabled
-             return query
-
-         try:
-             # Prompt for query optimization
-             prompt = f"""Transform this natural language question into an optimized search query for medRxiv (a medical preprint server).
-
- Original query: "{query}"
-
- CRITICAL RULES:
- 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
- 2. Focus on clear medical terminology and keywords
- 3. Keep it concise but comprehensive (typically 2-5 key terms)
- 4. Include specific medical conditions, treatments, or methodologies
- 5. Use Boolean operators (AND, OR) when appropriate
- 6. Include common medical acronyms where relevant (e.g., COVID-19 instead of coronavirus disease)
- 7. Put multi-word phrases in quotes (e.g., "long covid")
- 8. Prioritize specific medical terms over general descriptions
-
- EXAMPLE CONVERSIONS:
- ✓ "what are the neurological effects of long COVID?" → "long covid" AND neurological OR "nervous system"
- ✓ "newest vaccine development for covid variants" → COVID-19 AND vaccine AND variant
- ✗ BAD: "Here's a query to find information about..."
- ✗ BAD: "The most effective search query would be..."
-
- Return ONLY the search query without any explanations.
- """
-
-             # Get response from LLM
-             response = self.llm.invoke(prompt)
-             optimized_query = response.content.strip()
-
-             # Clean up the query - remove any explanations
-             lines = optimized_query.split('\n')
-             for line in lines:
-                 line = line.strip()
-                 if line and not line.lower().startswith(('here', 'i would', 'the best', 'this query')):
-                     optimized_query = line
-                     break
-
-             # Remove any quotes that wrap the entire query
-             if optimized_query.startswith('"') and optimized_query.endswith('"'):
-                 optimized_query = optimized_query[1:-1]
-
-             logger.info(f"Original query: '{query}'")
-             logger.info(f"Optimized for medRxiv: '{optimized_query}'")
-
-             return optimized_query
-
-         except Exception as e:
-             logger.error(f"Error optimizing query: {e}")
-             return query  # Fall back to original query on error
-
-     def _search_medrxiv(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Search medRxiv using their API.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of paper dictionaries
-         """
-         results = []
-         cursor = 0
-         max_per_page = 50  # medRxiv API typically returns 50 results per page
-         total_fetched = 0
-
-         try:
-             # URL encode the query
-             encoded_query = quote(query)
-
-             # Format the URL based on sorting and query
-             url = self.search_base_url.format(encoded_query)
-
-             # Add time restriction if specified
-             if self.days_limit:
-                 # Calculate date range using days_limit
-                 from datetime import datetime, timedelta
-                 end_date = datetime.now()
-                 start_date = end_date - timedelta(days=self.days_limit)
-
-                 # Format dates for the API (YYYY-MM-DD)
-                 start_date_str = start_date.strftime("%Y-%m-%d")
-                 end_date_str = end_date.strftime("%Y-%m-%d")
-
-                 # Add date parameters to URL
-                 url += f"/{start_date_str}/{end_date_str}"
-                 logger.info(f"Using date range filter: {start_date_str} to {end_date_str}")
-
-             while total_fetched < self.max_results:
-                 # Add cursor to URL
-                 page_url = f"{url}/{cursor}"
-
-                 # Make the request
-                 logger.debug(f"Requesting: {page_url}")
-                 response = requests.get(page_url)
-                 if response.status_code != 200:
-                     logger.error(f"Error searching medRxiv: {response.status_code}")
-                     break
-
-                 data = response.json()
-
-                 # Check if we have results
-                 collection = data.get("collection", [])
-                 if not collection:
-                     break
-
-                 # Extract results
-                 for paper in collection:
-                     if paper.get("server") == "medRxiv":  # Ensure we're only getting medRxiv papers
-                         results.append(paper)
-                         total_fetched += 1
-
-                         if total_fetched >= self.max_results:
-                             break
-
-                 # Check if we should continue to next page
-                 if len(collection) < max_per_page or total_fetched >= self.max_results:
-                     break
-
-                 cursor += max_per_page
-                 time.sleep(0.5)  # Be respectful with API requests
-
-             logger.info(f"Found {len(results)} papers from medRxiv for query: {query}")
-             return results
-
-         except Exception as e:
-             logger.error(f"Error searching medRxiv: {e}")
-             return []
-
-     def _get_paper_details(self, doi: str) -> Dict[str, Any]:
-         """
-         Get detailed information about a specific paper using its DOI.
-
-         Args:
-             doi: Digital Object Identifier for the paper
-
-         Returns:
-             Dictionary with paper details
-         """
-         try:
-             # Format the DOI for the API
-             formatted_doi = doi.replace("10.1101/", "")
-
-             # Get paper details from the API
-             url = self.medrxiv_api_url.format(formatted_doi)
-             response = requests.get(url)
-
-             if response.status_code != 200:
-                 logger.error(f"Error getting paper details: {response.status_code}")
-                 return {}
-
-             data = response.json()
-
-             # Extract the paper details
-             collection = data.get("collection", [])
-             if not collection:
-                 return {}
-
-             return collection[0]
-
-         except Exception as e:
-             logger.error(f"Error getting paper details: {e}")
-             return {}
-
-     def _get_full_text_url(self, doi: str) -> Optional[str]:
-         """
-         Get the URL for the full text PDF of a paper.
-
-         Args:
-             doi: Digital Object Identifier for the paper
-
-         Returns:
-             URL to the PDF or None if not available
-         """
-         pdf_url = None
-
-         try:
-             # Format the DOI for the URL
-             formatted_doi = doi.replace("10.1101/", "")
-
-             # Construct the PDF URL
-             # Note: This is a typical pattern for medRxiv PDFs, but may need adjustment
-             pdf_url = f"https://www.medrxiv.org/content/10.1101/{formatted_doi}.full.pdf"
-
-             # Verify the URL is valid (optional)
-             response = requests.head(pdf_url)
-             if response.status_code != 200:
-                 logger.warning(f"PDF not available at {pdf_url}")
-                 return None
-
-             return pdf_url
-
-         except Exception as e:
-             logger.error(f"Error getting PDF URL: {e}")
-             return None
-
-     def _download_pdf(self, pdf_url: str, file_name: str) -> Optional[str]:
-         """
-         Download a PDF from a URL to the specified download directory.
-
-         Args:
-             pdf_url: URL to the PDF
-             file_name: Name to save the file as
-
-         Returns:
-             Path to the downloaded file or None if download failed
-         """
-         if not self.download_dir:
-             return None
-
-         import os
-
-         try:
-             # Create download directory if it doesn't exist
-             os.makedirs(self.download_dir, exist_ok=True)
-
-             # Clean the filename
-             safe_name = re.sub(r'[^\w\-_\.]', '_', file_name)
-             file_path = os.path.join(self.download_dir, safe_name)
-
-             # Download the file
-             response = requests.get(pdf_url, stream=True)
-             response.raise_for_status()
-
-             with open(file_path, 'wb') as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-
-             logger.info(f"Downloaded PDF to {file_path}")
-             return file_path
-
-         except Exception as e:
-             logger.error(f"Error downloading PDF: {e}")
-             return None
-
-     def _extract_text_from_pdf(self, pdf_path: str) -> str:
-         """
-         Extract text from a PDF file.
-
-         Args:
-             pdf_path: Path to the PDF file
-
-         Returns:
-             Extracted text or empty string if extraction failed
-         """
-         text = ""
-
-         try:
-             # First try PyPDF2
-             try:
-                 import PyPDF2
-                 with open(pdf_path, 'rb') as pdf_file:
-                     pdf_reader = PyPDF2.PdfReader(pdf_file)
-                     for page in pdf_reader.pages:
-                         text += page.extract_text() + "\n\n"
-             except (ImportError, Exception) as e1:
-                 # Fall back to pdfplumber
-                 try:
-                     import pdfplumber
-                     with pdfplumber.open(pdf_path) as pdf:
-                         for page in pdf.pages:
-                             text += page.extract_text() + "\n\n"
-                 except (ImportError, Exception) as e2:
-                     logger.error(f"PDF extraction failed with both methods: {e1}, then {e2}")
-                     return ""
-
-             return text
-
-         except Exception as e:
-             logger.error(f"Error extracting text from PDF: {e}")
-             return ""
-
-     def _get_previews(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Get preview information for medRxiv papers.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of preview dictionaries
-         """
-         logger.info(f"Getting medRxiv previews for query: {query}")
-
-         # Optimize the query for medRxiv if LLM is available
-         if self.optimize_queries and self.llm:
-             optimized_query = self._optimize_query_for_medrxiv(query)
-
-             # Store original and optimized queries for potential fallback
-             self._original_query = query
-             self._optimized_query = optimized_query
-
-             # Store for simplification if needed
-             self._simplify_query_cache = optimized_query
-
-             # Use the optimized query for adaptive search
-             papers, strategy = self._adaptive_search(optimized_query)
-         else:
-             # Use the original query directly with adaptive search
-             papers, strategy = self._adaptive_search(query)
-
-         # If no results, return empty list
-         if not papers:
-             logger.warning(f"No medRxiv results found using strategy: {strategy}")
-             return []
-
-         # Store the paper objects for later use
-         self._papers = {paper.get("doi"): paper for paper in papers}
-         self._search_strategy = strategy
-
-         # Format results as previews
-         previews = []
-         for paper in papers:
-             # Extract the data
-             doi = paper.get("doi", "")
-             title = paper.get("title", "")
-             abstract = paper.get("abstract", "")
-             authors = paper.get("authors", "")
-             date = paper.get("date", "")
-
-             # Create a preview
-             preview = {
-                 "id": doi,  # Use DOI as ID
-                 "title": title,
-                 "link": f"https://www.medrxiv.org/content/{doi}v1",
-                 "snippet": abstract[:250] + "..." if len(abstract) > 250 else abstract,
-                 "authors": authors.split("; ") if authors else [],
-                 "published": date,
-                 "doi": doi,
-                 "source": "medRxiv",
-                 "_search_strategy": strategy  # Store search strategy for analytics
-             }
-
-             previews.append(preview)
-
-         logger.info(f"Found {len(previews)} medRxiv previews using strategy: {strategy}")
-         return previews
-
-     def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-         """
-         Get full content for the relevant medRxiv papers.
-
-         Args:
-             relevant_items: List of relevant preview dictionaries
-
-         Returns:
-             List of result dictionaries with full content
-         """
-         # Check if we should get full content
-         if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
-             logger.info("Snippet-only mode, skipping full content retrieval")
-             return relevant_items
-
-         logger.info(f"Getting full content for {len(relevant_items)} medRxiv papers")
-
-         results = []
-         pdf_count = 0  # Track number of PDFs processed
-
-         for item in relevant_items:
-             # Start with the preview data
-             result = item.copy()
-
-             # Get the paper DOI
-             doi = item.get("id") or item.get("doi")
-
-             if not doi:
-                 results.append(result)
-                 continue
-
-             # Try to get the cached paper details
-             paper = None
-             if hasattr(self, '_papers') and doi in self._papers:
-                 paper = self._papers[doi]
-             else:
-                 # Get the paper details from the API
-                 paper = self._get_paper_details(doi)
-
-             if paper:
-                 # Update with more complete information
-                 result.update({
-                     "title": paper.get("title", result.get("title", "")),
-                     "authors": paper.get("authors", "").split("; ") if paper.get("authors") else result.get("authors", []),
-                     "published": paper.get("date", result.get("published", "")),
-                     "abstract": paper.get("abstract", ""),
-                     "doi": paper.get("doi", doi),
-                     "category": paper.get("category", ""),
-                     "journal": "medRxiv",  # It's a preprint server
-                     "version": paper.get("version", "1"),
-                     "type": paper.get("type", "new_result"),
-                 })
-
-                 # Use abstract as content by default
-                 result["content"] = paper.get("abstract", "")
-                 result["full_content"] = paper.get("abstract", "")
-
-             # Add search strategy if available
-             if "_search_strategy" in item:
-                 result["search_strategy"] = item["_search_strategy"]
-                 # Remove temporary field
-                 if "_search_strategy" in result:
-                     del result["_search_strategy"]
-
-             # Download PDF and extract text if requested and within limit
-             if (self.include_full_text and self.download_dir and
-                     pdf_count < self.max_full_text):
-                 try:
-                     # Get the PDF URL
-                     pdf_url = self._get_full_text_url(doi)
-
-                     if pdf_url:
-                         # Download the PDF
-                         pdf_count += 1
-                         safe_name = f"medrxiv_{doi.replace('/', '_')}.pdf"
-                         pdf_path = self._download_pdf(pdf_url, safe_name)
-
-                         if pdf_path:
-                             result["pdf_path"] = pdf_path
-
-                             # Extract text from PDF
-                             pdf_text = self._extract_text_from_pdf(pdf_path)
-
-                             if pdf_text:
-                                 result["content"] = pdf_text
-                                 result["full_content"] = pdf_text
-                                 result["content_type"] = "full_text"
-                             else:
-                                 result["content_type"] = "abstract"
-                 except Exception as e:
-                     logger.error(f"Error processing PDF for {doi}: {e}")
-                     result["content_type"] = "abstract"
-
-             results.append(result)
-
-         return results
-
-     def run(self, query: str) -> List[Dict[str, Any]]:
-         """
-         Execute a search using medRxiv with the two-phase approach.
-
-         Args:
-             query: The search query
-
-         Returns:
-             List of search results
-         """
-         logger.info(f"---Execute a search using medRxiv---")
-
-         # Use the implementation from the parent class which handles all phases
-         # _get_previews will handle query optimization and adaptive search
-         results = super().run(query)
-
-         # Clean up temporary variables
-         if hasattr(self, '_papers'):
-             del self._papers
-         if hasattr(self, '_original_query'):
-             del self._original_query
-         if hasattr(self, '_optimized_query'):
-             del self._optimized_query
-         if hasattr(self, '_simplify_query_cache'):
-             del self._simplify_query_cache
-         if hasattr(self, '_search_strategy'):
-             del self._search_strategy
-
-         return results
-
-     def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for papers by a specific author.
-
-         Args:
-             author_name: Name of the author
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of papers by the author
-         """
-         original_max_results = self.max_results
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             # medRxiv API doesn't have direct author search, so we include in query
-             query = f"author:{author_name}"
-             return self.run(query)
-
-         finally:
-             # Restore original value
-             self.max_results = original_max_results
-
-     def search_by_topic(self, topic: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for papers on a specific topic.
-
-         Args:
-             topic: Topic to search for
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of papers on the topic
-         """
-         original_max_results = self.max_results
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             return self.run(topic)
-
-         finally:
-             # Restore original value
-             self.max_results = original_max_results
-
-     def search_recent(self, days: int = 30, topic: Optional[str] = None, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
-         """
-         Search for recent papers, optionally filtered by topic.
-
-         Args:
-             days: Number of days to look back
-             topic: Optional topic filter
-             max_results: Maximum number of results (defaults to self.max_results)
-
-         Returns:
-             List of recent papers
-         """
-         original_max_results = self.max_results
-         original_days_limit = self.days_limit
-
-         try:
-             if max_results:
-                 self.max_results = max_results
-
-             # Set days limit for this search
-             self.days_limit = days
-
-             # If topic is provided, use it as query, otherwise use a broad query
-             query = topic if topic else "covid"  # Default to COVID which will have many papers
-             return self.run(query)
-
-         finally:
-             # Restore original values
-             self.max_results = original_max_results
-             self.days_limit = original_days_limit
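
The hunk above is the entire medRxiv engine removed in this release. For reference, a minimal sketch of how the class was driven on 0.1.13 (the import path below no longer resolves after this diff); the constructor values and query string are arbitrary examples, and no llm is passed, so the LLM-dependent query optimization and relevance filtering paths are skipped:

    from local_deep_research.web_search_engines.engines.search_engine_medrxiv import (
        MedRxivSearchEngine,
    )

    # No llm supplied: run() searches with the raw query and returns the
    # result dictionaries built in _get_previews / _get_full_content.
    engine = MedRxivSearchEngine(
        max_results=5,   # cap on papers collected from the API
        days_limit=90,   # only papers posted in the last 90 days
    )
    for paper in engine.run("long covid neurological symptoms"):
        print(paper["title"], paper["link"])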
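The engine's detail lookups went through the public bioRxiv/medRxiv details endpoint. Below is a standalone sketch of the request that _get_paper_details issued, mirroring the removed code's behavior of stripping the "10.1101/" prefix before substituting the DOI into medrxiv_api_url; the DOI is a hypothetical placeholder, and the timeout is an addition the original did not set:

    import requests

    doi = "10.1101/2024.01.01.24300001"  # hypothetical medRxiv DOI
    url = "https://api.biorxiv.org/details/medrxiv/{}/".format(doi.replace("10.1101/", ""))
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    collection = response.json().get("collection", [])
    paper = collection[0] if collection else {}
    print(paper.get("title", "no result"))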
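Full-text support rested on a two-library fallback: try PyPDF2 first, and if it is missing or fails, try pdfplumber. The same pattern from _extract_text_from_pdf, condensed into a standalone helper under the assumption that at least one of the two libraries is installed; pdf_to_text and its argument are illustrative names:

    def pdf_to_text(pdf_path: str) -> str:
        # First attempt: PyPDF2, as the removed engine tried first.
        try:
            from PyPDF2 import PdfReader
            return "\n\n".join(page.extract_text() or "" for page in PdfReader(pdf_path).pages)
        except Exception:
            # Fallback: pdfplumber, covering both ImportError and parse failures.
            import pdfplumber
            with pdfplumber.open(pdf_path) as pdf:
                return "\n\n".join(page.extract_text() or "" for page in pdf.pages)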