local_deep_research-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- local_deep_research/__init__.py +24 -0
- local_deep_research/citation_handler.py +113 -0
- local_deep_research/config.py +166 -0
- local_deep_research/defaults/__init__.py +44 -0
- local_deep_research/defaults/llm_config.py +269 -0
- local_deep_research/defaults/local_collections.toml +47 -0
- local_deep_research/defaults/main.toml +57 -0
- local_deep_research/defaults/search_engines.toml +244 -0
- local_deep_research/local_collections.py +141 -0
- local_deep_research/main.py +113 -0
- local_deep_research/report_generator.py +206 -0
- local_deep_research/search_system.py +241 -0
- local_deep_research/utilties/__init__.py +0 -0
- local_deep_research/utilties/enums.py +9 -0
- local_deep_research/utilties/llm_utils.py +116 -0
- local_deep_research/utilties/search_utilities.py +115 -0
- local_deep_research/utilties/setup_utils.py +6 -0
- local_deep_research/web/__init__.py +2 -0
- local_deep_research/web/app.py +1209 -0
- local_deep_research/web/static/css/styles.css +1008 -0
- local_deep_research/web/static/js/app.js +2078 -0
- local_deep_research/web/templates/api_keys_config.html +82 -0
- local_deep_research/web/templates/collections_config.html +90 -0
- local_deep_research/web/templates/index.html +312 -0
- local_deep_research/web/templates/llm_config.html +120 -0
- local_deep_research/web/templates/main_config.html +89 -0
- local_deep_research/web/templates/search_engines_config.html +154 -0
- local_deep_research/web/templates/settings.html +519 -0
- local_deep_research/web/templates/settings_dashboard.html +207 -0
- local_deep_research/web_search_engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/__init__.py +0 -0
- local_deep_research/web_search_engines/engines/full_search.py +128 -0
- local_deep_research/web_search_engines/engines/meta_search_engine.py +274 -0
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +367 -0
- local_deep_research/web_search_engines/engines/search_engine_brave.py +245 -0
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +123 -0
- local_deep_research/web_search_engines/engines/search_engine_github.py +663 -0
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +283 -0
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +337 -0
- local_deep_research/web_search_engines/engines/search_engine_local.py +901 -0
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +153 -0
- local_deep_research/web_search_engines/engines/search_engine_medrxiv.py +623 -0
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +992 -0
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +230 -0
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +474 -0
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +242 -0
- local_deep_research/web_search_engines/full_search.py +254 -0
- local_deep_research/web_search_engines/search_engine_base.py +197 -0
- local_deep_research/web_search_engines/search_engine_factory.py +233 -0
- local_deep_research/web_search_engines/search_engines_config.py +54 -0
- local_deep_research-0.1.0.dist-info/LICENSE +21 -0
- local_deep_research-0.1.0.dist-info/METADATA +328 -0
- local_deep_research-0.1.0.dist-info/RECORD +56 -0
- local_deep_research-0.1.0.dist-info/WHEEL +5 -0
- local_deep_research-0.1.0.dist-info/entry_points.txt +3 -0
- local_deep_research-0.1.0.dist-info/top_level.txt +1 -0
local_deep_research/web_search_engines/engines/search_engine_arxiv.py

@@ -0,0 +1,367 @@
from typing import Dict, List, Any, Optional
from langchain_core.language_models import BaseLLM

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config
import arxiv
import logging
logger = logging.getLogger(__name__)

class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with two-phase approach"""

    def __init__(self,
                 max_results: int = 10,
                 sort_by: str = "relevance",
                 sort_order: str = "descending",
                 include_full_text: bool = False,
                 download_dir: Optional[str] = None,
                 max_full_text: int = 1,
                 llm: Optional[BaseLLM] = None,
                 max_filtered_results: Optional[int] = None):  # Added this parameter
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results
            sort_by: Sorting criteria ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort order ('ascending' or 'descending')
            include_full_text: Whether to include full paper content in results (downloads PDF)
            download_dir: Directory to download PDFs to (if include_full_text is True)
            max_full_text: Maximum number of PDFs to download and process (default: 1)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
        """
        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)

        # max_results = min(max_results, 20)  # required for arxiv
        self.max_results = 20  # TODO this needs to be corrected.
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            'relevance': arxiv.SortCriterion.Relevance,
            'lastUpdatedDate': arxiv.SortCriterion.LastUpdatedDate,
            'submittedDate': arxiv.SortCriterion.SubmittedDate
        }

        self.sort_directions = {
            'ascending': arxiv.SortOrder.Ascending,
            'descending': arxiv.SortOrder.Descending
        }

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Configure the search client
        sort_criteria = self.sort_criteria.get(self.sort_by, arxiv.SortCriterion.Relevance)
        sort_order = self.sort_directions.get(self.sort_order, arxiv.SortOrder.Descending)

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order
        )

        # Get the search results
        papers = list(client.results(search))

        return papers

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            previews = []
            for paper in papers:
                preview = {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": paper.summary[:250] + "..." if len(paper.summary) > 250 else paper.summary,
                    "authors": [author.name for author in paper.authors[:3]],  # First 3 authors
                    "published": paper.published.strftime("%Y-%m-%d") if paper.published else None
                }

                previews.append(preview)

            return previews

        except Exception as e:
            print(f"Error getting arXiv previews: {e}")
            return []

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should get full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            print("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        print("Getting full content for relevant arXiv papers")

        results = []
        pdf_count = 0  # Track number of PDFs processed

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Get the paper ID
            paper_id = item.get("id")

            # Try to get the full paper from our cache
            paper = None
            if hasattr(self, '_papers') and paper_id in self._papers:
                paper = self._papers[paper_id]

            if paper:
                # Add complete paper information
                result.update({
                    "pdf_url": paper.pdf_url,
                    "authors": [author.name for author in paper.authors],  # All authors
                    "published": paper.published.strftime("%Y-%m-%d") if paper.published else None,
                    "updated": paper.updated.strftime("%Y-%m-%d") if paper.updated else None,
                    "categories": paper.categories,
                    "summary": paper.summary,  # Full summary
                    "comment": paper.comment,
                    "journal_ref": paper.journal_ref,
                    "doi": paper.doi
                })

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                if self.include_full_text and self.download_dir and pdf_count < self.max_full_text:
                    try:
                        # Download the paper
                        pdf_count += 1  # Increment counter before attempting download
                        paper_path = paper.download_pdf(dirpath=self.download_dir)
                        result["pdf_path"] = str(paper_path)

                        # Extract text from PDF
                        try:
                            # Try PyPDF2 first
                            try:
                                import PyPDF2
                                with open(paper_path, 'rb') as pdf_file:
                                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                                    pdf_text = ""
                                    for page in pdf_reader.pages:
                                        pdf_text += page.extract_text() + "\n\n"

                                if pdf_text.strip():  # Only use if we got meaningful text
                                    result["content"] = pdf_text
                                    result["full_content"] = pdf_text
                                    print(f"Successfully extracted text from PDF using PyPDF2")
                            except (ImportError, Exception) as e1:
                                # Fall back to pdfplumber
                                try:
                                    import pdfplumber
                                    with pdfplumber.open(paper_path) as pdf:
                                        pdf_text = ""
                                        for page in pdf.pages:
                                            pdf_text += page.extract_text() + "\n\n"

                                    if pdf_text.strip():  # Only use if we got meaningful text
                                        result["content"] = pdf_text
                                        result["full_content"] = pdf_text
                                        print(f"Successfully extracted text from PDF using pdfplumber")
                                except (ImportError, Exception) as e2:
                                    print(f"PDF text extraction failed: {str(e1)}, then {str(e2)}")
                                    print(f"Using paper summary as content instead")
                        except Exception as e:
                            print(f"Error extracting text from PDF: {e}")
                            print(f"Using paper summary as content instead")
                    except Exception as e:
                        print(f"Error downloading paper {paper.title}: {e}")
                        result["pdf_path"] = None
                        pdf_count -= 1  # Decrement counter if download fails
                elif self.include_full_text and self.download_dir and pdf_count >= self.max_full_text:
                    # Reached PDF limit
                    print(f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs.")
                    result["content"] = paper.summary
                    result["full_content"] = paper.summary

            results.append(result)

        return results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        print("---Execute a search using arXiv---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query)

        # Clean up
        if hasattr(self, '_papers'):
            del self._papers

        return results

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": paper.summary[:250] + "..." if len(paper.summary) > 250 else paper.summary,
                "authors": [author.name for author in paper.authors[:3]]  # First 3 authors
            }

            # Add full content if not in snippet-only mode
            if not hasattr(config, 'SEARCH_SNIPPETS_ONLY') or not config.SEARCH_SNIPPETS_ONLY:
                result.update({
                    "pdf_url": paper.pdf_url,
                    "authors": [author.name for author in paper.authors],  # All authors
                    "published": paper.published.strftime("%Y-%m-%d") if paper.published else None,
                    "updated": paper.updated.strftime("%Y-%m-%d") if paper.updated else None,
                    "categories": paper.categories,
                    "summary": paper.summary,  # Full summary
                    "comment": paper.comment,
                    "journal_ref": paper.journal_ref,
                    "doi": paper.doi,
                    "content": paper.summary,  # Use summary as content
                    "full_content": paper.summary  # For consistency
                })

                # Download PDF if requested
                if self.include_full_text and self.download_dir:
                    try:
                        # Download the paper
                        paper_path = paper.download_pdf(dirpath=self.download_dir)
                        result["pdf_path"] = str(paper_path)
                    except Exception as e:
                        print(f"Error downloading paper: {e}")

            return result

        except Exception as e:
            print(f"Error getting paper details: {e}")
            return {}

    def search_by_author(self, author_name: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"au:\"{author_name}\""
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_category(self, category: str, max_results: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"cat:{category}"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results
local_deep_research/web_search_engines/engines/search_engine_brave.py

@@ -0,0 +1,245 @@
from langchain_community.tools import BraveSearch
from typing import Dict, List, Any, Optional
import os
from langchain_core.language_models import BaseLLM

from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine
from local_deep_research import config


class BraveSearchEngine(BaseSearchEngine):
    """Brave search engine implementation with two-phase approach"""

    def __init__(self,
                 max_results: int = 10,
                 region: str = "US",
                 time_period: str = "y",
                 safe_search: bool = True,
                 search_language: str = "English",
                 api_key: Optional[str] = None,
                 language_code_mapping: Optional[Dict[str, str]] = None,
                 llm: Optional[BaseLLM] = None,
                 include_full_content: bool = False,
                 max_filtered_results: Optional[int] = None,
                 **kwargs):
        """
        Initialize the Brave search engine.

        Args:
            max_results: Maximum number of search results
            region: Region code for search results
            time_period: Time period for search results
            safe_search: Whether to enable safe search
            search_language: Language for search results
            api_key: Brave Search API key (can also be set in BRAVE_API_KEY env)
            language_code_mapping: Mapping from language names to codes
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with the LLM and max_filtered_results
        super().__init__(llm=llm, max_filtered_results=max_filtered_results)

        self.max_results = max_results
        self.include_full_content = include_full_content

        # Set up language code mapping
        if language_code_mapping is None:
            language_code_mapping = {
                "english": "en",
                "spanish": "es",
                "chinese": "zh",
                "hindi": "hi",
                "french": "fr",
                "arabic": "ar",
                "bengali": "bn",
                "portuguese": "pt",
                "russian": "ru",
            }

        # Get API key
        brave_api_key = api_key or os.getenv("BRAVE_API_KEY")
        if not brave_api_key:
            raise ValueError("BRAVE_API_KEY not found. Please provide api_key or set the BRAVE_API_KEY environment variable.")

        # Get language code
        language_code = language_code_mapping.get(search_language.lower(), "en")

        # Convert time period format to Brave's format
        brave_time_period = f"p{time_period}"

        # Convert safe search to Brave's format
        brave_safe_search = "moderate" if safe_search else "off"

        # Initialize Brave Search
        self.engine = BraveSearch.from_api_key(
            api_key=brave_api_key,
            search_kwargs={
                "count": min(20, max_results),
                "country": region.upper(),
                "search_lang": language_code,
                "safesearch": brave_safe_search,
                "freshness": brave_time_period,
            }
        )

        # Set user agent for Brave Search
        os.environ["USER_AGENT"] = "Local Deep Research/1.0"

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from local_deep_research.web_search_engines.engines.full_search import FullSearchResults
                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=self.engine,
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch=brave_safe_search
                )
            except ImportError:
                print("Warning: FullSearchResults not available. Full content retrieval disabled.")
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Brave Search.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        print("Getting search results from Brave Search")

        try:
            # Get search results from Brave Search
            raw_results = self.engine.run(query[:400])

            # Parse results if they're in string format
            if isinstance(raw_results, str):
                try:
                    import json
                    raw_results = json.loads(raw_results)
                except json.JSONDecodeError:
                    print("Error: Unable to parse BraveSearch response as JSON.")
                    return []

            # Format results as previews
            previews = []
            for i, result in enumerate(raw_results):
                preview = {
                    "id": i,  # Use index as ID
                    "title": result.get("title", ""),
                    "link": result.get("link", ""),
                    "snippet": result.get("snippet", ""),
                    "displayed_link": result.get("link", ""),
                    "position": i
                }

                # Store full Brave result for later
                preview["_full_result"] = result

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            return previews

        except Exception as e:
            print(f"Error getting Brave Search results: {e}")
            return []

    def _get_full_content(self, relevant_items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        if hasattr(config, 'SEARCH_SNIPPETS_ONLY') and config.SEARCH_SNIPPETS_ONLY:
            print("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full Brave information
            results = []
            for item in relevant_items:
                # Use the full result if available, otherwise use the preview
                if "_full_result" in item:
                    result = item["_full_result"]
                    # Remove temporary field
                    if "_full_result" in result:
                        del result["_full_result"]
                else:
                    result = item

                results.append(result)

            return results

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, 'full_search'):
            print("Retrieving full webpage content")

            try:
                # Extract only the links from relevant items
                links = [item.get("link") for item in relevant_items if item.get("link")]

                # Use FullSearchResults to get full content
                results_with_content = self.full_search._get_full_content(relevant_items)

                return results_with_content

            except Exception as e:
                print(f"Error retrieving full content: {e}")
                # Fall back to returning the items without full content

        # Return items with their full Brave information
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:
                result = item["_full_result"].copy()
                # Remove temporary field
                if "_full_result" in result:
                    del result["_full_result"]
            else:
                result = item.copy()
                if "_full_result" in result:
                    del result["_full_result"]

            results.append(result)

        return results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute a search using Brave Search with the two-phase approach.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        print("---Execute a search using Brave Search---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query)

        # Clean up
        if hasattr(self, '_search_results'):
            del self._search_results

        return results