local-deep-research 0.1.26__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- local_deep_research/__init__.py +23 -22
- local_deep_research/__main__.py +16 -0
- local_deep_research/advanced_search_system/__init__.py +7 -0
- local_deep_research/advanced_search_system/filters/__init__.py +8 -0
- local_deep_research/advanced_search_system/filters/base_filter.py +38 -0
- local_deep_research/advanced_search_system/filters/cross_engine_filter.py +200 -0
- local_deep_research/advanced_search_system/findings/base_findings.py +81 -0
- local_deep_research/advanced_search_system/findings/repository.py +452 -0
- local_deep_research/advanced_search_system/knowledge/__init__.py +1 -0
- local_deep_research/advanced_search_system/knowledge/base_knowledge.py +151 -0
- local_deep_research/advanced_search_system/knowledge/standard_knowledge.py +159 -0
- local_deep_research/advanced_search_system/questions/__init__.py +1 -0
- local_deep_research/advanced_search_system/questions/base_question.py +64 -0
- local_deep_research/advanced_search_system/questions/decomposition_question.py +445 -0
- local_deep_research/advanced_search_system/questions/standard_question.py +119 -0
- local_deep_research/advanced_search_system/repositories/__init__.py +7 -0
- local_deep_research/advanced_search_system/strategies/__init__.py +1 -0
- local_deep_research/advanced_search_system/strategies/base_strategy.py +118 -0
- local_deep_research/advanced_search_system/strategies/iterdrag_strategy.py +450 -0
- local_deep_research/advanced_search_system/strategies/parallel_search_strategy.py +312 -0
- local_deep_research/advanced_search_system/strategies/rapid_search_strategy.py +270 -0
- local_deep_research/advanced_search_system/strategies/standard_strategy.py +300 -0
- local_deep_research/advanced_search_system/tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/base_tool.py +100 -0
- local_deep_research/advanced_search_system/tools/knowledge_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/question_tools/__init__.py +1 -0
- local_deep_research/advanced_search_system/tools/search_tools/__init__.py +1 -0
- local_deep_research/api/__init__.py +5 -5
- local_deep_research/api/research_functions.py +154 -160
- local_deep_research/app.py +8 -0
- local_deep_research/citation_handler.py +25 -16
- local_deep_research/{config.py → config/config_files.py} +102 -110
- local_deep_research/config/llm_config.py +472 -0
- local_deep_research/config/search_config.py +77 -0
- local_deep_research/defaults/__init__.py +10 -5
- local_deep_research/defaults/main.toml +2 -2
- local_deep_research/defaults/search_engines.toml +60 -34
- local_deep_research/main.py +121 -19
- local_deep_research/migrate_db.py +147 -0
- local_deep_research/report_generator.py +87 -45
- local_deep_research/search_system.py +153 -283
- local_deep_research/setup_data_dir.py +35 -0
- local_deep_research/test_migration.py +178 -0
- local_deep_research/utilities/__init__.py +0 -0
- local_deep_research/utilities/db_utils.py +49 -0
- local_deep_research/{utilties → utilities}/enums.py +2 -2
- local_deep_research/{utilties → utilities}/llm_utils.py +63 -29
- local_deep_research/utilities/search_utilities.py +242 -0
- local_deep_research/{utilties → utilities}/setup_utils.py +4 -2
- local_deep_research/web/__init__.py +0 -1
- local_deep_research/web/app.py +86 -1709
- local_deep_research/web/app_factory.py +289 -0
- local_deep_research/web/database/README.md +70 -0
- local_deep_research/web/database/migrate_to_ldr_db.py +289 -0
- local_deep_research/web/database/migrations.py +447 -0
- local_deep_research/web/database/models.py +117 -0
- local_deep_research/web/database/schema_upgrade.py +107 -0
- local_deep_research/web/models/database.py +294 -0
- local_deep_research/web/models/settings.py +94 -0
- local_deep_research/web/routes/api_routes.py +559 -0
- local_deep_research/web/routes/history_routes.py +354 -0
- local_deep_research/web/routes/research_routes.py +715 -0
- local_deep_research/web/routes/settings_routes.py +1583 -0
- local_deep_research/web/services/research_service.py +947 -0
- local_deep_research/web/services/resource_service.py +149 -0
- local_deep_research/web/services/settings_manager.py +669 -0
- local_deep_research/web/services/settings_service.py +187 -0
- local_deep_research/web/services/socket_service.py +210 -0
- local_deep_research/web/static/css/custom_dropdown.css +277 -0
- local_deep_research/web/static/css/settings.css +1223 -0
- local_deep_research/web/static/css/styles.css +525 -48
- local_deep_research/web/static/js/components/custom_dropdown.js +428 -0
- local_deep_research/web/static/js/components/detail.js +348 -0
- local_deep_research/web/static/js/components/fallback/formatting.js +122 -0
- local_deep_research/web/static/js/components/fallback/ui.js +215 -0
- local_deep_research/web/static/js/components/history.js +487 -0
- local_deep_research/web/static/js/components/logpanel.js +949 -0
- local_deep_research/web/static/js/components/progress.js +1107 -0
- local_deep_research/web/static/js/components/research.js +1865 -0
- local_deep_research/web/static/js/components/results.js +766 -0
- local_deep_research/web/static/js/components/settings.js +3981 -0
- local_deep_research/web/static/js/components/settings_sync.js +106 -0
- local_deep_research/web/static/js/main.js +226 -0
- local_deep_research/web/static/js/services/api.js +253 -0
- local_deep_research/web/static/js/services/audio.js +31 -0
- local_deep_research/web/static/js/services/formatting.js +119 -0
- local_deep_research/web/static/js/services/pdf.js +622 -0
- local_deep_research/web/static/js/services/socket.js +882 -0
- local_deep_research/web/static/js/services/ui.js +546 -0
- local_deep_research/web/templates/base.html +72 -0
- local_deep_research/web/templates/components/custom_dropdown.html +47 -0
- local_deep_research/web/templates/components/log_panel.html +32 -0
- local_deep_research/web/templates/components/mobile_nav.html +22 -0
- local_deep_research/web/templates/components/settings_form.html +299 -0
- local_deep_research/web/templates/components/sidebar.html +21 -0
- local_deep_research/web/templates/pages/details.html +73 -0
- local_deep_research/web/templates/pages/history.html +51 -0
- local_deep_research/web/templates/pages/progress.html +57 -0
- local_deep_research/web/templates/pages/research.html +139 -0
- local_deep_research/web/templates/pages/results.html +59 -0
- local_deep_research/web/templates/settings_dashboard.html +78 -192
- local_deep_research/web/utils/__init__.py +0 -0
- local_deep_research/web/utils/formatters.py +76 -0
- local_deep_research/web_search_engines/engines/full_search.py +18 -16
- local_deep_research/web_search_engines/engines/meta_search_engine.py +182 -131
- local_deep_research/web_search_engines/engines/search_engine_arxiv.py +224 -139
- local_deep_research/web_search_engines/engines/search_engine_brave.py +88 -71
- local_deep_research/web_search_engines/engines/search_engine_ddg.py +48 -39
- local_deep_research/web_search_engines/engines/search_engine_github.py +415 -204
- local_deep_research/web_search_engines/engines/search_engine_google_pse.py +123 -90
- local_deep_research/web_search_engines/engines/search_engine_guardian.py +210 -157
- local_deep_research/web_search_engines/engines/search_engine_local.py +532 -369
- local_deep_research/web_search_engines/engines/search_engine_local_all.py +42 -36
- local_deep_research/web_search_engines/engines/search_engine_pubmed.py +358 -266
- local_deep_research/web_search_engines/engines/search_engine_searxng.py +212 -160
- local_deep_research/web_search_engines/engines/search_engine_semantic_scholar.py +213 -170
- local_deep_research/web_search_engines/engines/search_engine_serpapi.py +84 -68
- local_deep_research/web_search_engines/engines/search_engine_wayback.py +186 -154
- local_deep_research/web_search_engines/engines/search_engine_wikipedia.py +115 -77
- local_deep_research/web_search_engines/search_engine_base.py +174 -99
- local_deep_research/web_search_engines/search_engine_factory.py +192 -102
- local_deep_research/web_search_engines/search_engines_config.py +22 -15
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/METADATA +177 -97
- local_deep_research-0.2.2.dist-info/RECORD +135 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/WHEEL +1 -2
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/entry_points.txt +3 -0
- local_deep_research/defaults/llm_config.py +0 -338
- local_deep_research/utilties/search_utilities.py +0 -114
- local_deep_research/web/static/js/app.js +0 -3763
- local_deep_research/web/templates/api_keys_config.html +0 -82
- local_deep_research/web/templates/collections_config.html +0 -90
- local_deep_research/web/templates/index.html +0 -348
- local_deep_research/web/templates/llm_config.html +0 -120
- local_deep_research/web/templates/main_config.html +0 -89
- local_deep_research/web/templates/search_engines_config.html +0 -154
- local_deep_research/web/templates/settings.html +0 -519
- local_deep_research-0.1.26.dist-info/RECORD +0 -61
- local_deep_research-0.1.26.dist-info/top_level.txt +0 -1
- /local_deep_research/{utilties → config}/__init__.py +0 -0
- {local_deep_research-0.1.26.dist-info → local_deep_research-0.2.2.dist-info}/licenses/LICENSE +0 -0
local_deep_research/web_search_engines/engines/search_engine_arxiv.py

The absolute imports (`from local_deep_research.web_search_engines.search_engine_base import BaseSearchEngine`, `from local_deep_research import config`) are replaced with relative ones (`from ..search_engine_base import BaseSearchEngine`, `from ...config import search_config`), an optional `llm` and `max_filtered_results` are now passed through to the `BaseSearchEngine` constructor, and the module is reformatted. The 0.2.2 version of the changed region:

```python
import logging
from typing import Any, Dict, List, Optional

import arxiv
from langchain_core.language_models import BaseLLM

from ...config import search_config
from ..search_engine_base import BaseSearchEngine

logger = logging.getLogger(__name__)


class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with two-phase approach"""

    def __init__(
        self,
        max_results: int = 10,
        sort_by: str = "relevance",
        sort_order: str = "descending",
        include_full_text: bool = False,
        download_dir: Optional[str] = None,
        max_full_text: int = 1,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
    ):  # Added this parameter
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results
            sort_by: Sorting criteria ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort direction ('ascending' or 'descending')
            include_full_text: Whether to download PDFs and extract full text
            download_dir: Directory for downloaded PDFs
            max_full_text: Maximum number of PDFs to download and process
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm, max_filtered_results=max_filtered_results, max_results=max_results
        )
        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            "relevance": arxiv.SortCriterion.Relevance,
            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
            "submittedDate": arxiv.SortCriterion.SubmittedDate,
        }

        self.sort_directions = {
            "ascending": arxiv.SortOrder.Ascending,
            "descending": arxiv.SortOrder.Descending,
        }

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Configure the search client
        sort_criteria = self.sort_criteria.get(
            self.sort_by, arxiv.SortCriterion.Relevance
        )
        sort_order = self.sort_directions.get(
            self.sort_order, arxiv.SortOrder.Descending
        )

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order,
        )

        # Get the search results
        papers = list(client.results(search))

        return papers

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            previews = []
            for paper in papers:
                preview = {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": (
                        paper.summary[:250] + "..."
                        if len(paper.summary) > 250
                        else paper.summary
                    ),
                    "authors": [
                        author.name for author in paper.authors[:3]
                    ],  # First 3 authors
                    "published": (
                        paper.published.strftime("%Y-%m-%d")
                        if paper.published
                        else None
                    ),
                }

                previews.append(preview)

            return previews

        except Exception as e:
            logger.error(f"Error getting arXiv previews: {e}")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should get full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Getting full content for relevant arXiv papers")

        results = []
        pdf_count = 0  # Track number of PDFs processed

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Get the paper ID
            paper_id = item.get("id")

            # Try to get the full paper from our cache
            paper = None
            if hasattr(self, "_papers") and paper_id in self._papers:
                paper = self._papers[paper_id]

            if paper:
                # Add complete paper information
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "journal_ref": paper.journal_ref,
                        "doi": paper.doi,
                    }
                )

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                if (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count < self.max_full_text
                ):
                    try:
                        # Download the paper
                        pdf_count += 1  # Increment counter before attempting download
                        paper_path = paper.download_pdf(dirpath=self.download_dir)
                        result["pdf_path"] = str(paper_path)

                        # Extract text from PDF
                        try:
                            # Try PyPDF2 first
                            try:
                                import PyPDF2

                                with open(paper_path, "rb") as pdf_file:
                                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                                    pdf_text = ""
                                    for page in pdf_reader.pages:
                                        pdf_text += page.extract_text() + "\n\n"

                                if (
                                    pdf_text.strip()
                                ):  # Only use if we got meaningful text
                                    result["content"] = pdf_text
                                    result["full_content"] = pdf_text
                                    logger.info(
                                        "Successfully extracted text from PDF using PyPDF2"
                                    )
                            except (ImportError, Exception) as e1:
                                # Fall back to pdfplumber
                                try:
                                    import pdfplumber

                                    with pdfplumber.open(paper_path) as pdf:
                                        pdf_text = ""
                                        for page in pdf.pages:
                                            pdf_text += page.extract_text() + "\n\n"

                                    if (
                                        pdf_text.strip()
                                    ):  # Only use if we got meaningful text
                                        result["content"] = pdf_text
                                        result["full_content"] = pdf_text
                                        logger.info(
                                            "Successfully extracted text from PDF using pdfplumber"
                                        )
                                except (ImportError, Exception) as e2:
                                    logger.error(
                                        f"PDF text extraction failed: {str(e1)}, then {str(e2)}"
                                    )
                                    logger.error(
                                        "Using paper summary as content instead"
                                    )
                        except Exception as e:
                            logger.error(f"Error extracting text from PDF: {e}")
                            logger.error("Using paper summary as content instead")
                    except Exception as e:
                        logger.error(f"Error downloading paper {paper.title}: {e}")
                        result["pdf_path"] = None
                        pdf_count -= 1  # Decrement counter if download fails
                elif (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count >= self.max_full_text
                ):
                    # Reached PDF limit
                    logger.info(
                        f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs."
                    )
                    result["content"] = paper.summary
                    result["full_content"] = paper.summary

            results.append(result)

        return results

    def run(self, query: str) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query

        Returns:
            List of search results
        """
        logger.info("---Execute a search using arXiv---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query)

        # Clean up
        if hasattr(self, "_papers"):
            del self._papers

        return results

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": (
                    paper.summary[:250] + "..."
                    if len(paper.summary) > 250
                    else paper.summary
                ),
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
            }

            # Add full content if not in snippet-only mode
            if (
                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
                or not search_config.SEARCH_SNIPPETS_ONLY
            ):
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "journal_ref": paper.journal_ref,
                        "doi": paper.doi,
                        "content": paper.summary,  # Use summary as content
                        "full_content": paper.summary,  # For consistency
                    }
                )

            # Download PDF if requested
            if self.include_full_text and self.download_dir:
                try:
                    # Download the paper
                    paper_path = paper.download_pdf(dirpath=self.download_dir)
                    result["pdf_path"] = str(paper_path)
                except Exception as e:
                    logger.error(f"Error downloading paper: {e}")

            return result

        except Exception as e:
            logger.error(f"Error getting paper details: {e}")
            return {}

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f'au:"{author_name}"'
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_category(
        self, category: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"cat:{category}"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results
```