PyPI - aiagents4pharma - Versions diffs - 0.0.0__py3-none-any.whl - Mend

aiagents4pharma 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (336) hide show

aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py ADDED Viewed

@@ -0,0 +1,207 @@
+#!/usr/bin/env python3
+"""
+ArXiv paper downloader implementation.
+"""
+import logging
+import xml.etree.ElementTree as ET
+from typing import Any
+import requests
+from .base_paper_downloader import BasePaperDownloader
+# Module-level logger named after this module
+logger = logging.getLogger(__name__)
+class ArxivDownloader(BasePaperDownloader):
+    """
+    ArXiv-specific implementation of paper downloader.
+    Queries the configured arXiv API endpoint, parses the Atom XML response and
+    turns the first entry into the metadata dictionary used by the base class.
+    """
+    def __init__(self, config: Any):
+        """
+        Initialize ArXiv downloader with configuration.
+        Args:
+            config: Configuration object providing ``api_url`` and ``pdf_base_url``;
+                ``xml_namespace`` is optional.
+        """
+        super().__init__(config)
+        self.api_url = config.api_url
+        self.pdf_base_url = config.pdf_base_url
+        # Prefix -> URI map passed to every namespaced lookup; defaults to Atom
+        self.xml_namespaces = getattr(
+            config, "xml_namespace", {"atom": "http://www.w3.org/2005/Atom"}
+        )
+    def fetch_metadata(self, identifier: str) -> ET.Element:
+        """
+        Fetch paper metadata from arXiv API.
+        Args:
+            identifier: arXiv ID (e.g., '1234.5678' or '2301.12345')
+        Returns:
+            XML root element from arXiv API response
+        Raises:
+            requests.RequestException: If API call fails
+            xml.etree.ElementTree.ParseError: If the response body is not
+                well-formed XML
+            RuntimeError: If no entry found in response
+        """
+        # Function-scope import keeps this method self-contained
+        from urllib.parse import quote
+        # Percent-encode the ID so characters such as spaces, '&' or '#' cannot
+        # break or extend the query string. IDs made only of letters, digits,
+        # '.', '-', '_', '~' and '/' pass through unchanged.
+        encoded_id = quote(identifier, safe="/")
+        query_url = f"{self.api_url}?search_query=id:{encoded_id}&start=0&max_results=1"
+        logger.info("Fetching metadata for arXiv ID %s from: %s", identifier, query_url)
+        response = requests.get(query_url, timeout=self.request_timeout)
+        response.raise_for_status()
+        root = ET.fromstring(response.text)
+        # The whole feed is returned, but only after confirming it holds an entry
+        if root.find("atom:entry", self.xml_namespaces) is None:
+            raise RuntimeError("No entry found in arXiv API response")
+        return root
+    def construct_pdf_url(self, metadata: ET.Element, identifier: str) -> str:
+        """
+        Extract or construct PDF URL from arXiv metadata.
+        Args:
+            metadata: XML root from arXiv API
+            identifier: arXiv ID
+        Returns:
+            PDF URL string; empty if the metadata contains no entry
+        """
+        entry = metadata.find("atom:entry", self.xml_namespaces)
+        if entry is None:
+            return ""
+        # Only the first link titled "pdf" is considered
+        pdf_url = None
+        for link in entry.findall("atom:link", self.xml_namespaces):
+            if link.attrib.get("title") == "pdf":
+                pdf_url = link.attrib.get("href")
+                break
+        # No usable link in the metadata: build the URL from the configured base
+        if not pdf_url:
+            pdf_url = f"{self.pdf_base_url}/{identifier}.pdf"
+            logger.info("Using constructed PDF URL for %s: %s", identifier, pdf_url)
+        return pdf_url
+    def extract_paper_metadata(
+        self,
+        metadata: ET.Element,
+        identifier: str,
+        pdf_result: tuple[str, str] | None,
+    ) -> dict[str, Any]:
+        """
+        Extract structured metadata from arXiv API response.
+        Args:
+            metadata: XML root from arXiv API
+            identifier: arXiv ID
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+        Returns:
+            Standardized paper metadata dictionary
+        Raises:
+            RuntimeError: If the metadata contains no entry
+        """
+        entry = metadata.find("atom:entry", self.xml_namespaces)
+        if entry is None:
+            raise RuntimeError("No entry found in metadata")
+        paper: dict[str, Any] = {}
+        paper.update(self._extract_basic_metadata(entry, self.xml_namespaces))
+        paper.update(self._extract_pdf_metadata(pdf_result, identifier))
+        # NOTE(review): error entries built by the base class take "source" from
+        # get_service_name() ("arXiv"), whereas success entries use "arxiv" -
+        # confirm which spelling downstream consumers expect
+        paper["source"] = "arxiv"
+        paper["arxiv_id"] = identifier
+        return paper
+    def _extract_basic_metadata(self, entry: ET.Element, ns: dict) -> dict[str, Any]:
+        """Extract basic metadata (title, authors, abstract, date) from entry."""
+        return {
+            "Title": self._extract_title(entry, ns),
+            "Authors": self._extract_authors(entry, ns),
+            "Abstract": self._extract_abstract(entry, ns),
+            "Publication Date": self._extract_publication_date(entry, ns),
+        }
+    @staticmethod
+    def _child_text(entry: ET.Element, path: str, ns: dict) -> str:
+        """
+        Return the stripped text of the first child matching ``path``.
+        Returns "N/A" when the child is missing and "" when it has no text.
+        """
+        child = entry.find(path, ns)
+        if child is None:
+            return "N/A"
+        return (child.text or "").strip()
+    def _extract_title(self, entry: ET.Element, ns: dict) -> str:
+        """Extract title from entry."""
+        return self._child_text(entry, "atom:title", ns)
+    def _extract_authors(self, entry: ET.Element, ns: dict) -> list:
+        """Extract author names from entry, skipping authors without a name."""
+        name_elems = (
+            author_elem.find("atom:name", ns) for author_elem in entry.findall("atom:author", ns)
+        )
+        return [
+            name_elem.text.strip()
+            for name_elem in name_elems
+            if name_elem is not None and name_elem.text
+        ]
+    def _extract_abstract(self, entry: ET.Element, ns: dict) -> str:
+        """Extract abstract from entry."""
+        return self._child_text(entry, "atom:summary", ns)
+    def _extract_publication_date(self, entry: ET.Element, ns: dict) -> str:
+        """Extract publication date from entry."""
+        return self._child_text(entry, "atom:published", ns)
+    def _extract_pdf_metadata(
+        self, pdf_result: tuple[str, str] | None, identifier: str
+    ) -> dict[str, Any]:
+        """
+        Extract PDF-related metadata.
+        Args:
+            pdf_result: Tuple of (temp_file_path, filename), or None if the
+                download did not succeed
+            identifier: arXiv ID, used for the default filename on failure
+        Returns:
+            Dictionary with URL, pdf_url, filename, access_type and temp_file_path
+        """
+        if not pdf_result:
+            return {
+                "URL": "",
+                "pdf_url": "",
+                "filename": self.get_default_filename(identifier),
+                "access_type": "download_failed",
+                "temp_file_path": "",
+            }
+        temp_file_path, filename = pdf_result
+        # Both URL fields carry the local temp-file path, not the remote URL
+        return {
+            "URL": temp_file_path,
+            "pdf_url": temp_file_path,
+            "filename": filename,
+            "access_type": "open_access_downloaded",
+            "temp_file_path": temp_file_path,
+        }
+    def get_service_name(self) -> str:
+        """Return service name."""
+        return "arXiv"
+    def get_identifier_name(self) -> str:
+        """Return identifier display name."""
+        return "arXiv ID"
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for arXiv paper."""
+        return f"{identifier}.pdf"
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
+        """Get arXiv-specific identifier info for paper summary."""
+        arxiv_id = paper.get("arxiv_id", "N/A")
+        pub_date = paper.get("Publication Date", "N/A")
+        return f" (arXiv:{arxiv_id}, {pub_date})"
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
+        """Add arXiv ID field to entry."""
+        entry["arxiv_id"] = identifier

aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py ADDED Viewed

@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+"""
+Abstract base class for paper download tools.
+Provides common functionality for arXiv, medRxiv, PubMed, and future paper sources.
+"""
+import logging
+import os
+import re
+import tempfile
+from abc import ABC, abstractmethod
+from typing import Any
+import requests
+# Module-level logger named after this module; handlers and levels are not set here
+logger = logging.getLogger(__name__)
+class BasePaperDownloader(ABC):
+    """
+    Abstract base class for paper download tools.
+    Subclasses implement the service-specific hooks (metadata lookup, PDF URL
+    resolution, metadata structuring, naming); this class supplies the shared
+    PDF download, error-entry, summary and batch-processing logic.
+    """
+    def __init__(self, config: Any):
+        """
+        Initialize with service-specific configuration.
+        Args:
+            config: Configuration object. ``request_timeout``, ``chunk_size`` and
+                ``user_agent`` are read from it when present; otherwise the
+                defaults below are used.
+        """
+        self.config = config
+        self.request_timeout = getattr(config, "request_timeout", 15)
+        self.chunk_size = getattr(config, "chunk_size", 8192)
+        self.user_agent = getattr(
+            config, "user_agent", "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
+        )
+    # Abstract methods that each service must implement
+    @abstractmethod
+    def fetch_metadata(self, identifier: str) -> Any:
+        """
+        Fetch paper metadata from the service API.
+        Args:
+            identifier: Paper identifier (arXiv ID, DOI, PMID, etc.)
+        Returns:
+            Service-specific metadata object (XML, JSON, etc.)
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
+        """
+        Construct or extract PDF URL from metadata.
+        Args:
+            metadata: Metadata returned from fetch_metadata()
+            identifier: Original paper identifier
+        Returns:
+            PDF URL string (empty if not available)
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def extract_paper_metadata(
+        self, metadata: Any, identifier: str, pdf_result: tuple[str, str] | None
+    ) -> dict[str, Any]:
+        """
+        Extract and structure metadata into standardized format.
+        Args:
+            metadata: Raw metadata from API
+            identifier: Original paper identifier
+            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded
+        Returns:
+            Standardized paper metadata dictionary
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def get_service_name(self) -> str:
+        """Return service name (e.g., 'arxiv', 'medrxiv', 'pubmed')."""
+        raise NotImplementedError
+    @abstractmethod
+    def get_identifier_name(self) -> str:
+        """Return identifier display name (e.g., 'arXiv ID', 'DOI', 'PMID')."""
+        raise NotImplementedError
+    @abstractmethod
+    def get_default_filename(self, identifier: str) -> str:
+        """Generate default filename for the paper PDF."""
+        raise NotImplementedError
+    # Common methods shared by all services
+    def download_pdf_to_temp(self, pdf_url: str, identifier: str) -> tuple[str, str] | None:
+        """
+        Download PDF from URL to a temporary file.
+        The temporary file is created with ``delete=False``, so on success it
+        stays on disk for the caller. On failure any partially written file is
+        removed before returning.
+        Args:
+            pdf_url: URL to download PDF from
+            identifier: Paper identifier, used for logging and the default filename
+        Returns:
+            Tuple of (temp_file_path, filename) or None if failed
+        """
+        if not pdf_url:
+            logger.info("No PDF URL available for %s %s", self.get_identifier_name(), identifier)
+            return None
+        temp_file_path = ""
+        try:
+            logger.info(
+                "Downloading PDF for %s %s from %s",
+                self.get_identifier_name(),
+                identifier,
+                pdf_url,
+            )
+            headers = {"User-Agent": self.user_agent}
+            response = requests.get(
+                pdf_url, headers=headers, timeout=self.request_timeout, stream=True
+            )
+            try:
+                response.raise_for_status()
+                # Download to temporary file
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                    # Remember the path before streaming so a mid-download
+                    # failure can still clean the file up
+                    temp_file_path = temp_file.name
+                    for chunk in response.iter_content(chunk_size=self.chunk_size):
+                        if chunk:  # Filter out keep-alive chunks
+                            temp_file.write(chunk)
+                content_disposition = response.headers.get("Content-Disposition", "")
+            finally:
+                # With stream=True the connection is only released once the body
+                # is fully read or the response is closed
+                response.close()
+            logger.info(
+                "%s PDF downloaded to temporary file: %s",
+                self.get_service_name(),
+                temp_file_path,
+            )
+            # Prefer the server-supplied filename when it is usable
+            filename = self.get_default_filename(identifier)
+            header_filename = self._filename_from_content_disposition(content_disposition)
+            if header_filename:
+                filename = header_filename
+                logger.info("Extracted filename from header: %s", filename)
+            return temp_file_path, filename
+        except (requests.exceptions.RequestException, OSError) as e:
+            self._discard_partial_download(temp_file_path)
+            logger.error(
+                "Failed to download PDF for %s %s: %s",
+                self.get_identifier_name(),
+                identifier,
+                e,
+            )
+            return None
+    @staticmethod
+    def _filename_from_content_disposition(content_disposition: str) -> str | None:
+        """
+        Extract a ``.pdf`` filename from a Content-Disposition header value.
+        Only the plain ``filename=`` parameter is read; the extended
+        ``filename*=`` form is ignored so its ``charset''`` prefix cannot end up
+        in the result.
+        Args:
+            content_disposition: Raw header value (may be empty)
+        Returns:
+            The bare filename, or None if absent or not ending in ".pdf"
+        """
+        if "filename=" not in content_disposition:
+            return None
+        match = re.search(r'filename=(?:"([^"]+)"|([^;]+))', content_disposition)
+        if match is None:
+            return None
+        candidate = (match.group(1) or match.group(2)).strip().strip('"')
+        # The header is server-controlled: keep only the final path component
+        candidate = os.path.basename(candidate.replace("\\", "/"))
+        return candidate if candidate.endswith(".pdf") else None
+    @staticmethod
+    def _discard_partial_download(temp_file_path: str) -> None:
+        """Remove a partially written temp file; does nothing for an empty path."""
+        if not temp_file_path:
+            return
+        try:
+            os.unlink(temp_file_path)
+        except OSError as e:
+            logger.debug("Could not remove partial download %s: %s", temp_file_path, e)
+    def get_snippet(self, abstract: str) -> str:
+        """
+        Extract the first one or two sentences from an abstract.
+        Sentences are delimited by ". "; the result always ends with a period.
+        Args:
+            abstract: Full abstract text
+        Returns:
+            Snippet of first 1-2 sentences, or "" for an empty or "N/A" abstract
+        """
+        if not abstract or abstract == "N/A":
+            return ""
+        snippet = ". ".join(abstract.split(". ")[:2])
+        return snippet if snippet.endswith(".") else snippet + "."
+    def create_error_entry(self, identifier: str, error_msg: str) -> dict[str, Any]:
+        """
+        Create standardized error entry for failed paper processing.
+        Args:
+            identifier: Paper identifier
+            error_msg: Error message
+        Returns:
+            Error entry dictionary
+        """
+        return {
+            "Title": "Error fetching paper",
+            "Authors": [],
+            "Abstract": f"Error: {error_msg}",
+            "Publication Date": "N/A",
+            "URL": "",
+            "pdf_url": "",
+            "filename": self.get_default_filename(identifier),
+            "source": self.get_service_name(),
+            "access_type": "error",
+            "temp_file_path": "",
+            "error": error_msg,
+            # Service-specific identifier field will be added by subclasses
+        }
+    def build_summary(self, article_data: dict[str, Any]) -> str:
+        """
+        Build a summary string for up to three papers with snippets.
+        Args:
+            article_data: Dictionary of paper data keyed by identifier
+        Returns:
+            Formatted summary string
+        """
+        papers = list(article_data.values())
+        # Counted over every paper, not just the three that get listed
+        downloaded_count = sum(
+            1 for paper in papers if paper.get("access_type") == "open_access_downloaded"
+        )
+        lines: list[str] = []
+        for position, paper in enumerate(papers[:3], start=1):
+            # Build paper line with service-specific identifier info
+            line = f"{position}. {paper.get('Title', 'N/A')}"
+            line += self._get_paper_identifier_info(paper)
+            line += f"\n   Access: {paper.get('access_type', 'N/A')}"
+            temp_file_path = paper.get("temp_file_path", "")
+            if temp_file_path:
+                line += f"\n   Downloaded to: {temp_file_path}"
+            snippet = self.get_snippet(paper.get("Abstract", ""))
+            if snippet:
+                line += f"\n   Abstract snippet: {snippet}"
+            lines.append(line)
+        summary = "\n".join(lines)
+        service_name = self.get_service_name()
+        return (
+            f"Download was successful from {service_name}. "
+            "Papers metadata are attached as an artifact. "
+            "Here is a summary of the results:\n"
+            f"Number of papers found: {len(article_data)}\n"
+            f"PDFs successfully downloaded: {downloaded_count}\n"
+            "Top 3 papers:\n" + summary
+        )
+    @abstractmethod
+    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
+        """
+        Get service-specific identifier info for paper summary.
+        Args:
+            paper: Paper metadata dictionary
+        Returns:
+            Formatted identifier string (e.g., " (arXiv:1234.5678, 2023-01-01)")
+        """
+        raise NotImplementedError
+    def process_identifiers(self, identifiers: list[str]) -> dict[str, Any]:
+        """
+        Main processing loop for downloading papers.
+        A ``requests.RequestException`` or ``RuntimeError`` raised while handling
+        one identifier is recorded as an error entry for that identifier, and
+        the loop continues with the next one.
+        Args:
+            identifiers: List of paper identifiers
+        Returns:
+            Dictionary of paper data keyed by identifier
+        """
+        logger.info(
+            "Processing %d identifiers from %s: %s",
+            len(identifiers),
+            self.get_service_name(),
+            identifiers,
+        )
+        article_data: dict[str, Any] = {}
+        for identifier in identifiers:
+            logger.info("Processing %s: %s", self.get_identifier_name(), identifier)
+            try:
+                # Step 1: Fetch metadata
+                metadata = self.fetch_metadata(identifier)
+                # Step 2: Extract PDF URL
+                pdf_url = self.construct_pdf_url(metadata, identifier)
+                # Step 3: Download PDF if available
+                pdf_result = self.download_pdf_to_temp(pdf_url, identifier) if pdf_url else None
+                # Step 4: Extract and structure metadata
+                article_data[identifier] = self.extract_paper_metadata(
+                    metadata, identifier, pdf_result
+                )
+            except (requests.RequestException, RuntimeError) as e:
+                # RuntimeError covers services reporting "no entry found"; one
+                # bad identifier must not discard the rest of the batch
+                logger.warning(
+                    "Error processing %s %s: %s",
+                    self.get_identifier_name(),
+                    identifier,
+                    str(e),
+                )
+                # Create error entry
+                error_entry = self.create_error_entry(identifier, str(e))
+                # Add service-specific identifier field
+                self._add_service_identifier(error_entry, identifier)
+                article_data[identifier] = error_entry
+        return article_data
+    @abstractmethod
+    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
+        """
+        Add service-specific identifier field to entry.
+        Args:
+            entry: Paper entry dictionary to modify
+            identifier: Original identifier
+        """
+        raise NotImplementedError