aiagents4pharma-0.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/__init__.py +11 -0
- aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
- aiagents4pharma/talk2aiagents4pharma/Dockerfile +133 -0
- aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
- aiagents4pharma/talk2aiagents4pharma/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +6 -0
- aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +70 -0
- aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +29 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/__init__.py +0 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/__init__.py +0 -0
- aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/default.yaml +102 -0
- aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +4 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2aiagents4pharma/install.md +154 -0
- aiagents4pharma/talk2aiagents4pharma/states/__init__.py +5 -0
- aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +18 -0
- aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +3 -0
- aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +312 -0
- aiagents4pharma/talk2biomodels/.dockerignore +13 -0
- aiagents4pharma/talk2biomodels/Dockerfile +104 -0
- aiagents4pharma/talk2biomodels/README.md +1 -0
- aiagents4pharma/talk2biomodels/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/agents/__init__.py +6 -0
- aiagents4pharma/talk2biomodels/agents/t2b_agent.py +104 -0
- aiagents4pharma/talk2biomodels/api/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/api/ols.py +75 -0
- aiagents4pharma/talk2biomodels/api/uniprot.py +36 -0
- aiagents4pharma/talk2biomodels/configs/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/agents/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +14 -0
- aiagents4pharma/talk2biomodels/configs/app/__init__.py +0 -0
- aiagents4pharma/talk2biomodels/configs/app/frontend/__init__.py +0 -0
- aiagents4pharma/talk2biomodels/configs/app/frontend/default.yaml +72 -0
- aiagents4pharma/talk2biomodels/configs/config.yaml +7 -0
- aiagents4pharma/talk2biomodels/configs/tools/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +30 -0
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +8 -0
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +8 -0
- aiagents4pharma/talk2biomodels/install.md +63 -0
- aiagents4pharma/talk2biomodels/models/__init__.py +5 -0
- aiagents4pharma/talk2biomodels/models/basico_model.py +125 -0
- aiagents4pharma/talk2biomodels/models/sys_bio_model.py +60 -0
- aiagents4pharma/talk2biomodels/states/__init__.py +6 -0
- aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +49 -0
- aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
- aiagents4pharma/talk2biomodels/tests/__init__.py +3 -0
- aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
- aiagents4pharma/talk2biomodels/tests/test_api.py +31 -0
- aiagents4pharma/talk2biomodels/tests/test_ask_question.py +42 -0
- aiagents4pharma/talk2biomodels/tests/test_basico_model.py +67 -0
- aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +190 -0
- aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +92 -0
- aiagents4pharma/talk2biomodels/tests/test_integration.py +116 -0
- aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +35 -0
- aiagents4pharma/talk2biomodels/tests/test_param_scan.py +71 -0
- aiagents4pharma/talk2biomodels/tests/test_query_article.py +184 -0
- aiagents4pharma/talk2biomodels/tests/test_save_model.py +47 -0
- aiagents4pharma/talk2biomodels/tests/test_search_models.py +35 -0
- aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +44 -0
- aiagents4pharma/talk2biomodels/tests/test_steady_state.py +86 -0
- aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +67 -0
- aiagents4pharma/talk2biomodels/tools/__init__.py +17 -0
- aiagents4pharma/talk2biomodels/tools/ask_question.py +125 -0
- aiagents4pharma/talk2biomodels/tools/custom_plotter.py +165 -0
- aiagents4pharma/talk2biomodels/tools/get_annotation.py +342 -0
- aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +159 -0
- aiagents4pharma/talk2biomodels/tools/load_arguments.py +134 -0
- aiagents4pharma/talk2biomodels/tools/load_biomodel.py +44 -0
- aiagents4pharma/talk2biomodels/tools/parameter_scan.py +310 -0
- aiagents4pharma/talk2biomodels/tools/query_article.py +64 -0
- aiagents4pharma/talk2biomodels/tools/save_model.py +98 -0
- aiagents4pharma/talk2biomodels/tools/search_models.py +96 -0
- aiagents4pharma/talk2biomodels/tools/simulate_model.py +137 -0
- aiagents4pharma/talk2biomodels/tools/steady_state.py +187 -0
- aiagents4pharma/talk2biomodels/tools/utils.py +23 -0
- aiagents4pharma/talk2cells/README.md +1 -0
- aiagents4pharma/talk2cells/__init__.py +5 -0
- aiagents4pharma/talk2cells/agents/__init__.py +6 -0
- aiagents4pharma/talk2cells/agents/scp_agent.py +87 -0
- aiagents4pharma/talk2cells/states/__init__.py +6 -0
- aiagents4pharma/talk2cells/states/state_talk2cells.py +15 -0
- aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +22 -0
- aiagents4pharma/talk2cells/tools/__init__.py +6 -0
- aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +6 -0
- aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +27 -0
- aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
- aiagents4pharma/talk2knowledgegraphs/Dockerfile +131 -0
- aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
- aiagents4pharma/talk2knowledgegraphs/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +99 -0
- aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +62 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +79 -0
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +13 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +24 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +33 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +43 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +9 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/__init__.py +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/default.yaml +61 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +607 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +25 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +212 -0
- aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +210 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
- aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
- aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +180 -0
- aiagents4pharma/talk2knowledgegraphs/install.md +165 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +886 -0
- aiagents4pharma/talk2knowledgegraphs/states/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +40 -0
- aiagents4pharma/talk2knowledgegraphs/tests/__init__.py +0 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +318 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +248 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +33 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +86 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +125 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +257 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +1444 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +159 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +152 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +201 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_database_milvus_connection_manager.py +812 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +51 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +49 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +59 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +63 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py +47 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +40 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +94 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +70 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +45 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +48 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +759 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +123 -0
- aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +11 -0
- aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +138 -0
- aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py +22 -0
- aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +965 -0
- aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +291 -0
- aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +123 -0
- aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/database/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/database/milvus_connection_manager.py +586 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +81 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +111 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +54 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +87 -0
- aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +73 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +12 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +37 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +129 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +89 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +78 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +71 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +98 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +5 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +762 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +298 -0
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +229 -0
- aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +67 -0
- aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +104 -0
- aiagents4pharma/talk2scholars/.dockerignore +13 -0
- aiagents4pharma/talk2scholars/Dockerfile +104 -0
- aiagents4pharma/talk2scholars/README.md +1 -0
- aiagents4pharma/talk2scholars/__init__.py +7 -0
- aiagents4pharma/talk2scholars/agents/__init__.py +13 -0
- aiagents4pharma/talk2scholars/agents/main_agent.py +89 -0
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +96 -0
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +101 -0
- aiagents4pharma/talk2scholars/agents/s2_agent.py +135 -0
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +127 -0
- aiagents4pharma/talk2scholars/configs/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +52 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +44 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +19 -0
- aiagents4pharma/talk2scholars/configs/app/__init__.py +7 -0
- aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +72 -0
- aiagents4pharma/talk2scholars/configs/config.yaml +16 -0
- aiagents4pharma/talk2scholars/configs/tools/__init__.py +21 -0
- aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/paper_download/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +62 -0
- aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/default.yaml +12 -0
- aiagents4pharma/talk2scholars/configs/tools/search/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +26 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +57 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
- aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
- aiagents4pharma/talk2scholars/install.md +122 -0
- aiagents4pharma/talk2scholars/state/__init__.py +7 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +98 -0
- aiagents4pharma/talk2scholars/tests/__init__.py +3 -0
- aiagents4pharma/talk2scholars/tests/test_agents_main_agent.py +256 -0
- aiagents4pharma/talk2scholars/tests/test_agents_paper_agents_download_agent.py +139 -0
- aiagents4pharma/talk2scholars/tests/test_agents_pdf_agent.py +114 -0
- aiagents4pharma/talk2scholars/tests/test_agents_s2_agent.py +198 -0
- aiagents4pharma/talk2scholars/tests/test_agents_zotero_agent.py +160 -0
- aiagents4pharma/talk2scholars/tests/test_s2_tools_display_dataframe.py +91 -0
- aiagents4pharma/talk2scholars/tests/test_s2_tools_query_dataframe.py +191 -0
- aiagents4pharma/talk2scholars/tests/test_states_state.py +38 -0
- aiagents4pharma/talk2scholars/tests/test_tools_paper_downloader.py +507 -0
- aiagents4pharma/talk2scholars/tests/test_tools_question_and_answer_tool.py +105 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_multi.py +307 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_retrieve.py +67 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_search.py +286 -0
- aiagents4pharma/talk2scholars/tests/test_tools_s2_single.py +298 -0
- aiagents4pharma/talk2scholars/tests/test_utils_arxiv_downloader.py +469 -0
- aiagents4pharma/talk2scholars/tests/test_utils_base_paper_downloader.py +598 -0
- aiagents4pharma/talk2scholars/tests/test_utils_biorxiv_downloader.py +669 -0
- aiagents4pharma/talk2scholars/tests/test_utils_medrxiv_downloader.py +500 -0
- aiagents4pharma/talk2scholars/tests/test_utils_nvidia_nim_reranker.py +117 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_answer_formatter.py +67 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_batch_processor.py +92 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_collection_manager.py +173 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_document_processor.py +68 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_generate_answer.py +72 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_gpu_detection.py +129 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_paper_loader.py +116 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_rag_pipeline.py +88 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_retrieve_chunks.py +190 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_singleton_manager.py +159 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_normalization.py +121 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_store.py +406 -0
- aiagents4pharma/talk2scholars/tests/test_utils_pubmed_downloader.py +1007 -0
- aiagents4pharma/talk2scholars/tests/test_utils_read_helper_utils.py +106 -0
- aiagents4pharma/talk2scholars/tests/test_utils_s2_utils_ext_ids.py +403 -0
- aiagents4pharma/talk2scholars/tests/test_utils_tool_helper_utils.py +85 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_human_in_the_loop.py +266 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_path.py +496 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_pdf_downloader_utils.py +46 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_read.py +743 -0
- aiagents4pharma/talk2scholars/tests/test_utils_zotero_write.py +151 -0
- aiagents4pharma/talk2scholars/tools/__init__.py +9 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +12 -0
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +442 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +22 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +207 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +336 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +313 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +196 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +323 -0
- aiagents4pharma/talk2scholars/tools/pdf/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +170 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +37 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +198 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +59 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +150 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +97 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +113 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +197 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +86 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +150 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +327 -0
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +21 -0
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +110 -0
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +111 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +233 -0
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +128 -0
- aiagents4pharma/talk2scholars/tools/s2/search.py +101 -0
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +102 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +5 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +223 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +205 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +216 -0
- aiagents4pharma/talk2scholars/tools/zotero/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +7 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +270 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +74 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +194 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +180 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +133 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +105 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +162 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +91 -0
- aiagents4pharma-0.0.0.dist-info/METADATA +335 -0
- aiagents4pharma-0.0.0.dist-info/RECORD +336 -0
- aiagents4pharma-0.0.0.dist-info/WHEEL +4 -0
- aiagents4pharma-0.0.0.dist-info/licenses/LICENSE +21 -0
aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py
@@ -0,0 +1,323 @@

#!/usr/bin/env python3
"""
PubMed paper downloader implementation.
"""

import logging
import xml.etree.ElementTree as ET
from typing import Any, cast

import requests
from bs4 import BeautifulSoup, Tag

from .base_paper_downloader import BasePaperDownloader

logger = logging.getLogger(__name__)


class PubmedDownloader(BasePaperDownloader):
    """PubMed-specific implementation of paper downloader."""

    def __init__(self, config: Any):
        """Initialize PubMed downloader with configuration."""
        super().__init__(config)
        self.id_converter_url = config.id_converter_url
        self.oa_api_url = config.oa_api_url

        # Alternative PDF sources
        self.europe_pmc_base_url = config.europe_pmc_base_url
        self.pmc_page_base_url = config.pmc_page_base_url
        self.direct_pmc_pdf_base_url = config.direct_pmc_pdf_base_url

        # URL conversion for NCBI FTP links
        self.ftp_base_url = config.ftp_base_url
        self.https_base_url = config.https_base_url
        # Configuration values
        self.id_converter_format = getattr(config, "id_converter_format", "json")
        self.pdf_meta_name = getattr(config, "pdf_meta_name", "citation_pdf_url")
        self.default_error_code = getattr(config, "default_error_code", "unknown")

    def fetch_metadata(self, identifier: str) -> dict[str, Any]:
        """
        Fetch paper metadata from PubMed ID Converter API.

        Args:
            identifier: PMID (e.g., '12345678')

        Returns:
            JSON response from PMC ID Converter API

        Raises:
            requests.RequestException: If API call fails
            RuntimeError: If no records found in response
        """
        query_url = f"{self.id_converter_url}?ids={identifier}&format={self.id_converter_format}"
        logger.info("Fetching metadata from ID converter for PMID %s: %s", identifier, query_url)

        response = requests.get(query_url, timeout=self.request_timeout)
        response.raise_for_status()

        result = response.json()
        logger.info("ID converter response for PMID %s: %s", identifier, result)

        if "records" not in result or not result["records"]:
            raise RuntimeError("No records found in PMC ID Converter API response")

        return result

    def construct_pdf_url(self, metadata: dict[str, Any], identifier: str) -> str:
        """
        Construct PDF URL using multiple fallback strategies.

        Args:
            metadata: JSON response from ID converter
            identifier: PMID

        Returns:
            PDF URL string (empty if no PDF available)
        """
        if "records" not in metadata or not metadata["records"]:
            return ""

        record = metadata["records"][0]
        pmcid = record.get("pmcid", "")

        if not pmcid or pmcid == "N/A":
            logger.info("No PMCID available for PDF fetch: PMID %s", identifier)
            return ""

        return self._fetch_pdf_url_with_fallbacks(pmcid)

    def _fetch_pdf_url_with_fallbacks(self, pmcid: str) -> str:
        """
        Fetch PDF URL from OA API with comprehensive fallback strategies.

        Args:
            pmcid: PMC ID (e.g., 'PMC1234567')

        Returns:
            PDF URL string (empty if all strategies fail)
        """
        logger.info("Fetching PDF URL for PMCID: %s", pmcid)

        # Strategy 1: Official OA API (fastest when it works)
        pdf_url = self._try_oa_api(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 2: Europe PMC Service (most reliable fallback)
        pdf_url = self._try_europe_pmc(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 3: Scrape PMC page for citation_pdf_url meta tag
        pdf_url = self._try_pmc_page_scraping(pmcid)
        if pdf_url:
            return pdf_url

        # Strategy 4: Direct PMC PDF URL pattern (least reliable)
        pdf_url = self._try_direct_pmc_url(pmcid)
        if pdf_url:
            return pdf_url

        logger.warning("All PDF URL strategies failed for PMCID: %s", pmcid)
        return ""

    def _try_oa_api(self, pmcid: str) -> str:
        """Try to get PDF URL from official OA API."""
        query_url = f"{self.oa_api_url}?id={pmcid}"
        logger.info("Trying OA API for PMCID %s: %s", pmcid, query_url)

        try:
            response = requests.get(query_url, timeout=self.request_timeout)
            response.raise_for_status()

            logger.info("OA API response for PMCID %s: %s", pmcid, response.text[:500])

            # Parse XML response
            root = ET.fromstring(response.text)

            # Check for error first
            error_elem = root.find(".//error")
            if error_elem is not None:
                error_code = error_elem.get("code", self.default_error_code)
                error_text = error_elem.text or "unknown error"
                logger.info("OA API error for PMCID %s: %s - %s", pmcid, error_code, error_text)
                return ""

            # Look for PDF link
            pdf_link = root.find(".//link[@format='pdf']")
            if pdf_link is not None:
                pdf_url = pdf_link.get("href", "")
                logger.info("Found PDF URL from OA API for PMCID %s: %s", pmcid, pdf_url)

                # Convert FTP links to HTTPS for download compatibility
                if pdf_url.startswith(self.ftp_base_url):
                    pdf_url = pdf_url.replace(self.ftp_base_url, self.https_base_url)
                    logger.info("Converted FTP to HTTPS for %s: %s", pmcid, pdf_url)

                return pdf_url

        except requests.RequestException as e:
            logger.info("OA API failed for %s: %s", pmcid, str(e))

        return ""

    def _try_europe_pmc(self, pmcid: str) -> str:
        """Try Europe PMC service for PDF."""
        europe_pmc_url = f"{self.europe_pmc_base_url}?accid={pmcid}&blobtype=pdf"
        logger.info("Trying Europe PMC service for %s: %s", pmcid, europe_pmc_url)

        try:
            response = requests.head(europe_pmc_url, timeout=self.request_timeout)
            if response.status_code == 200:
                logger.info("Europe PMC service works for %s", pmcid)
                return europe_pmc_url
        except requests.RequestException as e:
            logger.info("Europe PMC service failed for %s: %s", pmcid, str(e))

        return ""

    def _try_pmc_page_scraping(self, pmcid: str) -> str:
        """Try scraping PMC page for PDF meta tag."""
        pmc_page_url = f"{self.pmc_page_base_url}/{pmcid}/"
        logger.info("Scraping PMC page for PDF meta tag for %s: %s", pmcid, pmc_page_url)

        try:
            headers = {"User-Agent": self.user_agent}
            response = requests.get(pmc_page_url, headers=headers, timeout=self.request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")

            # Look for PDF meta tag
            pdf_meta = soup.find("meta", attrs={"name": self.pdf_meta_name})
            if pdf_meta is not None:
                # Cast to Tag to help type checker understand this is a BeautifulSoup Tag object
                meta_tag = cast(Tag, pdf_meta)
                content = meta_tag.get("content")
                if content:
                    logger.info(
                        "Found %s meta tag for %s: %s",
                        self.pdf_meta_name,
                        pmcid,
                        content,
                    )
                    return str(content)

        except requests.RequestException as e:
            logger.info("PMC page scraping failed for %s: %s", pmcid, str(e))

        return ""

    def _try_direct_pmc_url(self, pmcid: str) -> str:
        """Try direct PMC PDF URL pattern."""
        direct_pmc_url = f"{self.direct_pmc_pdf_base_url}/{pmcid}/pdf/"
        logger.info("Trying direct PMC PDF URL for %s: %s", pmcid, direct_pmc_url)

        try:
            response = requests.head(direct_pmc_url, timeout=self.request_timeout)
            if response.status_code == 200:
                logger.info("Direct PMC PDF URL works for %s", pmcid)
                return direct_pmc_url
        except requests.RequestException as e:
            logger.info("Direct PMC PDF URL failed for %s: %s", pmcid, str(e))

        return ""

    def extract_paper_metadata(
        self,
        metadata: dict[str, Any],
        identifier: str,
        pdf_result: tuple[str, str] | None,
    ) -> dict[str, Any]:
        """
        Extract structured metadata from PubMed ID converter response.

        Args:
            metadata: JSON response from ID converter
            identifier: PMID
            pdf_result: Tuple of (temp_file_path, filename) if PDF downloaded

        Returns:
            Standardized paper metadata dictionary
        """
        if "records" not in metadata or not metadata["records"]:
            raise RuntimeError("No records found in metadata")

        record = metadata["records"][0]  # Get first (and should be only) record

        # Extract basic fields from ID converter
        pmcid = record.get("pmcid", "N/A")
        doi = record.get("doi", "N/A")

        # Handle PDF download results
        if pdf_result:
            temp_file_path, filename = pdf_result
            access_type = "open_access_downloaded"
            pdf_url = temp_file_path  # Use local temp file path
        else:
            temp_file_path = ""
            filename = self.get_default_filename(identifier)
            access_type = "abstract_only" if pmcid != "N/A" else "no_pmcid"
            pdf_url = ""

        # Note: For PubMed, we don't get title/authors from ID converter
        # In a real implementation, you might want to call E-utilities for full metadata
        # For now, we'll use placeholders and focus on the ID conversion functionality

        return {
            "Title": (
                f"PubMed Article {identifier}"
            ),  # Placeholder - would need E-utilities for real title
            "Authors": [],  # Placeholder - would need E-utilities for real authors
            "Abstract": "Abstract available in PubMed",  # Placeholder
            "Publication Date": "N/A",  # Would need E-utilities for this
            "PMID": identifier,
            "PMCID": pmcid,
            "DOI": doi,
            "Journal": "N/A",  # Would need E-utilities for this
            "URL": pdf_url,
            "pdf_url": pdf_url,
            "access_type": access_type,
            "filename": filename,
            "source": "pubmed",
            "temp_file_path": temp_file_path,
        }

    def get_service_name(self) -> str:
        """Return service name."""
        return "PubMed"

    def get_identifier_name(self) -> str:
        """Return identifier display name."""
        return "PMID"

    def get_default_filename(self, identifier: str) -> str:
        """Generate default filename for PubMed paper."""
        return f"pmid_{identifier}.pdf"

    def get_snippet(self, abstract: str) -> str:
        """Override to handle PubMed-specific abstract placeholder."""
        if not abstract or abstract == "N/A" or abstract == "Abstract available in PubMed":
            return ""
        return super().get_snippet(abstract)

    def _get_paper_identifier_info(self, paper: dict[str, Any]) -> str:
        """Get PubMed-specific identifier info for paper summary."""
        pmid = paper.get("PMID", "N/A")
        pmcid = paper.get("PMCID", "N/A")

        info = f" (PMID: {pmid})"
        if pmcid != "N/A":
            info += f"\n PMCID: {pmcid}"

        return info

    def _add_service_identifier(self, entry: dict[str, Any], identifier: str) -> None:
        """Add PMID and PubMed-specific fields to entry."""
        entry["PMID"] = identifier
        entry["PMCID"] = "N/A"
        entry["DOI"] = "N/A"
        entry["Journal"] = "N/A"
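For orientation, the snippet below is a minimal standalone sketch (not part of the package) of the two patterns PubmedDownloader uses above: the PMID-to-PMCID lookup via the PMC ID Converter and the FTP-to-HTTPS normalization of OA links. The endpoint and base URLs, the timeout, and the example PMID are assumptions for illustration; in the package these values come from the Hydra config and the base class.

# Standalone sketch of the ID-converter lookup and FTP -> HTTPS rewrite.
# All URLs and the example PMID below are assumptions, not package config values.
import requests

ID_CONVERTER_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"  # assumed endpoint
FTP_BASE_URL = "ftp://ftp.ncbi.nlm.nih.gov"      # assumed prefix to rewrite
HTTPS_BASE_URL = "https://ftp.ncbi.nlm.nih.gov"  # assumed HTTPS equivalent


def lookup_pmcid(pmid: str, timeout: int = 10) -> str:
    """Return the PMCID for a PMID, or '' if the converter has no record."""
    response = requests.get(f"{ID_CONVERTER_URL}?ids={pmid}&format=json", timeout=timeout)
    response.raise_for_status()
    records = response.json().get("records", [])
    return records[0].get("pmcid", "") if records else ""


def normalize_ftp_link(pdf_url: str) -> str:
    """Rewrite an NCBI FTP link to HTTPS so it can be fetched over requests."""
    if pdf_url.startswith(FTP_BASE_URL):
        return pdf_url.replace(FTP_BASE_URL, HTTPS_BASE_URL)
    return pdf_url


if __name__ == "__main__":
    print(lookup_pmcid("31452104"))  # hypothetical PMID, used only to exercise the call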
aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py
@@ -0,0 +1,170 @@

"""
LangGraph PDF Retrieval-Augmented Generation (RAG) Tool

This tool answers user questions using the traditional RAG pipeline:
1. Retrieve relevant chunks from ALL papers in the vector store
2. Rerank chunks using NVIDIA NIM reranker to find the most relevant ones
3. Generate answer using the top reranked chunks

Traditional RAG Pipeline Flow:
Query → Retrieve chunks from ALL papers → Rerank chunks → Generate answer

This ensures the best possible chunks are selected across all available papers,
not just from pre-selected papers.
"""

import logging
import os
import time
from typing import Annotated, Any

from langchain_core.messages import ToolMessage
from langchain_core.tools import tool
from langchain_core.tools.base import InjectedToolCallId
from langgraph.prebuilt import InjectedState
from langgraph.types import Command
from pydantic import BaseModel, Field

from .utils.answer_formatter import format_answer
from .utils.generate_answer import load_hydra_config
from .utils.paper_loader import load_all_papers
from .utils.rag_pipeline import retrieve_and_rerank_chunks
from .utils.tool_helper import QAToolHelper

# Helper for managing state, vectorstore, reranking, and formatting
helper = QAToolHelper()
# Load configuration and start logging
config = load_hydra_config()

# Set up logging with configurable level
log_level = os.environ.get("LOG_LEVEL", "INFO")
logging.basicConfig(level=getattr(logging, log_level))
logger = logging.getLogger(__name__)
logger.setLevel(getattr(logging, log_level))


class QuestionAndAnswerInput(BaseModel):
    """
    Pydantic schema for the PDF Q&A tool inputs.

    Fields:
        question: User's free-text query to answer based on PDF content.
        tool_call_id: LangGraph-injected call identifier for tracking.
        state: Shared agent state dict containing:
            - article_data: metadata mapping of paper IDs to info (e.g., 'pdf_url', title).
            - text_embedding_model: embedding model instance for chunk indexing.
            - llm_model: chat/LLM instance for answer generation.
    """

    question: str = Field(description="User question for generating a PDF-based answer.")
    tool_call_id: Annotated[str, InjectedToolCallId]
    state: Annotated[dict, InjectedState]


@tool(args_schema=QuestionAndAnswerInput, parse_docstring=True)
def question_and_answer(
    question: str,
    state: Annotated[dict, InjectedState],
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    LangGraph tool for Retrieval-Augmented Generation over PDFs using traditional RAG pipeline.

    Traditional RAG Pipeline Implementation:
    1. Load ALL available PDFs into Milvus vector store (if not already loaded)
    2. Retrieve relevant chunks from ALL papers using vector similarity search
    3. Rerank retrieved chunks using NVIDIA NIM semantic reranker
    4. Generate answer using top reranked chunks with source attribution

    This approach ensures the best chunks are selected across all available papers,
    rather than pre-selecting papers and potentially missing relevant information.

    Args:
        question (str): The free-text question to answer.
        state (dict): Injected agent state; must include:
            - article_data: mapping paper IDs → metadata (pdf_url, title, etc.)
            - text_embedding_model: embedding model instance.
            - llm_model: chat/LLM instance.
        tool_call_id (str): Internal identifier for this tool invocation.

    Returns:
        Command[Any]: updates conversation state with a ToolMessage(answer).

    Raises:
        ValueError: when required models or metadata are missing in state.
        RuntimeError: when no relevant chunks can be retrieved for the query.
    """
    call_id = f"qa_call_{time.time()}"
    logger.info(
        "Starting PDF Question and Answer tool (Traditional RAG Pipeline) - Call %s",
        call_id,
    )
    logger.info("%s: Question: '%s'", call_id, question)

    helper.start_call(config, call_id)

    # Extract models and article metadata
    text_emb, llm_model, article_data = helper.get_state_models_and_data(state)

    # Initialize or reuse Milvus vector store
    logger.info("%s: Initializing vector store", call_id)
    vs = helper.init_vector_store(text_emb)

    # Load ALL papers (traditional RAG approach)
    logger.info(
        "%s: Loading all %d papers into vector store (traditional RAG approach)",
        call_id,
        len(article_data),
    )
    load_all_papers(
        vector_store=vs,
        articles=article_data,
        call_id=call_id,
        config=config,
        has_gpu=helper.has_gpu,
    )

    # Traditional RAG Pipeline: Retrieve from ALL papers, then rerank
    logger.info(
        "%s: Starting traditional RAG pipeline: retrieve → rerank → generate",
        call_id,
    )

    # Retrieve and rerank chunks in one step
    reranked_chunks = retrieve_and_rerank_chunks(vs, question, config, call_id, helper.has_gpu)

    if not reranked_chunks:
        msg = f"No relevant chunks found for question: '{question}'"
        logger.warning("%s: %s", call_id, msg)

    # Generate answer using reranked chunks
    logger.info(
        "%s: Generating answer using %d reranked chunks",
        call_id,
        len(reranked_chunks),
    )
    response_text = format_answer(
        question,
        reranked_chunks,
        llm_model,
        article_data,
        config,
        call_id=call_id,
        has_gpu=helper.has_gpu,
    )

    logger.info(
        "%s: Successfully completed traditional RAG pipeline",
        call_id,
    )

    return Command(
        update={
            "messages": [
                ToolMessage(
                    content=response_text,
                    tool_call_id=tool_call_id,
                )
            ],
        }
    )
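The question_and_answer tool above pulls everything it needs from the injected LangGraph state. The sketch below is a hedged illustration of the minimal state shape the calling agent would have to supply, with field names taken from the docstring; the paper entry, URL, and model classes are assumed stand-ins, not necessarily what the package's pdf_agent wires in.

# Hedged sketch of the injected state the tool expects. The OpenAI model classes are an
# assumption (any LangChain-compatible embedding and chat models would fit) and require
# OPENAI_API_KEY in the environment; the paper ID and pdf_url are hypothetical.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

state = {
    "article_data": {
        "paper_1": {  # hypothetical paper ID
            "Title": "Example paper title",
            "pdf_url": "https://example.org/paper_1.pdf",  # hypothetical URL
            "source": "pubmed",
        },
    },
    "text_embedding_model": OpenAIEmbeddings(model="text-embedding-3-small"),
    "llm_model": ChatOpenAI(model="gpt-4o-mini", temperature=0),
}
# The agent framework injects this state and a tool_call_id when it routes a user question
# to question_and_answer; the tool then loads the PDFs, retrieves and reranks chunks, and
# returns a Command carrying a ToolMessage with the answer.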
aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py
@@ -0,0 +1,37 @@

"""
Utility modules for the PDF question_and_answer tool.
"""

from . import (
    answer_formatter,
    batch_processor,
    collection_manager,
    generate_answer,
    get_vectorstore,
    gpu_detection,
    nvidia_nim_reranker,
    paper_loader,
    rag_pipeline,
    retrieve_chunks,
    singleton_manager,
    tool_helper,
    vector_normalization,
    vector_store,
)

__all__ = [
    "answer_formatter",
    "batch_processor",
    "collection_manager",
    "generate_answer",
    "get_vectorstore",
    "gpu_detection",
    "nvidia_nim_reranker",
    "paper_loader",
    "rag_pipeline",
    "retrieve_chunks",
    "singleton_manager",
    "tool_helper",
    "vector_normalization",
    "vector_store",
]
aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py
@@ -0,0 +1,62 @@

"""
Format the final answer text with source attributions and hardware info.
"""

import logging
from typing import Any

from .generate_answer import generate_answer

logger = logging.getLogger(__name__)


def format_answer(
    question: str,
    chunks: list[Any],
    llm: Any,
    articles: dict[str, Any],
    config: Any,
    **kwargs: Any,
) -> str:
    """
    Generate the final answer text with source attributions and hardware info.

    Expects `call_id` and `has_gpu` in kwargs.
    """
    result = generate_answer(question, chunks, llm, config)
    answer = result.get("output_text", "No answer generated.")

    # Get unique paper titles for source attribution
    titles: dict[str, str] = {}
    for pid in result.get("papers_used", []):
        if pid in articles:
            titles[pid] = articles[pid].get("Title", "Unknown paper")

    # Format sources
    if titles:
        srcs = "\n\nSources:\n" + "\n".join(f"- {t}" for t in titles.values())
    else:
        srcs = ""

    # Extract logging metadata
    call_id = kwargs.get("call_id", "<no-call-id>")
    has_gpu = kwargs.get("has_gpu", False)
    hardware_info = "GPU-accelerated" if has_gpu else "CPU-processed"

    # Log final statistics with hardware info
    logger.info(
        "%s: Generated answer using %d chunks from %d papers (%s)",
        call_id,
        len(chunks),
        len(titles),
        hardware_info,
    )

    # Add subtle hardware info to logs but not to user output
    logger.debug(
        "%s: Answer generation completed with %s processing",
        call_id,
        hardware_info,
    )

    return f"{answer}{srcs}"
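format_answer above assumes generate_answer returns a dict containing output_text and papers_used, and builds the sources block from the titles of the papers that were actually used. The snippet below is an illustration only, with hypothetical values, of that return contract and of how the attribution block is assembled.

# Illustration of the generate_answer return shape format_answer relies on.
# All IDs, titles, and the answer text are hypothetical.
result = {
    "output_text": "The study reports a dose-dependent effect.",  # hypothetical answer
    "papers_used": ["paper_1", "paper_2"],                        # hypothetical paper IDs
}
articles = {
    "paper_1": {"Title": "Example paper one"},
    "paper_2": {"Title": "Example paper two"},
}

# Mirror the attribution logic: map used paper IDs to titles, then append a sources block.
titles = {
    pid: articles[pid].get("Title", "Unknown paper")
    for pid in result["papers_used"]
    if pid in articles
}
srcs = ("\n\nSources:\n" + "\n".join(f"- {t}" for t in titles.values())) if titles else ""
print(result["output_text"] + srcs)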