aiagents4pharma 1.31.0__py3-none-any.whl → 1.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +90 -0
- aiagents4pharma/talk2scholars/agents/main_agent.py +4 -3
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +3 -4
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +6 -7
- aiagents4pharma/talk2scholars/agents/s2_agent.py +23 -20
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +11 -11
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +19 -19
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +20 -15
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +27 -6
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +7 -7
- aiagents4pharma/talk2scholars/tests/test_main_agent.py +16 -16
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +17 -24
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +152 -135
- aiagents4pharma/talk2scholars/tests/test_pdf_agent.py +9 -16
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +790 -218
- aiagents4pharma/talk2scholars/tests/test_s2_agent.py +9 -9
- aiagents4pharma/talk2scholars/tests/test_s2_display.py +8 -8
- aiagents4pharma/talk2scholars/tests/test_s2_query.py +8 -8
- aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +12 -12
- aiagents4pharma/talk2scholars/tests/test_zotero_path.py +11 -12
- aiagents4pharma/talk2scholars/tests/test_zotero_read.py +400 -22
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +0 -6
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +89 -31
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +540 -156
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +4 -4
- aiagents4pharma/talk2scholars/tools/s2/{display_results.py → display_dataframe.py} +19 -21
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +71 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +213 -35
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +3 -3
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/METADATA +3 -1
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/RECORD +37 -37
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/WHEEL +1 -1
- aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py +0 -45
- aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py +0 -115
- aiagents4pharma/talk2scholars/tools/s2/query_results.py +0 -61
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/top_level.txt +0 -0
```diff
--- a/aiagents4pharma/talk2scholars/tools/s2/__init__.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/__init__.py
@@ -2,18 +2,18 @@
 This file is used to import all the modules in the package.
 """
 
-from . import display_results
+from . import display_dataframe
 from . import multi_paper_rec
 from . import search
 from . import single_paper_rec
-from . import query_results
+from . import query_dataframe
 from . import retrieve_semantic_scholar_paper_id
 
 __all__ = [
-    "display_results",
+    "display_dataframe",
     "multi_paper_rec",
     "search",
     "single_paper_rec",
-    "query_results",
+    "query_dataframe",
     "retrieve_semantic_scholar_paper_id",
 ]
```
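The hunk above amounts to a rename of the package's public surface: `display_results` becomes `display_dataframe` and `query_results` becomes `query_dataframe` (matching the file rename and deletion in the file list). A minimal sketch of what downstream imports look like after upgrading; the module path comes from the file list, and the snippet is illustrative rather than part of the package:

```python
# Illustrative only: the renamed s2 tool modules in 1.33.0.
# Code still importing display_results or query_results will raise ImportError.
from aiagents4pharma.talk2scholars.tools.s2 import (
    display_dataframe,
    query_dataframe,
)
```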
```diff
--- a/aiagents4pharma/talk2scholars/tools/s2/display_results.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py
@@ -2,11 +2,13 @@
 
 
 """
-Tool for
+Tool for rendering the most recently displayed papers as a DataFrame artifact for the front-end.
 
-This module defines a tool that retrieves
-
-
+This module defines a tool that retrieves the paper metadata stored under the state key
+'last_displayed_papers' and returns it as an artifact (dictionary of papers). The front-end
+can then render this artifact as a pandas DataFrame for display. If no papers are found,
+a NoPapersFoundError is raised to indicate that a search or recommendation should be
+performed first.
 """
 
 
@@ -38,34 +40,30 @@ class NoPapersFoundError(Exception):
     """
 
 
-@tool("
-def
+@tool("display_dataframe", parse_docstring=True)
+def display_dataframe(
     tool_call_id: Annotated[str, InjectedToolCallId],
     state: Annotated[dict, InjectedState],
 ) -> Command:
     """
-
+    Render the last set of retrieved papers as a DataFrame in the front-end.
 
-    This function
-
-
+    This function reads the 'last_displayed_papers' key from state, fetches the
+    corresponding metadata dictionary, and returns a Command with a ToolMessage
+    containing the artifact (dictionary) for the front-end to render as a DataFrame.
+    If no papers are found in state, it raises a NoPapersFoundError to indicate
+    that a search or recommendation must be performed first.
 
     Args:
-        tool_call_id (
-        state (dict): The agent's state containing
+        tool_call_id (InjectedToolCallId): Unique ID of this tool invocation.
+        state (dict): The agent's state containing the 'last_displayed_papers' reference.
 
     Returns:
-        Command: A command
-
+        Command: A command whose update contains a ToolMessage with the artifact
+            (papers dict) for DataFrame rendering in the UI.
 
     Raises:
-        NoPapersFoundError: If no
-
-    Example:
-        >>> state = {"last_displayed_papers": {"paper1": "Title 1", "paper2": "Title 2"}}
-        >>> result = display_results(tool_call_id="123", state=state)
-        >>> print(result.update["messages"][0].content)
-        "2 papers found. Papers are attached as an artifact."
+        NoPapersFoundError: If no entries exist under 'last_displayed_papers' in state.
     """
     logger.info("Displaying papers")
     context_key = state.get("last_displayed_papers")
```
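Note the indirection in the last line of the hunk: `last_displayed_papers` holds the *name* of the state key that contains the papers, not the papers themselves. A minimal sketch of that lookup, assuming a plain dict in place of the real agent state:

```python
# Sketch of the state indirection used by display_dataframe (not package code).
state = {
    "article_data": {"key1": {"Title": "Paper 1"}, "key2": {"Title": "Paper 2"}},
    "last_displayed_papers": "article_data",  # a pointer to another state key
}

context_key = state.get("last_displayed_papers")  # -> "article_data"
papers = state.get(context_key)                   # -> the actual papers dict
assert len(papers) == 2
```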
```diff
--- /dev/null
+++ b/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+
+"""
+Tool for querying the metadata table of the last displayed papers.
+
+This tool loads the most recently displayed papers into a pandas DataFrame and uses an
+LLM-driven pandas agent to answer metadata-level questions (e.g., filter by author, list titles).
+It is intended for metadata exploration only, and does not perform content-based retrieval
+or summarization. For PDF-level question answering, use the 'question_and_answer_agent'.
+"""
+
+import logging
+from typing import Annotated
+import pandas as pd
+from langchain_experimental.agents import create_pandas_dataframe_agent
+from langchain_core.tools import tool
+from langgraph.prebuilt import InjectedState
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class NoPapersFoundError(Exception):
+    """Exception raised when no papers are found in the state."""
+
+
+@tool("query_dataframe", parse_docstring=True)
+def query_dataframe(question: str, state: Annotated[dict, InjectedState]) -> str:
+    """
+    Perform a tabular query on the most recently displayed papers.
+
+    This function loads the last displayed papers into a pandas DataFrame and uses a
+    pandas DataFrame agent to answer metadata-level questions (e.g., "Which papers have
+    'Transformer' in the title?", "List authors of paper X"). It does not perform PDF
+    content analysis or summarization; for content-level question answering, use the
+    'question_and_answer_agent'.
+
+    Args:
+        question (str): The metadata query to ask over the papers table.
+        state (dict): The agent's state containing 'last_displayed_papers'
+            key referencing the metadata table in state.
+
+    Returns:
+        str: The LLM's response to the metadata query.
+
+    Raises:
+        NoPapersFoundError: If no papers have been displayed yet.
+    """
+    logger.info("Querying last displayed papers with question: %s", question)
+    llm_model = state.get("llm_model")
+    if not state.get("last_displayed_papers"):
+        logger.info("No papers displayed so far, raising NoPapersFoundError")
+        raise NoPapersFoundError(
+            "No papers found. A search needs to be performed first."
+        )
+    context_key = state.get("last_displayed_papers")
+    dic_papers = state.get(context_key)
+    df_papers = pd.DataFrame.from_dict(dic_papers, orient="index")
+    df_agent = create_pandas_dataframe_agent(
+        llm_model,
+        allow_dangerous_code=True,
+        agent_type="tool-calling",
+        df=df_papers,
+        max_iterations=5,
+        include_df_in_prompt=True,
+        number_of_head_rows=df_papers.shape[0],
+        verbose=True,
+    )
+    llm_result = df_agent.invoke(question, stream_mode=None)
+    return llm_result["output"]
```
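Since `query_dataframe` takes its LLM and data from injected state, a direct call is mainly meaningful in tests. A hedged invocation sketch, where the model class and all state values are stand-ins (any LangChain chat model should work here; `ChatOpenAI` and the model name are assumptions, not something this diff prescribes):

```python
# Hypothetical test-style invocation; at runtime LangGraph injects `state`.
from langchain_openai import ChatOpenAI  # assumed model; any chat model works

from aiagents4pharma.talk2scholars.tools.s2.query_dataframe import query_dataframe

state = {
    "llm_model": ChatOpenAI(model="gpt-4o-mini"),  # placeholder model choice
    "last_displayed_papers": "article_data",
    "article_data": {
        "key1": {"Title": "Attention Is All You Need", "Authors": ["Vaswani"]},
    },
}

# InjectedState args are excluded from the tool-call schema but can be
# supplied explicitly when invoking the tool directly.
answer = query_dataframe.invoke({"question": "List all titles.", "state": state})
print(answer)
```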
```diff
--- a/aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py
+++ b/aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py
@@ -5,16 +5,22 @@ Utility for zotero read tool.
 """
 
 import logging
-
+import tempfile
+from typing import Any, Dict, List, Tuple, Optional
+import concurrent.futures
+
 import hydra
+import requests
 from pyzotero import zotero
-from .zotero_path import get_item_collections
 
+from .zotero_path import get_item_collections
 
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# pylint: disable=broad-exception-caught
+
 
 class ZoteroSearchData:
     """Helper class to organize Zotero search-related data."""
@@ -33,8 +39,10 @@ class ZoteroSearchData:
         self.cfg = self._load_config()
         self.zot = self._init_zotero_client()
         self.item_to_collections = get_item_collections(self.zot)
-        self.
+        self.article_data = {}
         self.content = ""
+        # Create a session for connection pooling
+        self.session = requests.Session()
 
     def process_search(self) -> None:
         """Process the search request and prepare results."""
@@ -45,7 +53,7 @@ class ZoteroSearchData:
     def get_search_results(self) -> Dict[str, Any]:
         """Get the search results and content."""
         return {
-            "
+            "article_data": self.article_data,
             "content": self.content,
         }
 
@@ -97,50 +105,218 @@ class ZoteroSearchData:
 
         return items
 
+    def _download_zotero_pdf(self, attachment_key: str) -> Optional[Tuple[str, str]]:
+        """Download a PDF from Zotero by attachment key. Returns (file_path, filename) or None."""
+        zotero_pdf_url = (
+            f"https://api.zotero.org/users/{self.cfg.user_id}/items/"
+            f"{attachment_key}/file"
+        )
+        headers = {"Zotero-API-Key": self.cfg.api_key}
+
+        try:
+            # Use session for connection pooling
+            response = self.session.get(
+                zotero_pdf_url, headers=headers, stream=True, timeout=10
+            )
+            response.raise_for_status()
+
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                # Increased chunk size for better performance
+                for chunk in response.iter_content(chunk_size=16384):
+                    temp_file.write(chunk)
+                temp_file_path = temp_file.name
+
+            content_disp = response.headers.get("Content-Disposition", "")
+            filename = (
+                content_disp.split("filename=")[-1].strip('"')
+                if "filename=" in content_disp
+                else "downloaded.pdf"
+            )
+
+            return temp_file_path, filename
+
+        except Exception as e:
+            logger.error(
+                "Failed to download Zotero PDF for attachment %s: %s", attachment_key, e
+            )
+            return None
+
+    def _download_pdfs_in_parallel(
+        self, attachment_item_map: Dict[str, str]
+    ) -> Dict[str, Tuple[str, str, str]]:
+        """
+        Download multiple PDFs in parallel using ThreadPoolExecutor.
+
+        Args:
+            attachment_item_map: Dictionary mapping attachment keys to parent item keys
+
+        Returns:
+            Dictionary mapping parent item keys to (file_path, filename, attachment_key)
+        """
+        results = {}
+        max_workers = min(10, len(attachment_item_map))  # Set reasonable limit
+
+        if not attachment_item_map:
+            return results
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            # Create a dictionary mapping Future objects to attachment keys
+            future_to_key = {
+                executor.submit(self._download_zotero_pdf, attachment_key): (
+                    attachment_key,
+                    item_key,
+                )
+                for attachment_key, item_key in attachment_item_map.items()
+            }
+
+            for future in concurrent.futures.as_completed(future_to_key):
+                attachment_key, item_key = future_to_key[future]
+                try:
+                    result = future.result()
+                    if result:
+                        temp_file_path, resolved_filename = result
+                        results[item_key] = (
+                            temp_file_path,
+                            resolved_filename,
+                            attachment_key,
+                        )
+                except Exception as e:
+                    logger.error(
+                        "Failed to download PDF for key %s: %s", attachment_key, e
+                    )
+
+        return results
+
+    # pylint: disable=too-many-locals, too-many-branches
     def _filter_and_format_papers(self, items: List[Dict[str, Any]]) -> None:
-        """Filter and format papers from items."""
+        """Filter and format papers from Zotero items, including standalone PDFs."""
         filter_item_types = (
             self.cfg.zotero.filter_item_types if self.only_articles else []
         )
         logger.debug("Filtering item types: %s", filter_item_types)
 
+        # Maps to track attachments for batch processing
+        orphaned_pdfs = {}  # attachment_key -> item key (same for orphans)
+        item_attachments = {}  # item_key -> [attachment_keys]
+
+        # First pass: process all items without downloading PDFs
         for item in items:
             if not isinstance(item, dict):
                 continue
 
-            data = item.get("data")
-            if not isinstance(data, dict):
-                continue
-
+            data = item.get("data", {})
             item_type = data.get("itemType", "N/A")
-            logger.debug("Item type: %s", item_type)
-
             key = data.get("key")
             if not key:
                 continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # CASE 1: Top-level item (e.g., journalArticle)
+            if item_type != "attachment":
+                collection_paths = self.item_to_collections.get(key, ["/Unknown"])
+
+                self.article_data[key] = {
+                    "Title": data.get("title", "N/A"),
+                    "Abstract": data.get("abstractNote", "N/A"),
+                    "Publication Date": data.get("date", "N/A"),
+                    "URL": data.get("url", "N/A"),
+                    "Type": item_type,
+                    "Collections": collection_paths,
+                    "Citation Count": data.get("citationCount", "N/A"),
+                    "Venue": data.get("venue", "N/A"),
+                    "Publication Venue": data.get("publicationTitle", "N/A"),
+                    "Journal Name": data.get("journalAbbreviation", "N/A"),
+                    "Authors": [
+                        f"{creator.get('firstName', '')} {creator.get('lastName', '')}".strip()
+                        for creator in data.get("creators", [])
+                        if isinstance(creator, dict)
+                        and creator.get("creatorType") == "author"
+                    ],
+                    "source": "zotero",
+                }
+                # We'll collect attachment info in second pass
+
+            # CASE 2: Standalone orphaned PDF attachment
+            elif data.get("contentType") == "application/pdf" and not data.get(
+                "parentItem"
+            ):
+                attachment_key = key
+                filename = data.get("filename", "unknown.pdf")
+
+                # Add to orphaned PDFs for batch processing
+                orphaned_pdfs[attachment_key] = (
+                    attachment_key  # Same key as both attachment and "item"
+                )
+
+                # Create the entry without PDF info yet
+                self.article_data[key] = {
+                    "Title": filename,
+                    "Abstract": "No abstract available",
+                    "Publication Date": "N/A",
+                    "URL": "N/A",
+                    "Type": "orphan_attachment",
+                    "Collections": ["/(No Collection)"],
+                    "Citation Count": "N/A",
+                    "Venue": "N/A",
+                    "Publication Venue": "N/A",
+                    "Journal Name": "N/A",
+                    "Authors": ["(Unknown)"],
+                    "source": "zotero",
+                }
 
-
+        # Second pass: collect attachment info for all items
+        for item_key, item_data in self.article_data.items():
+            if item_data["Type"] != "orphan_attachment":
+                try:
+                    children = self.zot.children(item_key)
+                    pdf_attachments = [
+                        child
+                        for child in children
+                        if isinstance(child, dict)
+                        and child.get("data", {}).get("contentType")
+                        == "application/pdf"
+                    ]
+
+                    if pdf_attachments:
+                        attachment = pdf_attachments[0]
+                        attachment_data = attachment.get("data", {})
+                        attachment_key = attachment_data.get("key")
+                        filename = attachment_data.get("filename", "unknown.pdf")
+
+                        if attachment_key:
+                            # Add to item attachments map
+                            item_attachments[attachment_key] = item_key
+                            # Add basic info
+                            self.article_data[item_key]["filename"] = filename
+                except Exception as e:
+                    logger.error(
+                        "Failed to get attachments for item %s: %s", item_key, e
+                    )
+
+        # Now download all PDFs in parallel - first orphaned PDFs
+        logger.info("Downloading %d orphaned PDFs in parallel", len(orphaned_pdfs))
+        orphan_results = self._download_pdfs_in_parallel(orphaned_pdfs)
+
+        # Update orphan data
+        for item_key, (file_path, filename, attachment_key) in orphan_results.items():
+            self.article_data[item_key]["filename"] = filename
+            self.article_data[item_key]["pdf_url"] = file_path
+            self.article_data[item_key]["attachment_key"] = attachment_key
+            logger.info("Downloaded orphaned Zotero PDF to: %s", file_path)
+
+        # Download regular item attachments
+        logger.info(
+            "Downloading %d regular item PDFs in parallel", len(item_attachments)
+        )
+        item_results = self._download_pdfs_in_parallel(item_attachments)
+
+        # Update item data
+        for item_key, (file_path, filename, attachment_key) in item_results.items():
+            self.article_data[item_key]["filename"] = filename
+            self.article_data[item_key]["pdf_url"] = file_path
+            self.article_data[item_key]["attachment_key"] = attachment_key
+            logger.info("Downloaded Zotero PDF to: %s", file_path)
+
+        if not self.article_data:
             logger.error(
                 "No matching papers returned from Zotero for query: '%s'", self.query
             )
@@ -148,11 +324,13 @@ class ZoteroSearchData:
                 "No matching papers returned from Zotero. Please retry the same query."
            )
 
-        logger.info(
+        logger.info(
+            "Filtered %d items (including orphaned attachments)", len(self.article_data)
+        )
 
     def _create_content(self) -> None:
         """Create the content message for the response."""
-        top_papers = list(self.
+        top_papers = list(self.article_data.values())[:2]
         top_papers_info = "\n".join(
             [
                 f"{i+1}. {paper['Title']} ({paper['Type']})"
@@ -162,6 +340,6 @@ class ZoteroSearchData:
 
         self.content = "Retrieval was successful. Papers are attached as an artifact."
         self.content += " And here is a summary of the retrieval results:\n"
-        self.content += f"Number of papers found: {len(self.
+        self.content += f"Number of papers found: {len(self.article_data)}\n"
         self.content += f"Query: {self.query}\n"
         self.content += "Here are a few of these papers:\n" + top_papers_info
```
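The performance-relevant changes above are a shared `requests.Session` (connection pooling) plus a `ThreadPoolExecutor` that fans downloads out and collects results as they complete. A stripped-down sketch of the same pattern with placeholder URLs and none of the Zotero specifics:

```python
# Generic parallel-download sketch mirroring _download_pdfs_in_parallel.
import concurrent.futures

import requests

session = requests.Session()  # one session -> pooled TCP connections

def fetch(url: str) -> bytes:
    """Download one URL; exceptions propagate to the caller via the future."""
    response = session.get(url, stream=True, timeout=10)
    response.raise_for_status()
    return response.content

urls = ["https://example.org/a.pdf", "https://example.org/b.pdf"]  # placeholders
results = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=min(10, len(urls))) as pool:
    future_to_url = {pool.submit(fetch, url): url for url in urls}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            results[url] = future.result()
        except Exception as exc:  # broad catch-and-log, as in the helper above
            print(f"download failed for {url}: {exc}")
```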
```diff
--- a/aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py
+++ b/aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py
@@ -62,13 +62,13 @@ def zotero_read(
 
     return Command(
         update={
-            "
-            "last_displayed_papers": "
+            "article_data": results["article_data"],
+            "last_displayed_papers": "article_data",
             "messages": [
                 ToolMessage(
                     content=results["content"],
                     tool_call_id=tool_call_id,
-                    artifact=results["
+                    artifact=results["article_data"],
                 )
             ],
         }
```
```diff
--- a/aiagents4pharma-1.31.0.dist-info/METADATA
+++ b/aiagents4pharma-1.33.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: aiagents4pharma
-Version: 1.31.0
+Version: 1.33.0
 Summary: AI Agents for drug discovery, drug development, and other pharmaceutical R&D.
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: MIT License
@@ -275,6 +275,8 @@ To use **Talk2AIAgents4Pharma**, **Talk2BioModels**, **Talk2KnowledgeGraphs**, o
 
 Only for **Talk2Scholars**, you also need a **Zotero API key**, which you can generate [here](https://www.zotero.org/user/login#applications). _(For all other agents, the Zotero key is not required.)_
 
+To use **Talk2Scholars**, you must have **FAISS** installed through **Conda**. Follow installation instructions for your OS [here](https://github.com/VirtualPatientEngine/AIAgents4Pharma/tree/main/aiagents4pharma/talk2scholars/install.md).
+
 To use **Talk2AIAgents4Pharma** or **Talk2KnowledgeGraphs**, you must have **Ollama** installed. Follow installation instructions for your OS [here](https://ollama.com/download).
 
 After installing, pull the `nomic-embed-text` model and start the server by running:
```
|