aiagents4pharma 1.37.0__py3-none-any.whl → 1.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +12 -4
- aiagents4pharma/talk2scholars/configs/config.yaml +2 -0
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +3 -0
- aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +1 -0
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +33 -7
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +151 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +151 -0
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +59 -3
- aiagents4pharma/talk2scholars/tests/test_read_helper_utils.py +110 -0
- aiagents4pharma/talk2scholars/tests/test_s2_display.py +20 -1
- aiagents4pharma/talk2scholars/tests/test_s2_query.py +17 -0
- aiagents4pharma/talk2scholars/tests/test_state.py +25 -1
- aiagents4pharma/talk2scholars/tests/test_zotero_pdf_downloader_utils.py +46 -0
- aiagents4pharma/talk2scholars/tests/test_zotero_read.py +35 -40
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +4 -1
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +112 -0
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +112 -0
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +82 -41
- aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +6 -2
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +2 -1
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +7 -3
- aiagents4pharma/talk2scholars/tools/s2/search.py +2 -1
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +2 -1
- aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +2 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +2 -0
- aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +2 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +79 -136
- aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +147 -0
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +42 -9
- {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/METADATA +1 -1
- {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/RECORD +35 -26
- {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/WHEEL +1 -1
- {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.37.0.dist-info → aiagents4pharma-1.39.0.dist-info}/top_level.txt +0 -0
@@ -14,6 +14,8 @@ from langgraph.prebuilt.tool_node import ToolNode
|
|
14
14
|
from langgraph.checkpoint.memory import MemorySaver
|
15
15
|
from ..state.state_talk2scholars import Talk2Scholars
|
16
16
|
from ..tools.paper_download.download_arxiv_input import download_arxiv_paper
|
17
|
+
from ..tools.paper_download.download_medrxiv_input import download_medrxiv_paper
|
18
|
+
from ..tools.paper_download.download_biorxiv_input import download_biorxiv_paper
|
17
19
|
|
18
20
|
# Initialize logger
|
19
21
|
logging.basicConfig(level=logging.INFO)
|
@@ -24,14 +26,20 @@ def get_app(uniq_id, llm_model: BaseChatModel):
|
|
24
26
|
"""
|
25
27
|
Initializes and returns the LangGraph application for the Talk2Scholars paper download agent.
|
26
28
|
|
29
|
+
This agent supports downloading scientific papers from multiple preprint servers, including
|
30
|
+
arXiv, BioRxiv, and MedRxiv. It can intelligently handle user queries by extracting or resolving
|
31
|
+
necessary identifiers (e.g., arXiv ID or DOI) from the paper title and routing the request to
|
32
|
+
the appropriate download tool.
|
33
|
+
|
27
34
|
Args:
|
28
35
|
uniq_id (str): A unique identifier for tracking the current session.
|
29
36
|
llm_model (BaseChatModel): The language model to be used by the agent.
|
30
|
-
|
37
|
+
No default is declared in the signature, so the caller must pass a model, e.g. ChatOpenAI(model="gpt-4o-mini", temperature=0.5).
|
31
38
|
|
32
39
|
Returns:
|
33
40
|
StateGraph: A compiled LangGraph application that enables the paper download agent to
|
34
|
-
|
41
|
+
process user queries and retrieve research papers from arXiv (using arXiv ID),
|
42
|
+
BioRxiv and MedRxiv (using DOI resolved from the paper title or provided directly).
|
35
43
|
"""
|
36
44
|
|
37
45
|
# Load Hydra configuration
|
@@ -44,7 +52,7 @@ def get_app(uniq_id, llm_model: BaseChatModel):
|
|
44
52
|
cfg = cfg.agents.talk2scholars.paper_download_agent
|
45
53
|
|
46
54
|
# Define tools properly
|
47
|
-
tools = ToolNode([download_arxiv_paper])
|
55
|
+
tools = ToolNode([download_arxiv_paper, download_medrxiv_paper, download_biorxiv_paper])
|
48
56
|
|
49
57
|
# Define the model
|
50
58
|
logger.info("Using OpenAI model %s", llm_model)
|
@@ -58,7 +66,7 @@ def get_app(uniq_id, llm_model: BaseChatModel):
|
|
58
66
|
|
59
67
|
def paper_download_agent_node(state: Talk2Scholars) -> Dict[str, Any]:
|
60
68
|
"""
|
61
|
-
Processes the current state to fetch the arXiv
|
69
|
+
Processes the current state to fetch the research paper from arXiv, BioRxiv, or MedRxiv.
|
62
70
|
"""
|
63
71
|
logger.info("Creating paper download agent node with thread_id: %s", uniq_id)
|
64
72
|
result = model.invoke(state, {"configurable": {"thread_id": uniq_id}})
|
@@ -8,6 +8,8 @@ defaults:
|
|
8
8
|
- agents/talk2scholars/pdf_agent: default
|
9
9
|
- tools/search: default
|
10
10
|
- tools/download_arxiv_paper: default
|
11
|
+
- tools/download_biorxiv_paper: default
|
12
|
+
- tools/download_medrxiv_paper: default
|
11
13
|
- tools/single_paper_recommendation: default
|
12
14
|
- tools/multi_paper_recommendation: default
|
13
15
|
- tools/retrieve_semantic_scholar_paper_id: default
|
@@ -2,6 +2,7 @@
|
|
2
2
|
library_type: "user" # Type of library ('user' or 'group')
|
3
3
|
default_limit: 2
|
4
4
|
request_timeout: 10
|
5
|
+
chunk_size: 16384 # Size (in bytes) for streaming PDF download chunks
|
5
6
|
user_id: ${oc.env:ZOTERO_USER_ID} # Load from environment variable
|
6
7
|
api_key: ${oc.env:ZOTERO_API_KEY} # Load from environment variable
|
7
8
|
|
@@ -7,6 +7,7 @@ across agent interactions.
|
|
7
7
|
"""
|
8
8
|
|
9
9
|
import logging
|
10
|
+
from collections.abc import Mapping
|
10
11
|
from typing import Annotated, Any, Dict
|
11
12
|
|
12
13
|
from langchain_core.embeddings import Embeddings
|
@@ -18,7 +19,24 @@ logging.basicConfig(level=logging.INFO)
|
|
18
19
|
logger = logging.getLogger(__name__)
|
19
20
|
|
20
21
|
|
21
|
-
def
|
22
|
+
def merge_dict(existing: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any]:
|
23
|
+
"""
|
24
|
+
Merges the existing dictionary with a new dictionary.
|
25
|
+
|
26
|
+
This function builds a new dictionary from the existing state and adds the new values
|
27
|
+
to it; keys present in both take the new value, and all other existing entries are kept.
|
28
|
+
Args:
|
29
|
+
existing (Dict[str, Any]): The current dictionary state.
|
30
|
+
new (Dict[str, Any]): The new dictionary state to merge.
|
31
|
+
Returns:
|
32
|
+
Dict[str, Any]: The merged dictionary state.
|
33
|
+
"""
|
34
|
+
merged = dict(existing) if existing else {}
|
35
|
+
merged.update(new or {})
|
36
|
+
return merged
|
37
|
+
|
38
|
+
|
39
|
+
def replace_dict(existing: Dict[str, Any], new: Any) -> Any:
|
22
40
|
"""
|
23
41
|
Replaces the existing dictionary with a new dictionary.
|
24
42
|
|
@@ -39,9 +57,13 @@ def replace_dict(existing: Dict[str, Any], new: Dict[str, Any]) -> Dict[str, Any
|
|
39
57
|
>>> print(updated_state)
|
40
58
|
{"papers": {"id2": "Paper 2"}}
|
41
59
|
"""
|
42
|
-
#
|
43
|
-
|
44
|
-
|
60
|
+
# If new is not a mapping, just replace existing value outright
|
61
|
+
if not isinstance(new, Mapping):
|
62
|
+
return new
|
63
|
+
# In-place replace: clear existing mapping and update with new entries
|
64
|
+
existing.clear()
|
65
|
+
existing.update(new)
|
66
|
+
return existing
|
45
67
|
|
46
68
|
|
47
69
|
class Talk2Scholars(AgentState):
|
@@ -63,10 +85,14 @@ class Talk2Scholars(AgentState):
|
|
63
85
|
"""
|
64
86
|
|
65
87
|
# Agent state fields
|
88
|
+
# Key controlling UI display: always replace to reference latest output
|
89
|
+
# Stores the most recently displayed papers metadata
|
66
90
|
last_displayed_papers: Annotated[Dict[str, Any], replace_dict]
|
67
|
-
|
68
|
-
|
69
|
-
|
91
|
+
# Accumulative keys: merge new entries into existing state
|
92
|
+
papers: Annotated[Dict[str, Any], merge_dict]
|
93
|
+
multi_papers: Annotated[Dict[str, Any], merge_dict]
|
94
|
+
article_data: Annotated[Dict[str, Any], merge_dict]
|
95
|
+
# Approval status: always replace to reflect latest operation
|
70
96
|
zotero_write_approval_status: Annotated[Dict[str, Any], replace_dict]
|
71
97
|
llm_model: BaseChatModel
|
72
98
|
text_embedding_model: Embeddings
|
@@ -0,0 +1,151 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for bioRxiv paper downloading functionality, including:
|
3
|
+
- download_biorxiv_paper tool function.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import unittest
|
7
|
+
from unittest.mock import MagicMock, patch
|
8
|
+
from langchain_core.messages import ToolMessage
|
9
|
+
|
10
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input import (
|
11
|
+
download_biorxiv_paper,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
class TestDownloadBiorxivPaper(unittest.TestCase):
|
16
|
+
"""Tests for the download_biorxiv_paper tool."""
|
17
|
+
|
18
|
+
@patch(
|
19
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.initialize"
|
20
|
+
)
|
21
|
+
@patch(
|
22
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.compose"
|
23
|
+
)
|
24
|
+
@patch(
|
25
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.requests.get"
|
26
|
+
)
|
27
|
+
def test_download_biorxiv_paper_success(self, mock_get, mock_compose, mock_initialize):
|
28
|
+
"""Test successful metadata and PDF URL retrieval."""
|
29
|
+
dummy_cfg = MagicMock()
|
30
|
+
dummy_cfg.tools.download_biorxiv_paper.api_url = "http://dummy.biorxiv.org/api"
|
31
|
+
dummy_cfg.tools.download_biorxiv_paper.request_timeout = 10
|
32
|
+
mock_compose.return_value = dummy_cfg
|
33
|
+
mock_initialize.return_value.__enter__.return_value = None
|
34
|
+
|
35
|
+
doi = "10.1101/2025.05.13.653102"
|
36
|
+
|
37
|
+
dummy_response = MagicMock()
|
38
|
+
dummy_response.status_code = 200
|
39
|
+
dummy_response.raise_for_status = MagicMock()
|
40
|
+
dummy_response.json.return_value = {
|
41
|
+
"collection": [
|
42
|
+
{
|
43
|
+
"title": "Sample BioRxiv Paper",
|
44
|
+
"authors": "Author One; Author Two",
|
45
|
+
"abstract": "This is a bioRxiv abstract.",
|
46
|
+
"date": "2025-04-25",
|
47
|
+
"doi": doi,
|
48
|
+
"link": f"https://www.biorxiv.org/content/{doi}.full.pdf"
|
49
|
+
}
|
50
|
+
]
|
51
|
+
}
|
52
|
+
mock_get.return_value = dummy_response
|
53
|
+
|
54
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
55
|
+
result = download_biorxiv_paper.run(tool_input)
|
56
|
+
update = result.update
|
57
|
+
|
58
|
+
self.assertIn("article_data", update)
|
59
|
+
self.assertIn(doi, update["article_data"])
|
60
|
+
metadata = update["article_data"][doi]
|
61
|
+
self.assertEqual(metadata["Title"], "Sample BioRxiv Paper")
|
62
|
+
self.assertEqual(metadata["Authors"], "Author One; Author Two")
|
63
|
+
self.assertEqual(metadata["Abstract"], "This is a bioRxiv abstract.")
|
64
|
+
self.assertEqual(metadata["Publication Date"], "2025-04-25")
|
65
|
+
self.assertEqual(metadata["URL"], f"https://www.biorxiv.org/content/{doi}.full.pdf")
|
66
|
+
self.assertEqual(metadata["pdf_url"], f"https://www.biorxiv.org/content/{doi}.full.pdf")
|
67
|
+
self.assertEqual(metadata["filename"], f"{doi.rsplit('/', maxsplit=1)[-1]}.pdf")
|
68
|
+
self.assertEqual(metadata["source"], "biorxiv")
|
69
|
+
self.assertEqual(metadata["biorxiv_id"], doi)
|
70
|
+
|
71
|
+
self.assertTrue(len(update["messages"]) >= 1)
|
72
|
+
self.assertIsInstance(update["messages"][0], ToolMessage)
|
73
|
+
self.assertIn("Successfully retrieved metadata and PDF URL", update["messages"][0].content)
|
74
|
+
|
75
|
+
@patch(
|
76
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.initialize"
|
77
|
+
)
|
78
|
+
@patch(
|
79
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.compose"
|
80
|
+
)
|
81
|
+
@patch(
|
82
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.requests.get"
|
83
|
+
)
|
84
|
+
def test_no_entry_found(self, mock_get, mock_compose, mock_initialize):
|
85
|
+
"""Test that a ValueError is raised when the response contains no 'collection' entries."""
|
86
|
+
dummy_cfg = MagicMock()
|
87
|
+
dummy_cfg.tools.download_biorxiv_paper.api_url = "http://dummy.biorxiv.org/api"
|
88
|
+
dummy_cfg.tools.download_biorxiv_paper.request_timeout = 10
|
89
|
+
mock_compose.return_value = dummy_cfg
|
90
|
+
mock_initialize.return_value.__enter__.return_value = None
|
91
|
+
|
92
|
+
dummy_response = MagicMock()
|
93
|
+
dummy_response.status_code = 200
|
94
|
+
dummy_response.raise_for_status = MagicMock()
|
95
|
+
dummy_response.json.return_value = {} # No entry
|
96
|
+
mock_get.return_value = dummy_response
|
97
|
+
|
98
|
+
doi = "10.1101/2025.05.13.653102"
|
99
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
100
|
+
|
101
|
+
with self.assertRaises(ValueError) as context:
|
102
|
+
download_biorxiv_paper.run(tool_input)
|
103
|
+
|
104
|
+
self.assertEqual(str(context.exception), f"No metadata found for DOI: {doi}")
|
105
|
+
|
106
|
+
@patch(
|
107
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.initialize"
|
108
|
+
)
|
109
|
+
@patch(
|
110
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.hydra.compose"
|
111
|
+
)
|
112
|
+
@patch(
|
113
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_biorxiv_input.requests.get"
|
114
|
+
)
|
115
|
+
def test_no_pdf_url_found(self, mock_get, mock_compose, mock_initialize):
|
116
|
+
"""Test fallback to DOI-based PDF URL construction when 'link' is missing."""
|
117
|
+
dummy_cfg = MagicMock()
|
118
|
+
dummy_cfg.tools.download_biorxiv_paper.api_url = "http://dummy.biorxiv.org/api"
|
119
|
+
dummy_cfg.tools.download_biorxiv_paper.request_timeout = 10
|
120
|
+
mock_compose.return_value = dummy_cfg
|
121
|
+
mock_initialize.return_value.__enter__.return_value = None
|
122
|
+
|
123
|
+
doi = "10.1101/2025.05.13.653102"
|
124
|
+
|
125
|
+
dummy_response = MagicMock()
|
126
|
+
dummy_response.status_code = 200
|
127
|
+
dummy_response.raise_for_status = MagicMock()
|
128
|
+
dummy_response.json.return_value = {
|
129
|
+
"collection": [
|
130
|
+
{
|
131
|
+
"title": "Sample Biorxiv Paper",
|
132
|
+
"authors": "Author One; Author Two",
|
133
|
+
"abstract": "This is a BioRxiv abstract.",
|
134
|
+
"date": "2025-04-25",
|
135
|
+
"doi": doi
|
136
|
+
# 'link' is intentionally omitted
|
137
|
+
}
|
138
|
+
]
|
139
|
+
}
|
140
|
+
mock_get.return_value = dummy_response
|
141
|
+
|
142
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
143
|
+
result = download_biorxiv_paper.run(tool_input)
|
144
|
+
update = result.update
|
145
|
+
metadata = update["article_data"][doi]
|
146
|
+
|
147
|
+
# Assert that the PDF URL was constructed from DOI
|
148
|
+
expected_suffix = doi.rsplit('/', maxsplit=1)[-1]
|
149
|
+
expected_url = f"https://www.biorxiv.org/content/10.1101/{expected_suffix}.full.pdf"
|
150
|
+
|
151
|
+
self.assertEqual(metadata["pdf_url"], expected_url)
|
@@ -0,0 +1,151 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for medRxiv paper downloading functionality, including:
|
3
|
+
- download_medrxiv_paper tool function.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import unittest
|
7
|
+
from unittest.mock import MagicMock, patch
|
8
|
+
from langchain_core.messages import ToolMessage
|
9
|
+
|
10
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input import (
|
11
|
+
download_medrxiv_paper,
|
12
|
+
)
|
13
|
+
|
14
|
+
|
15
|
+
class TestDownloadMedrxivPaper(unittest.TestCase):
|
16
|
+
"""Tests for the download_medrxiv_paper tool."""
|
17
|
+
|
18
|
+
@patch(
|
19
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.initialize"
|
20
|
+
)
|
21
|
+
@patch(
|
22
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.compose"
|
23
|
+
)
|
24
|
+
@patch(
|
25
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.requests.get"
|
26
|
+
)
|
27
|
+
def test_download_medrxiv_paper_success(self, mock_get, mock_compose, mock_initialize):
|
28
|
+
"""Test successful metadata and PDF URL retrieval."""
|
29
|
+
dummy_cfg = MagicMock()
|
30
|
+
dummy_cfg.tools.download_medrxiv_paper.api_url = "http://dummy.medrxiv.org/api"
|
31
|
+
dummy_cfg.tools.download_medrxiv_paper.request_timeout = 10
|
32
|
+
mock_compose.return_value = dummy_cfg
|
33
|
+
mock_initialize.return_value.__enter__.return_value = None
|
34
|
+
|
35
|
+
doi = "10.1101/2025.04.25.25326432"
|
36
|
+
|
37
|
+
dummy_response = MagicMock()
|
38
|
+
dummy_response.status_code = 200
|
39
|
+
dummy_response.raise_for_status = MagicMock()
|
40
|
+
dummy_response.json.return_value = {
|
41
|
+
"collection": [
|
42
|
+
{
|
43
|
+
"title": "Sample Medrxiv Paper",
|
44
|
+
"authors": "Author One; Author Two",
|
45
|
+
"abstract": "This is a medRxiv abstract.",
|
46
|
+
"date": "2025-04-25",
|
47
|
+
"doi": doi,
|
48
|
+
"link": f"https://www.medrxiv.org/content/{doi}.full.pdf"
|
49
|
+
}
|
50
|
+
]
|
51
|
+
}
|
52
|
+
mock_get.return_value = dummy_response
|
53
|
+
|
54
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
55
|
+
result = download_medrxiv_paper.run(tool_input)
|
56
|
+
update = result.update
|
57
|
+
|
58
|
+
self.assertIn("article_data", update)
|
59
|
+
self.assertIn(doi, update["article_data"])
|
60
|
+
metadata = update["article_data"][doi]
|
61
|
+
self.assertEqual(metadata["Title"], "Sample Medrxiv Paper")
|
62
|
+
self.assertEqual(metadata["Authors"], "Author One; Author Two")
|
63
|
+
self.assertEqual(metadata["Abstract"], "This is a medRxiv abstract.")
|
64
|
+
self.assertEqual(metadata["Publication Date"], "2025-04-25")
|
65
|
+
self.assertEqual(metadata["URL"], f"https://www.medrxiv.org/content/{doi}.full.pdf")
|
66
|
+
self.assertEqual(metadata["pdf_url"], f"https://www.medrxiv.org/content/{doi}.full.pdf")
|
67
|
+
self.assertEqual(metadata["filename"], f"{doi.rsplit('/', maxsplit=1)[-1]}.pdf")
|
68
|
+
self.assertEqual(metadata["source"], "medrxiv")
|
69
|
+
self.assertEqual(metadata["medrxiv_id"], doi)
|
70
|
+
|
71
|
+
self.assertTrue(len(update["messages"]) >= 1)
|
72
|
+
self.assertIsInstance(update["messages"][0], ToolMessage)
|
73
|
+
self.assertIn("Successfully retrieved metadata and PDF URL", update["messages"][0].content)
|
74
|
+
|
75
|
+
@patch(
|
76
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.initialize"
|
77
|
+
)
|
78
|
+
@patch(
|
79
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.compose"
|
80
|
+
)
|
81
|
+
@patch(
|
82
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.requests.get"
|
83
|
+
)
|
84
|
+
def test_no_entry_found(self, mock_get, mock_compose, mock_initialize):
|
85
|
+
"""Test that a ValueError is raised when the response contains no 'collection' entries."""
|
86
|
+
dummy_cfg = MagicMock()
|
87
|
+
dummy_cfg.tools.download_medrxiv_paper.api_url = "http://dummy.medrxiv.org/api"
|
88
|
+
dummy_cfg.tools.download_medrxiv_paper.request_timeout = 10
|
89
|
+
mock_compose.return_value = dummy_cfg
|
90
|
+
mock_initialize.return_value.__enter__.return_value = None
|
91
|
+
|
92
|
+
dummy_response = MagicMock()
|
93
|
+
dummy_response.status_code = 200
|
94
|
+
dummy_response.raise_for_status = MagicMock()
|
95
|
+
dummy_response.json.return_value = {} # No entry
|
96
|
+
mock_get.return_value = dummy_response
|
97
|
+
|
98
|
+
doi = "10.1101/2025.04.25.25326432"
|
99
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
100
|
+
|
101
|
+
with self.assertRaises(ValueError) as context:
|
102
|
+
download_medrxiv_paper.run(tool_input)
|
103
|
+
|
104
|
+
self.assertEqual(str(context.exception), f"No entry found for medRxiv ID {doi}")
|
105
|
+
|
106
|
+
@patch(
|
107
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.initialize"
|
108
|
+
)
|
109
|
+
@patch(
|
110
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.hydra.compose"
|
111
|
+
)
|
112
|
+
@patch(
|
113
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input.requests.get"
|
114
|
+
)
|
115
|
+
def test_no_pdf_url_found(self, mock_get, mock_compose, mock_initialize):
|
116
|
+
"""Test fallback to DOI-based PDF URL construction when 'link' is missing."""
|
117
|
+
dummy_cfg = MagicMock()
|
118
|
+
dummy_cfg.tools.download_medrxiv_paper.api_url = "http://dummy.medrxiv.org/api"
|
119
|
+
dummy_cfg.tools.download_medrxiv_paper.request_timeout = 10
|
120
|
+
mock_compose.return_value = dummy_cfg
|
121
|
+
mock_initialize.return_value.__enter__.return_value = None
|
122
|
+
|
123
|
+
doi = "10.1101/2025.04.25.25326432"
|
124
|
+
|
125
|
+
dummy_response = MagicMock()
|
126
|
+
dummy_response.status_code = 200
|
127
|
+
dummy_response.raise_for_status = MagicMock()
|
128
|
+
dummy_response.json.return_value = {
|
129
|
+
"collection": [
|
130
|
+
{
|
131
|
+
"title": "Sample Medrxiv Paper",
|
132
|
+
"authors": "Author One; Author Two",
|
133
|
+
"abstract": "This is a medRxiv abstract.",
|
134
|
+
"date": "2025-04-25",
|
135
|
+
"doi": doi
|
136
|
+
# 'link' is intentionally omitted
|
137
|
+
}
|
138
|
+
]
|
139
|
+
}
|
140
|
+
mock_get.return_value = dummy_response
|
141
|
+
|
142
|
+
tool_input = {"doi": doi, "tool_call_id": "test_tool_id"}
|
143
|
+
result = download_medrxiv_paper.run(tool_input)
|
144
|
+
update = result.update
|
145
|
+
metadata = update["article_data"][doi]
|
146
|
+
|
147
|
+
# Assert that the PDF URL was constructed from DOI
|
148
|
+
expected_suffix = doi.rsplit('/', maxsplit=1)[-1]
|
149
|
+
expected_url = f"https://www.medrxiv.org/content/10.1101/{expected_suffix}.full.pdf"
|
150
|
+
|
151
|
+
self.assertEqual(metadata["pdf_url"], expected_url)
|
@@ -3,11 +3,14 @@ Unit tests for question_and_answer tool functionality.
|
|
3
3
|
"""
|
4
4
|
|
5
5
|
import unittest
|
6
|
+
from types import SimpleNamespace
|
6
7
|
from unittest.mock import MagicMock, patch
|
7
8
|
|
8
9
|
from langchain_core.documents import Document
|
9
10
|
from langchain_core.embeddings import Embeddings
|
11
|
+
from langchain_core.messages import ToolMessage
|
10
12
|
|
13
|
+
import aiagents4pharma.talk2scholars.tools.pdf.question_and_answer as qa_module
|
11
14
|
from aiagents4pharma.talk2scholars.tools.pdf.question_and_answer import (
|
12
15
|
Vectorstore,
|
13
16
|
generate_answer,
|
@@ -145,8 +148,9 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
|
|
145
148
|
|
146
149
|
vector_store = Vectorstore(embedding_model=mock_embedding_model)
|
147
150
|
vector_store.vector_store = True
|
151
|
+
# Add a document chunk with required metadata including chunk_id
|
148
152
|
vector_store.documents["test_doc"] = Document(
|
149
|
-
page_content="Test content", metadata={"paper_id": "test_paper"}
|
153
|
+
page_content="Test content", metadata={"paper_id": "test_paper", "chunk_id": 0}
|
150
154
|
)
|
151
155
|
|
152
156
|
results = vector_store.retrieve_relevant_chunks(query="test query")
|
@@ -793,8 +797,9 @@ class TestMissingState(unittest.TestCase):
|
|
793
797
|
|
794
798
|
vector_store = Vectorstore(embedding_model=mock_embedding_model)
|
795
799
|
vector_store.vector_store = True
|
796
|
-
|
797
|
-
|
800
|
+
# Add document chunks with necessary metadata including chunk_ids
|
801
|
+
doc1 = Document(page_content="Doc 1", metadata={"paper_id": "paper1", "chunk_id": 0})
|
802
|
+
doc2 = Document(page_content="Doc 2", metadata={"paper_id": "paper2", "chunk_id": 1})
|
798
803
|
vector_store.documents = {"doc1": doc1, "doc2": doc2}
|
799
804
|
|
800
805
|
results = vector_store.retrieve_relevant_chunks(
|
@@ -820,3 +825,54 @@ class TestMissingState(unittest.TestCase):
|
|
820
825
|
query="test", paper_ids=["nonexistent_id"]
|
821
826
|
)
|
822
827
|
assert results == []
|
828
|
+
|
829
|
+
@patch(
|
830
|
+
"aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.load_hydra_config"
|
831
|
+
)
|
832
|
+
@patch(
|
833
|
+
"aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.generate_answer"
|
834
|
+
)
|
835
|
+
def test_prebuilt_vector_store_branch(self, mock_generate, mock_load_config):
|
836
|
+
"""Test question_and_answer tool with a shared pre-built vector store branch."""
|
837
|
+
# Mock configuration for tool-level thresholds
|
838
|
+
config = SimpleNamespace(top_k_papers=1, top_k_chunks=1)
|
839
|
+
mock_load_config.return_value = config
|
840
|
+
# Mock generate_answer to return a simple response
|
841
|
+
mock_generate.return_value = {"output_text": "Answer", "papers_used": ["p1"]}
|
842
|
+
|
843
|
+
# Prepare a dummy pre-built vector store
|
844
|
+
dummy_vs = SimpleNamespace(
|
845
|
+
loaded_papers=set(),
|
846
|
+
vector_store=True,
|
847
|
+
retrieve_relevant_chunks=lambda *_args, **_kwargs: [
|
848
|
+
Document(page_content="chunk", metadata={"paper_id": "p1"})
|
849
|
+
],
|
850
|
+
)
|
851
|
+
# Override the module-level prebuilt_vector_store
|
852
|
+
qa_module.prebuilt_vector_store = dummy_vs
|
853
|
+
|
854
|
+
# Prepare state with required models and article_data
|
855
|
+
state = {
|
856
|
+
"text_embedding_model": MagicMock(),
|
857
|
+
"llm_model": MagicMock(),
|
858
|
+
"article_data": {"p1": {"source": "upload"}},
|
859
|
+
}
|
860
|
+
|
861
|
+
# Invoke the tool-level function via .run with appropriate input schema
|
862
|
+
input_data = {
|
863
|
+
"question": "What?",
|
864
|
+
"paper_ids": None,
|
865
|
+
"use_all_papers": False,
|
866
|
+
"tool_call_id": "testid",
|
867
|
+
"state": state,
|
868
|
+
}
|
869
|
+
result = qa_module.question_and_answer.run(input_data)
|
870
|
+
|
871
|
+
# Ensure the prebuilt branch was used and a Command is returned
|
872
|
+
self.assertTrue(hasattr(result, "update"))
|
873
|
+
messages = result.update.get("messages", [])
|
874
|
+
self.assertEqual(len(messages), 1)
|
875
|
+
self.assertIsInstance(messages[0], ToolMessage)
|
876
|
+
|
877
|
+
# Clean up global override
|
878
|
+
qa_module.prebuilt_vector_store = None
|
@@ -0,0 +1,110 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for Zotero read helper download branches.
|
3
|
+
"""
|
4
|
+
|
5
|
+
import unittest
|
6
|
+
from types import SimpleNamespace
|
7
|
+
from unittest.mock import MagicMock, patch
|
8
|
+
|
9
|
+
from aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper import (
|
10
|
+
ZoteroSearchData,
|
11
|
+
)
|
12
|
+
|
13
|
+
# Dummy Hydra configuration for tests
|
14
|
+
dummy_zotero_read_config = SimpleNamespace(
|
15
|
+
user_id="dummy_user",
|
16
|
+
library_type="user",
|
17
|
+
api_key="dummy_api_key",
|
18
|
+
zotero=SimpleNamespace(
|
19
|
+
max_limit=5,
|
20
|
+
filter_item_types=["journalArticle", "conferencePaper"],
|
21
|
+
filter_excluded_types=["attachment", "note"],
|
22
|
+
),
|
23
|
+
)
|
24
|
+
dummy_cfg = SimpleNamespace(tools=SimpleNamespace(zotero_read=dummy_zotero_read_config))
|
25
|
+
|
26
|
+
|
27
|
+
class TestReadHelperDownloadsFalse(unittest.TestCase):
|
28
|
+
"""Tests for read_helper download_pdfs=False branches."""
|
29
|
+
|
30
|
+
@patch(
|
31
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
32
|
+
)
|
33
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
34
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
35
|
+
@patch(
|
36
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
37
|
+
)
|
38
|
+
def test_download_pdfs_false_branches(
|
39
|
+
self,
|
40
|
+
mock_hydra_init,
|
41
|
+
mock_hydra_compose,
|
42
|
+
mock_zotero_class,
|
43
|
+
mock_get_item_collections,
|
44
|
+
):
|
45
|
+
"""Ensure attachment_key and filename are set when download_pdfs=False."""
|
46
|
+
# Setup Hydra mocks
|
47
|
+
mock_hydra_compose.return_value = dummy_cfg
|
48
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
49
|
+
|
50
|
+
# Fake Zotero items: one paper with child PDF, one orphaned PDF
|
51
|
+
fake_zot = MagicMock()
|
52
|
+
fake_items = [
|
53
|
+
{
|
54
|
+
"data": {
|
55
|
+
"key": "paper1",
|
56
|
+
"title": "P1",
|
57
|
+
"abstractNote": "A1",
|
58
|
+
"date": "2021",
|
59
|
+
"url": "u1",
|
60
|
+
"itemType": "journalArticle",
|
61
|
+
}
|
62
|
+
},
|
63
|
+
{
|
64
|
+
"data": {
|
65
|
+
"key": "attach2",
|
66
|
+
"itemType": "attachment",
|
67
|
+
"contentType": "application/pdf",
|
68
|
+
"filename": "file2.pdf",
|
69
|
+
}
|
70
|
+
},
|
71
|
+
]
|
72
|
+
fake_zot.items.return_value = fake_items
|
73
|
+
# children for paper1
|
74
|
+
fake_child = {
|
75
|
+
"data": {
|
76
|
+
"key": "attach1",
|
77
|
+
"filename": "file1.pdf",
|
78
|
+
"contentType": "application/pdf",
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
def children_side_effect(key):
|
83
|
+
return [fake_child] if key == "paper1" else []
|
84
|
+
|
85
|
+
fake_zot.children.side_effect = children_side_effect
|
86
|
+
mock_zotero_class.return_value = fake_zot
|
87
|
+
mock_get_item_collections.return_value = {"paper1": ["/C1"], "attach2": ["/C2"]}
|
88
|
+
|
89
|
+
# Instantiate with download_pdfs=False
|
90
|
+
search = ZoteroSearchData(
|
91
|
+
query="test",
|
92
|
+
only_articles=False,
|
93
|
+
limit=2,
|
94
|
+
tool_call_id="id",
|
95
|
+
download_pdfs=False,
|
96
|
+
)
|
97
|
+
search.process_search()
|
98
|
+
data = search.get_search_results()["article_data"]
|
99
|
+
|
100
|
+
# Regular paper1 should have attachment_key and filename, no pdf_url
|
101
|
+
self.assertIn("paper1", data)
|
102
|
+
self.assertEqual(data["paper1"]["attachment_key"], "attach1")
|
103
|
+
self.assertEqual(data["paper1"]["filename"], "file1.pdf")
|
104
|
+
self.assertNotIn("pdf_url", data["paper1"])
|
105
|
+
|
106
|
+
# Orphan attach2 should have attachment_key and filename, no pdf_url
|
107
|
+
self.assertIn("attach2", data)
|
108
|
+
self.assertEqual(data["attach2"]["attachment_key"], "attach2")
|
109
|
+
self.assertEqual(data["attach2"]["filename"], "file2.pdf")
|
110
|
+
self.assertNotIn("pdf_url", data["attach2"])
|