aiagents4pharma 1.42.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +3 -1
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +31 -28
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -1,151 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Unit tests for medrXiv paper downloading functionality, including:
|
3
|
-
- download_medrxiv_paper tool function.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import unittest
|
7
|
-
from unittest.mock import MagicMock, patch
|
8
|
-
from langchain_core.messages import ToolMessage
|
9
|
-
|
10
|
-
from aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input import (
|
11
|
-
download_medrxiv_paper,
|
12
|
-
)
|
13
|
-
|
14
|
-
|
15
|
-
class TestDownloadMedrxivPaper(unittest.TestCase):
    """Tests for the download_medrxiv_paper tool."""

    # Dotted path of the module under test; every patch target hangs off it.
    MODULE = "aiagents4pharma.talk2scholars.tools.paper_download.download_medrxiv_input"
    DOI = "10.1101/2025.04.25.25326432"

    def setUp(self):
        """Patch Hydra and ``requests.get`` so no config file or network is touched."""
        dummy_cfg = MagicMock()
        dummy_cfg.tools.download_medrxiv_paper.api_url = "http://dummy.medrxiv.org/api"
        dummy_cfg.tools.download_medrxiv_paper.request_timeout = 10

        self.mock_initialize = self._start_patch("hydra.initialize")
        self.mock_initialize.return_value.__enter__.return_value = None
        self.mock_compose = self._start_patch("hydra.compose")
        self.mock_compose.return_value = dummy_cfg
        self.mock_get = self._start_patch("requests.get")

    def _start_patch(self, attribute):
        """Patch ``MODULE.<attribute>`` for the duration of the current test."""
        patcher = patch(f"{self.MODULE}.{attribute}")
        # addCleanup guarantees the patch is undone even if the test fails.
        self.addCleanup(patcher.stop)
        return patcher.start()

    def _set_api_payload(self, payload):
        """Make the mocked ``requests.get`` answer with a 200 response carrying ``payload``."""
        dummy_response = MagicMock()
        dummy_response.status_code = 200
        dummy_response.raise_for_status = MagicMock()
        dummy_response.json.return_value = payload
        self.mock_get.return_value = dummy_response

    def _paper_entry(self, **extra):
        """Return one API 'collection' record for ``DOI``; ``extra`` adds optional keys."""
        entry = {
            "title": "Sample Medrxiv Paper",
            "authors": "Author One; Author Two",
            "abstract": "This is a medRxiv abstract.",
            "date": "2025-04-25",
            "doi": self.DOI,
        }
        entry.update(extra)
        return entry

    def _run_tool(self):
        """Invoke the tool for ``DOI`` and return the resulting Command."""
        return download_medrxiv_paper.run(
            {"doi": self.DOI, "tool_call_id": "test_tool_id"}
        )

    def test_download_medrxiv_paper_success(self):
        """Test successful metadata and PDF URL retrieval."""
        doi = self.DOI
        pdf_link = f"https://www.medrxiv.org/content/{doi}.full.pdf"
        self._set_api_payload({"collection": [self._paper_entry(link=pdf_link)]})

        update = self._run_tool().update

        self.assertIn("article_data", update)
        self.assertIn(doi, update["article_data"])
        metadata = update["article_data"][doi]
        self.assertEqual(metadata["Title"], "Sample Medrxiv Paper")
        self.assertEqual(metadata["Authors"], "Author One; Author Two")
        self.assertEqual(metadata["Abstract"], "This is a medRxiv abstract.")
        self.assertEqual(metadata["Publication Date"], "2025-04-25")
        self.assertEqual(metadata["URL"], pdf_link)
        self.assertEqual(metadata["pdf_url"], pdf_link)
        self.assertEqual(metadata["filename"], f"{doi.rsplit('/', maxsplit=1)[-1]}.pdf")
        self.assertEqual(metadata["source"], "medrxiv")
        self.assertEqual(metadata["medrxiv_id"], doi)

        self.assertTrue(len(update["messages"]) >= 1)
        self.assertIsInstance(update["messages"][0], ToolMessage)
        self.assertIn(
            "Successfully retrieved metadata and PDF URL", update["messages"][0].content
        )

    def test_no_entry_found(self):
        """Test that a response without a 'collection' key raises ValueError."""
        self._set_api_payload({})  # No 'collection' at all

        with self.assertRaises(ValueError) as context:
            self._run_tool()

        self.assertEqual(
            str(context.exception), f"No entry found for medRxiv ID {self.DOI}"
        )

    def test_no_pdf_url_found(self):
        """Test fallback to DOI-based PDF URL construction when 'link' is missing."""
        # 'link' is intentionally omitted from the record.
        self._set_api_payload({"collection": [self._paper_entry()]})

        metadata = self._run_tool().update["article_data"][self.DOI]

        # Assert that the PDF URL was constructed from DOI
        expected_suffix = self.DOI.rsplit("/", maxsplit=1)[-1]
        expected_url = f"https://www.medrxiv.org/content/10.1101/{expected_suffix}.full.pdf"

        self.assertEqual(metadata["pdf_url"], expected_url)
|
@@ -1,249 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Unit tests for arXiv paper downloading functionality, including:
|
3
|
-
- download_arxiv_paper tool function.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import unittest
|
7
|
-
from unittest.mock import MagicMock, patch
|
8
|
-
|
9
|
-
import pytest
|
10
|
-
from langchain_core.messages import ToolMessage
|
11
|
-
|
12
|
-
from aiagents4pharma.talk2scholars.tools.paper_download.download_arxiv_input import (
|
13
|
-
_get_snippet,
|
14
|
-
download_arxiv_paper,
|
15
|
-
)
|
16
|
-
|
17
|
-
|
18
|
-
class TestDownloadArxivPaper(unittest.TestCase):
    """tests for the download_arxiv_paper tool."""

    # Dotted path of the module under test; every patch target hangs off it.
    MODULE = "aiagents4pharma.talk2scholars.tools.paper_download.download_arxiv_input"
    ARXIV_ID = "1234.56789"
    SUMMARY_HEADER = (
        "Download was successful. Papers metadata are attached as an artifact. "
        "Here is a summary of the results:\n"
    )

    def setUp(self):
        """Patch Hydra and ``requests.get`` so no config file or network is touched."""
        dummy_cfg = MagicMock()
        dummy_cfg.tools.download_arxiv_paper.api_url = "http://dummy.arxiv.org/api"
        dummy_cfg.tools.download_arxiv_paper.request_timeout = 10

        self.mock_initialize = self._start_patch("hydra.initialize")
        self.mock_initialize.return_value.__enter__.return_value = None
        self.mock_compose = self._start_patch("hydra.compose")
        self.mock_compose.return_value = dummy_cfg
        self.mock_get = self._start_patch("requests.get")

    def _start_patch(self, attribute):
        """Patch ``MODULE.<attribute>`` for the duration of the current test."""
        patcher = patch(f"{self.MODULE}.{attribute}")
        # addCleanup guarantees the patch is undone even if the test fails.
        self.addCleanup(patcher.stop)
        return patcher.start()

    def _set_feed(self, entries_xml=""):
        """Make the mocked ``requests.get`` return an Atom feed wrapping ``entries_xml``."""
        dummy_response = MagicMock()
        dummy_response.text = (
            '<?xml version="1.0" encoding="UTF-8"?>'
            f'<feed xmlns="http://www.w3.org/2005/Atom">{entries_xml}</feed>'
        )
        dummy_response.raise_for_status = MagicMock()
        self.mock_get.return_value = dummy_response

    def _run_tool(self, arxiv_ids=None, tool_call_id="test_tool_id"):
        """Invoke the tool (for ``ARXIV_ID`` unless told otherwise) and return the Command."""
        ids = [self.ARXIV_ID] if arxiv_ids is None else arxiv_ids
        return download_arxiv_paper.run({"arxiv_ids": ids, "tool_call_id": tool_call_id})

    def test_download_arxiv_paper_success(self):
        """test the download_arxiv_paper tool for successful retrieval of metadata and PDF URL."""
        arxiv_id = self.ARXIV_ID
        # A valid entry including a pdf link.
        self._set_feed(
            "<entry>"
            "<title>Sample Paper Title</title>"
            "<author><name>Author One</name></author>"
            "<author><name>Author Two</name></author>"
            "<summary>This is a sample abstract.</summary>"
            "<published>2020-01-01T00:00:00Z</published>"
            f'<link title="pdf" href="http://arxiv.org/pdf/{arxiv_id}v1"/>'
            "</entry>"
        )

        update = self._run_tool().update

        # Check that article_data was correctly set.
        self.assertIn("article_data", update)
        self.assertIn(arxiv_id, update["article_data"])
        metadata = update["article_data"][arxiv_id]
        self.assertEqual(metadata["Title"], "Sample Paper Title")
        self.assertEqual(metadata["Authors"], ["Author One", "Author Two"])
        self.assertEqual(metadata["Abstract"], "This is a sample abstract.")
        self.assertEqual(metadata["Publication Date"], "2020-01-01T00:00:00Z")
        self.assertEqual(metadata["URL"], f"http://arxiv.org/pdf/{arxiv_id}v1")
        self.assertEqual(metadata["pdf_url"], f"http://arxiv.org/pdf/{arxiv_id}v1")
        self.assertEqual(metadata["filename"], f"{arxiv_id}.pdf")
        self.assertEqual(metadata["source"], "arxiv")
        self.assertEqual(metadata["arxiv_id"], arxiv_id)

        # Check that the message content matches the summary format.
        messages = update["messages"]
        self.assertEqual(len(messages), 1)
        self.assertIsInstance(messages[0], ToolMessage)
        expected = (
            self.SUMMARY_HEADER
            + "Number of papers found: 1\n"
            + "Top 3 papers:\n"
            + "1. Sample Paper Title (2020-01-01T00:00:00Z)\n"
            + f" View PDF: http://arxiv.org/pdf/{arxiv_id}v1\n"
            + " Abstract snippet: This is a sample abstract."
        )
        self.assertEqual(messages[0].content, expected)

    def test_no_entry_found(self):
        """test the download_arxiv_paper tool for no entry found in XML response."""
        # Feed with no entry element.
        self._set_feed()

        # No entry found should result in empty article_data and header-only summary
        update = self._run_tool().update
        self.assertIn("article_data", update)
        self.assertEqual(update["article_data"], {})
        messages = update.get("messages", [])
        self.assertEqual(len(messages), 1)
        expected = self.SUMMARY_HEADER + "Number of papers found: 0\n" + "Top 3 papers:\n"
        self.assertEqual(messages[0].content, expected)

    def test_no_pdf_url_found(self):
        """test the download_arxiv_paper tool for no PDF URL found in XML response."""
        # An entry that does not contain a pdf link.
        self._set_feed(
            "<entry>"
            "<title>Sample Paper Title</title>"
            "<author><name>Author One</name></author>"
            "<summary>This is a sample abstract.</summary>"
            "<published>2020-01-01T00:00:00Z</published>"
            "</entry>"
        )

        with self.assertRaises(RuntimeError) as context:
            self._run_tool()
        self.assertEqual(
            str(context.exception),
            f"Could not find PDF URL for arXiv ID {self.ARXIV_ID}",
        )

    def test_summary_multiple_papers(self):
        """Test that with five papers the summary reports 5 but lists only the top 3."""

        # Simulate metadata extraction for multiple papers
        def dummy_meta(_entry, _ns, aid):
            """dummy metadata extraction function."""
            return {
                "Title": f"T{aid}",
                "Publication Date": "2020-01-01T00:00:00Z",
                "URL": f"u{aid}v1",
            }

        self._start_patch("fetch_arxiv_metadata")
        mock_extract = self._start_patch("extract_metadata")
        mock_extract.side_effect = dummy_meta

        # Prepare 5 paper IDs
        ids = [str(i) for i in range(5)]
        result = self._run_tool(arxiv_ids=ids, tool_call_id="tid")
        summary = result.update["messages"][0].content

        # Should report total count of 5 and list only top 3 without ellipsis
        assert "Number of papers found: 5" in summary
        assert "Top 3 papers:" in summary
        # Entries for first three IDs should include URL and no ellipsis
        assert "1. T0 (2020-01-01T00:00:00Z)" in summary
        assert " View PDF: u0v1" in summary
        assert "3. T2 (2020-01-01T00:00:00Z)" in summary
        assert "...and" not in summary
|
235
|
-
|
236
|
-
|
237
|
-
@pytest.mark.parametrize(
    ("input_text", "expected"),
    (
        ("", ""),
        ("N/A", ""),
        ("Just one sentence", "Just one sentence."),
        ("First. Second", "First. Second."),
        ("Hello. World.", "Hello. World."),
    ),
)
def test_get_snippet_various(input_text, expected):
    """Check _get_snippet on empty, placeholder, one-sentence and two-sentence abstracts."""
    snippet = _get_snippet(input_text)
    assert snippet == expected
|
@@ -1,177 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
Tool for downloading arXiv paper metadata and retrieving the PDF URL.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import logging
|
7
|
-
import xml.etree.ElementTree as ET
|
8
|
-
from typing import Annotated, Any, List
|
9
|
-
|
10
|
-
import hydra
|
11
|
-
import requests
|
12
|
-
from langchain_core.messages import ToolMessage
|
13
|
-
from langchain_core.tools import tool
|
14
|
-
from langchain_core.tools.base import InjectedToolCallId
|
15
|
-
from langgraph.types import Command
|
16
|
-
from pydantic import BaseModel, Field
|
17
|
-
|
18
|
-
# Configure logging
|
19
|
-
logging.basicConfig(level=logging.INFO)
|
20
|
-
logger = logging.getLogger(__name__)
|
21
|
-
|
22
|
-
|
23
|
-
class DownloadArxivPaperInput(BaseModel):
    """Input schema for the arXiv paper download tool."""

    # One or more arXiv identifiers; the tool looks each of them up in turn.
    arxiv_ids: List[str] = Field(
        description=(
            "List of arXiv paper IDs used to retrieve "
            "paper details and PDF URLs."
        ),
    )
    # Annotated with InjectedToolCallId; the tool passes it on to its ToolMessage.
    tool_call_id: Annotated[str, InjectedToolCallId]
|
30
|
-
|
31
|
-
|
32
|
-
# Helper to load arXiv download configuration
|
33
|
-
def _get_arxiv_config() -> Any:
    """Compose the Hydra config and return its ``tools.download_arxiv_paper`` node."""
    hydra_context = hydra.initialize(version_base=None, config_path="../../configs")
    with hydra_context:
        composed = hydra.compose(
            config_name="config",
            overrides=["tools/download_arxiv_paper=default"],
        )
    return composed.tools.download_arxiv_paper
|
40
|
-
|
41
|
-
|
42
|
-
def fetch_arxiv_metadata(
    api_url: str, arxiv_id: str, request_timeout: int
) -> ET.Element:
    """
    Query the arXiv API for a single paper and return the parsed XML root.

    Args:
        api_url: Base URL of the arXiv query endpoint.
        arxiv_id: Identifier placed in the ``id:`` search query.
        request_timeout: Timeout passed to ``requests.get``.

    Returns:
        The root element of the response body parsed with ElementTree.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status`` on a bad status.
    """
    reply = requests.get(
        f"{api_url}?search_query=id:{arxiv_id}&start=0&max_results=1",
        timeout=request_timeout,
    )
    reply.raise_for_status()
    root = ET.fromstring(reply.text)
    return root
|
50
|
-
|
51
|
-
|
52
|
-
def _entry_text(entry: ET.Element, tag: str, ns: dict) -> str:
    """Return the stripped text of the first ``tag`` child of ``entry``, or "N/A" if absent."""
    elem = entry.find(tag, ns)
    if elem is None:
        return "N/A"
    # An element that exists but is empty has ``text is None``; report it as "".
    return (elem.text or "").strip()


def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
    """
    Extract metadata from the XML entry.

    Args:
        entry: The ``atom:entry`` element describing one paper.
        ns: Namespace map that binds the ``atom`` prefix used in the lookups.
        arxiv_id: Identifier of the paper; used for the filename, the
            ``arxiv_id`` field and the error message.

    Returns:
        A dict with the keys ``Title``, ``Authors`` (list of names),
        ``Abstract``, ``Publication Date``, ``URL``, ``pdf_url``, ``filename``,
        ``source`` and ``arxiv_id``. Missing title/summary/published elements
        are reported as "N/A".

    Raises:
        RuntimeError: If the entry has no link whose ``title`` attribute is "pdf".
    """
    # Authors without a <name>, or with an empty one, are skipped.
    authors = []
    for author_elem in entry.findall("atom:author", ns):
        name_elem = author_elem.find("atom:name", ns)
        if name_elem is not None and name_elem.text:
            authors.append(name_elem.text.strip())

    # The first link titled "pdf" wins.
    pdf_url = next(
        (
            link.attrib.get("href")
            for link in entry.findall("atom:link", ns)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )
    if not pdf_url:
        raise RuntimeError(f"Could not find PDF URL for arXiv ID {arxiv_id}")

    return {
        "Title": _entry_text(entry, "atom:title", ns),
        "Authors": authors,
        "Abstract": _entry_text(entry, "atom:summary", ns),
        "Publication Date": _entry_text(entry, "atom:published", ns),
        "URL": pdf_url,
        "pdf_url": pdf_url,
        "filename": f"{arxiv_id}.pdf",
        "source": "arxiv",
        "arxiv_id": arxiv_id,
    }
|
93
|
-
|
94
|
-
|
95
|
-
def _get_snippet(abstract: str) -> str:
|
96
|
-
"""Extract the first one or two sentences from an abstract."""
|
97
|
-
if not abstract or abstract == "N/A":
|
98
|
-
return ""
|
99
|
-
sentences = abstract.split(". ")
|
100
|
-
snippet_sentences = sentences[:2]
|
101
|
-
snippet = ". ".join(snippet_sentences)
|
102
|
-
if not snippet.endswith("."):
|
103
|
-
snippet += "."
|
104
|
-
return snippet
|
105
|
-
|
106
|
-
|
107
|
-
def _build_summary(article_data: dict[str, Any]) -> str:
    """
    Render the tool's result message for the downloaded papers.

    The message states how many papers were found and then lists the first
    three of them (in ``article_data`` order) with title, publication date
    and, when available, the PDF URL and an abstract snippet.
    """

    def render(position: int, paper: dict) -> str:
        """Format one numbered entry of the list."""
        parts = [
            f"{position}. {paper.get('Title', 'N/A')} "
            f"({paper.get('Publication Date', 'N/A')})"
        ]
        link = paper.get("URL", "")
        if link:
            parts.append(f" View PDF: {link}")
        excerpt = _get_snippet(paper.get("Abstract", ""))
        if excerpt:
            parts.append(f" Abstract snippet: {excerpt}")
        return "\n".join(parts)

    leading = list(article_data.values())[:3]
    listing = "\n".join(
        render(number, paper) for number, paper in enumerate(leading, start=1)
    )
    header = (
        "Download was successful. Papers metadata are attached as an artifact. "
        "Here is a summary of the results:\n"
        f"Number of papers found: {len(article_data)}\n"
        "Top 3 papers:\n"
    )
    return header + listing
|
129
|
-
|
130
|
-
|
131
|
-
@tool(
    args_schema=DownloadArxivPaperInput,
    parse_docstring=True,
)
def download_arxiv_paper(
    arxiv_ids: List[str],
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    Get metadata and PDF URLs for one or more arXiv papers using their unique arXiv IDs.
    """
    logger.info("Fetching metadata from arXiv for paper IDs: %s", arxiv_ids)

    # Endpoint and timeout come from the Hydra tool configuration.
    settings = _get_arxiv_config()
    api_url = settings.api_url
    request_timeout = settings.request_timeout

    atom_ns = {"atom": "http://www.w3.org/2005/Atom"}

    # Collect metadata per ID; IDs with no matching entry are skipped.
    article_data: dict[str, Any] = {}
    for paper_id in arxiv_ids:
        logger.info("Processing arXiv ID: %s", paper_id)
        feed = fetch_arxiv_metadata(api_url, paper_id, request_timeout)
        entry = feed.find("atom:entry", atom_ns)
        if entry is None:
            logger.warning("No entry found for arXiv ID %s", paper_id)
            continue
        article_data[paper_id] = extract_metadata(entry, atom_ns, paper_id)

    # The summary is the message text; the full metadata travels as the artifact.
    message = ToolMessage(
        content=_build_summary(article_data),
        tool_call_id=tool_call_id,
        artifact=article_data,
    )
    return Command(
        update={
            "article_data": article_data,
            "messages": [message],
        }
    )
|
@@ -1,114 +0,0 @@
|
|
1
|
-
#!/usr/bin/env python3
|
2
|
-
"""
|
3
|
-
Tool for downloading bioRxiv paper metadata and retrieving the PDF URL.
|
4
|
-
"""
|
5
|
-
|
6
|
-
import logging
|
7
|
-
from typing import Annotated, Any
|
8
|
-
|
9
|
-
import hydra
|
10
|
-
import requests
|
11
|
-
from langchain_core.messages import ToolMessage
|
12
|
-
from langchain_core.tools import tool
|
13
|
-
from langchain_core.tools.base import InjectedToolCallId
|
14
|
-
from langgraph.types import Command
|
15
|
-
from pydantic import BaseModel, Field
|
16
|
-
|
17
|
-
# Configure logging
|
18
|
-
logging.basicConfig(level=logging.INFO)
|
19
|
-
logger = logging.getLogger(__name__)
|
20
|
-
|
21
|
-
|
22
|
-
class DownloadBiorxivPaperInput(BaseModel):
    """Input schema for the bioRxiv paper download tool."""

    # DOI of the paper to look up.
    # NOTE(review): the original literal spans two lines; any indentation that
    # followed the line break is not recoverable from this view - confirm.
    doi: str = Field(
        description=(
            "The bioRxiv DOI, from search_helper or multi_helper or single_helper,\n"
            "used to retrieve the paper details and PDF URL."
        ),
    )
    # Annotated with InjectedToolCallId; the tool passes it on to its ToolMessage.
    tool_call_id: Annotated[str, InjectedToolCallId]
|
30
|
-
|
31
|
-
|
32
|
-
def fetch_biorxiv_metadata(doi: str, api_url: str, request_timeout: int) -> dict:
    """
    Fetch the metadata record for a bioRxiv paper using its DOI.

    Parameters:
        doi (str): The DOI of the bioRxiv paper, optionally ending in a
            version suffix such as "v1".
        api_url (str): Base URL of the API; the DOI is appended to it directly.
        request_timeout (int): Timeout passed to ``requests.get``.

    Returns:
        dict: The first record of the response's "collection" list.

    Raises:
        ValueError: If the response has no (or an empty) "collection".
        requests.HTTPError: Propagated from ``raise_for_status`` on a bad status.
    """
    # Strip only a trailing version marker ("v" followed by digits). Splitting
    # on every "v" would truncate any DOI that contains the letter elsewhere.
    base, marker, version = doi.rpartition("v")
    clean_doi = base if marker and version.isdigit() else doi

    query_url = f"{api_url}{clean_doi}"
    logger.info("Fetching metadata from api url: %s", query_url)
    response = requests.get(query_url, timeout=request_timeout)
    response.raise_for_status()

    # Parse the body once and reuse it.
    collection = response.json().get("collection")
    if not collection:
        raise ValueError(f"No metadata found for DOI: {doi}")

    return collection[0]
|
57
|
-
|
58
|
-
|
59
|
-
def extract_metadata(paper: dict, doi: str) -> dict:
    """
    Extract relevant metadata fields from a bioRxiv paper entry.

    Parameters:
        paper (dict): One record of the API response; the keys "title",
            "authors", "abstract", "date" and "doi" are read, each optional.
        doi (str): The DOI the caller asked for; stored as ``biorxiv_id`` and
            used to build the PDF URL when the record carries no DOI itself.

    Returns:
        dict: ``Title``, ``Authors``, ``Abstract``, ``Publication Date``,
        ``URL``, ``pdf_url``, ``filename``, ``source`` and ``biorxiv_id``.
    """
    # Fall back to the requested DOI so a record without "doi" (or with a
    # null one) does not yield ".../10.1101/.full.pdf" and ".pdf".
    record_doi = paper.get("doi") or doi
    doi_suffix = record_doi.split("10.1101/")[-1]
    pdf_url = f"https://www.biorxiv.org/content/10.1101/{doi_suffix}.full.pdf"
    logger.info("PDF URL: %s", pdf_url)
    return {
        "Title": paper.get("title", ""),
        "Authors": paper.get("authors", ""),
        "Abstract": paper.get("abstract", ""),
        "Publication Date": paper.get("date", ""),
        "URL": pdf_url,
        "pdf_url": pdf_url,
        "filename": f"{doi_suffix}.pdf",
        "source": "biorxiv",
        "biorxiv_id": doi,
    }
|
81
|
-
|
82
|
-
|
83
|
-
@tool(args_schema=DownloadBiorxivPaperInput, parse_docstring=True)
def download_biorxiv_paper(
    doi: str,
    tool_call_id: Annotated[str, InjectedToolCallId],
) -> Command[Any]:
    """
    Get metadata and PDF URL for a bioRxiv paper using its DOI.
    """
    logger.info("Fetching metadata from bioRxiv for DOI: %s", doi)

    # Endpoint and timeout come from the Hydra tool configuration.
    with hydra.initialize(version_base=None, config_path="../../configs"):
        composed = hydra.compose(
            config_name="config",
            overrides=["tools/download_biorxiv_paper=default"],
        )
        api_url = composed.tools.download_biorxiv_paper.api_url
        request_timeout = composed.tools.download_biorxiv_paper.request_timeout
    logger.info("API URL: %s", api_url)
    logger.info("Request Timeout: %s", request_timeout)

    # Look the paper up and key its metadata by the DOI that was requested.
    record = fetch_biorxiv_metadata(doi, api_url, request_timeout)
    article_data = {doi: extract_metadata(record, doi)}

    confirmation = ToolMessage(
        content=f"Successfully retrieved metadata and PDF URL for bioRxiv DOI {doi}",
        tool_call_id=tool_call_id,
    )
    return Command(
        update={
            "article_data": article_data,
            "messages": [confirmation],
        }
    )
|