aiagents4pharma 1.42.0__py3-none-any.whl → 1.44.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +17 -2
- aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +618 -413
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +362 -25
- aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +146 -109
- aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +240 -83
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/METADATA +3 -1
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/RECORD +36 -33
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,534 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for MedrxivDownloader.
|
3
|
+
Tests JSON API interaction, PDF URL construction, and metadata extraction.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import json
|
7
|
+
import unittest
|
8
|
+
from unittest.mock import Mock, patch
|
9
|
+
|
10
|
+
import requests
|
11
|
+
|
12
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.utils.medrxiv_downloader import (
|
13
|
+
MedrxivDownloader,
|
14
|
+
)
|
15
|
+
|
16
|
+
|
17
|
+
# ---- Test-only shim to access protected helpers without pylint W0212 ----
|
18
|
+
class MedrxivDownloaderTestShim(MedrxivDownloader):
    """Test-only subclass that exposes protected helpers via public wrappers.

    Every ``*_public`` method forwards its arguments unchanged to the
    matching underscore-prefixed helper, so the tests never have to access
    protected members directly.
    """

    __test__ = False  # prevent pytest collection

    def extract_basic_metadata_public(self, paper, identifier):
        """Delegate to ``_extract_basic_metadata`` and hand back its result."""
        basic_metadata = self._extract_basic_metadata(paper, identifier)
        return basic_metadata

    def extract_authors_public(self, authors_str):
        """Delegate to ``_extract_authors`` and hand back its result."""
        authors = self._extract_authors(authors_str)
        return authors

    def extract_pdf_metadata_public(self, pdf_result, identifier):
        """Delegate to ``_extract_pdf_metadata`` and hand back its result."""
        pdf_metadata = self._extract_pdf_metadata(pdf_result, identifier)
        return pdf_metadata

    def get_paper_identifier_info_public(self, paper):
        """Delegate to ``_get_paper_identifier_info`` and hand back its result."""
        identifier_info = self._get_paper_identifier_info(paper)
        return identifier_info

    def add_service_identifier_public(self, entry, identifier):
        """Delegate to ``_add_service_identifier``; nothing is returned."""
        self._add_service_identifier(entry, identifier)
|
42
|
+
|
43
|
+
|
44
|
+
class TestMedrxivDownloader(unittest.TestCase):
    """Unit tests for the individual MedrxivDownloader methods."""

    def setUp(self):
        """Create a mocked config, the downloader shim, and a sample payload."""
        config = Mock()
        config.api_url = "https://api.medrxiv.org/details"
        config.request_timeout = 30
        config.chunk_size = 8192
        config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        config.default_version = "1"
        self.mock_config = config

        self.downloader = MedrxivDownloaderTestShim(config)

        # One-paper payload used as the canned medRxiv API response.
        sample_paper = {
            "title": "Test MedRxiv Paper",
            "authors": "John Doe; Jane Smith",
            "abstract": "This is a test abstract for medRxiv paper.",
            "date": "2023-01-01",
            "category": "Infectious Diseases",
            "version": "1",
            "doi": "10.1101/2023.01.01.123456",
        }
        self.sample_json_response = {"collection": [sample_paper]}

    def test_initialization(self):
        """The downloader exposes the values supplied by the config."""
        downloader = self.downloader
        self.assertEqual(downloader.api_url, "https://api.medrxiv.org/details")
        self.assertEqual(downloader.request_timeout, 30)
        self.assertEqual(downloader.chunk_size, 8192)

    @patch("requests.get")
    def test_fetch_metadata_success(self, mock_get):
        """fetch_metadata requests the expected URL and returns the parsed JSON."""
        api_response = Mock()
        api_response.json.return_value = self.sample_json_response
        api_response.raise_for_status = Mock()
        mock_get.return_value = api_response

        fetched = self.downloader.fetch_metadata("10.1101/2023.01.01.123456")

        # The request URL must carry the /medrxiv/ segment and the /na/json tail.
        mock_get.assert_called_once_with(
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.123456/na/json",
            timeout=30,
        )
        api_response.raise_for_status.assert_called_once()

        # The decoded payload is handed back untouched.
        self.assertEqual(fetched, self.sample_json_response)

    @patch("requests.get")
    def test_fetch_metadata_network_error(self, mock_get):
        """A RequestException raised by requests.get propagates to the caller."""
        mock_get.side_effect = requests.RequestException("Network error")

        self.assertRaises(
            requests.RequestException,
            self.downloader.fetch_metadata,
            "10.1101/2023.01.01.123456",
        )

    @patch("requests.get")
    def test_fetch_metadata_json_decode_error(self, mock_get):
        """A JSONDecodeError raised while decoding propagates to the caller."""
        api_response = Mock()
        api_response.json.side_effect = json.JSONDecodeError("Invalid JSON", "", 0)
        api_response.raise_for_status = Mock()
        mock_get.return_value = api_response

        self.assertRaises(
            json.JSONDecodeError,
            self.downloader.fetch_metadata,
            "10.1101/2023.01.01.123456",
        )

    def test_construct_pdf_url_variants(self):
        """PDF URL construction: normal, missing/empty collection, custom version."""
        doi = "10.1101/2023.01.01.123456"
        scenarios = [
            # Success
            (
                self.sample_json_response,
                "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
            ),
            # No collection
            ({}, ""),
            # Empty collection
            ({"collection": []}, ""),
            # Custom version
            (
                {"collection": [{"title": "Test Paper", "version": "3"}]},
                "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v3.full.pdf",
            ),
        ]

        for metadata, expected_url in scenarios:
            built_url = self.downloader.construct_pdf_url(metadata, doi)
            self.assertEqual(built_url, expected_url)

    def test_extract_paper_metadata_success(self):
        """Full metadata extraction merges paper fields with PDF download info."""
        downloaded_pdf = ("/tmp/paper.pdf", "medrxiv_paper.pdf")

        result = self.downloader.extract_paper_metadata(
            self.sample_json_response, "10.1101/2023.01.01.123456", downloaded_pdf
        )

        expected = {
            "Title": "Test MedRxiv Paper",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for medRxiv paper.",
            "Publication Date": "2023-01-01",
            "DOI": "10.1101/2023.01.01.123456",
            "Category": "Infectious Diseases",
            "Version": "1",
            "source": "medrxiv",
            "server": "medrxiv",
            "URL": "/tmp/paper.pdf",
            "pdf_url": "/tmp/paper.pdf",
            "filename": "medrxiv_paper.pdf",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/paper.pdf",
        }
        self.assertEqual(result, expected)

    def test_extract_paper_metadata_no_pdf(self):
        """Without a PDF result the entry is flagged as a failed download."""
        filename_patch = patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        )
        with filename_patch:
            result = self.downloader.extract_paper_metadata(
                self.sample_json_response, "10.1101/2023.01.01.123456", None
            )

        self.assertEqual(result["Title"], "Test MedRxiv Paper")
        self.assertEqual(result["URL"], "")
        self.assertEqual(result["access_type"], "download_failed")
        self.assertEqual(result["filename"], "default.pdf")

    def test_extract_paper_metadata_no_collection(self):
        """Metadata lacking a collection raises RuntimeError with a clear message."""
        with self.assertRaises(RuntimeError) as raised:
            self.downloader.extract_paper_metadata(
                {}, "10.1101/2023.01.01.123456", None
            )

        self.assertIn("No collection data found", str(raised.exception))

    def test_extract_basic_metadata_variants(self):
        """Basic metadata extraction: complete and missing fields."""
        # Complete record
        complete_paper = self.sample_json_response["collection"][0]
        complete_result = self.downloader.extract_basic_metadata_public(
            complete_paper, "10.1101/2023.01.01.123456"
        )
        self.assertEqual(
            complete_result,
            {
                "Title": "Test MedRxiv Paper",
                "Authors": ["John Doe", "Jane Smith"],
                "Abstract": "This is a test abstract for medRxiv paper.",
                "Publication Date": "2023-01-01",
                "DOI": "10.1101/2023.01.01.123456",
                "Category": "Infectious Diseases",
                "Version": "1",
                "source": "medrxiv",
                "server": "medrxiv",
            },
        )

        # Record carrying only a title
        sparse_result = self.downloader.extract_basic_metadata_public(
            {"title": "Test Paper"}, "10.1101/test"
        )
        self.assertEqual(sparse_result["Title"], "Test Paper")
        self.assertEqual(sparse_result["Authors"], [])
        self.assertEqual(sparse_result["Abstract"], "N/A")
        self.assertEqual(sparse_result["Category"], "N/A")

    def test_extract_authors_variants(self):
        """Author parsing from semicolon string, empty, and whitespace-heavy inputs."""
        scenarios = [
            (
                "John Doe; Jane Smith; Bob Johnson",
                ["John Doe", "Jane Smith", "Bob Johnson"],
            ),
            ("", []),
            (" John Doe ; Jane Smith ; ", ["John Doe", "Jane Smith"]),
        ]

        for raw_authors, expected_authors in scenarios:
            parsed = self.downloader.extract_authors_public(raw_authors)
            self.assertEqual(parsed, expected_authors)

    def test_extract_pdf_metadata_variants(self):
        """PDF metadata: with and without download result."""
        # A (path, filename) tuple marks a successful download.
        with_pdf = self.downloader.extract_pdf_metadata_public(
            ("/tmp/test.pdf", "paper.pdf"), "10.1101/test"
        )
        self.assertEqual(
            with_pdf,
            {
                "URL": "/tmp/test.pdf",
                "pdf_url": "/tmp/test.pdf",
                "filename": "paper.pdf",
                "access_type": "open_access_downloaded",
                "temp_file_path": "/tmp/test.pdf",
            },
        )

        # None marks a failed download; the filename comes from the patched default.
        expected_without = {
            "URL": "",
            "pdf_url": "",
            "filename": "default.pdf",
            "access_type": "download_failed",
            "temp_file_path": "",
        }
        with patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        ):
            without_pdf = self.downloader.extract_pdf_metadata_public(
                None, "10.1101/test"
            )
            self.assertEqual(without_pdf, expected_without)

    def test_service_and_identifier_helpers(self):
        """Service, identifier, and default filename helpers."""
        downloader = self.downloader
        self.assertEqual(downloader.get_service_name(), "medRxiv")
        self.assertEqual(downloader.get_identifier_name(), "DOI")

        default_name = downloader.get_default_filename("10.1101/2023.01.01.123456")
        self.assertEqual(default_name, "10_1101_2023_01_01_123456.pdf")

    def test_get_paper_identifier_info(self):
        """The identifier summary mentions the DOI, date, and category."""
        paper = {
            "DOI": "10.1101/2023.01.01.123456",
            "Publication Date": "2023-01-01",
            "Category": "Medicine",
        }

        summary = self.downloader.get_paper_identifier_info_public(paper)

        for fragment in ("10.1101/2023.01.01.123456", "2023-01-01", "Medicine"):
            self.assertIn(fragment, summary)

    def test_add_service_identifier(self):
        """The helper writes the DOI and server keys into the given entry."""
        entry = {}

        self.downloader.add_service_identifier_public(
            entry, "10.1101/2023.01.01.123456"
        )

        self.assertEqual(entry["DOI"], "10.1101/2023.01.01.123456")
        self.assertEqual(entry["server"], "medrxiv")
|
310
|
+
|
311
|
+
|
312
|
+
class TestMedrxivDownloaderIntegration(unittest.TestCase):
    """Integration tests that chain several MedrxivDownloader steps together.

    Network access and the PDF download are always mocked; only the
    interaction between the downloader's own methods is exercised.
    """

    def setUp(self):
        """Set up a mocked config, the downloader shim, and a sample response."""
        self.mock_config = Mock()
        self.mock_config.api_url = "https://api.medrxiv.org/details"
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192
        self.mock_config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        self.mock_config.default_version = "1"

        self.downloader = MedrxivDownloaderTestShim(self.mock_config)

        # Version "2" lets the workflow test tell the API-provided version
        # apart from the configured default ("1").
        self.sample_response = {
            "collection": [
                {
                    "title": "Integration Test Paper",
                    "authors": "Test Author",
                    "abstract": "Integration test abstract.",
                    "date": "2023-01-01",
                    "category": "Medicine",
                    "version": "2",
                    "doi": "10.1101/2023.01.01.123456",
                }
            ]
        }

    @patch(
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "medrxiv_downloader.MedrxivDownloader.download_pdf_to_temp"
    )
    @patch("requests.get")
    def test_full_paper_processing_workflow(self, mock_get, mock_download):
        """Test the complete workflow from DOI to processed paper data."""
        # Mock API response
        mock_response = Mock()
        mock_response.json.return_value = self.sample_response
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        # Mock PDF download
        mock_download.return_value = ("/tmp/paper.pdf", "medrxiv_paper.pdf")

        identifier = "10.1101/2023.01.01.123456"

        # Step 1: Fetch metadata
        metadata = self.downloader.fetch_metadata(identifier)

        # Step 2: Construct PDF URL
        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)

        # Step 3: Download PDF (patched, returns the tuple configured above)
        pdf_result = self.downloader.download_pdf_to_temp(pdf_url, identifier)

        # Step 4: Extract metadata
        paper_data = self.downloader.extract_paper_metadata(
            metadata, identifier, pdf_result
        )

        # Verify the complete workflow
        self.assertEqual(paper_data["Title"], "Integration Test Paper")
        self.assertEqual(paper_data["Authors"], ["Test Author"])
        self.assertEqual(paper_data["access_type"], "open_access_downloaded")
        self.assertEqual(paper_data["filename"], "medrxiv_paper.pdf")
        self.assertEqual(paper_data["temp_file_path"], "/tmp/paper.pdf")

        # Verify method calls
        mock_get.assert_called_once_with(
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.123456/na/json",
            timeout=30,
        )
        expected_pdf_url = (
            "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v2.full.pdf"
        )
        mock_download.assert_called_once_with(expected_pdf_url, identifier)

    @patch("requests.get")
    def test_error_handling_workflow(self, mock_get):
        """A request failure during fetch_metadata propagates to the caller."""
        mock_get.side_effect = requests.RequestException("API error")

        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("10.1101/2023.01.01.123456")

    @patch("requests.get")
    def test_workflow_with_empty_collection(self, mock_get):
        """An empty collection in the API response makes fetch_metadata raise."""
        mock_response = Mock()
        mock_response.json.return_value = {"collection": []}
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        identifier = "10.1101/2023.01.01.123456"

        with self.assertRaises(RuntimeError) as context:
            self.downloader.fetch_metadata(identifier)

        self.assertIn(
            "No collection data found in medRxiv API response", str(context.exception)
        )

    @patch("requests.get")
    def test_multiple_identifiers_workflow(self, mock_get):
        """Test processing multiple identifiers one after another."""
        # One canned API payload per DOI, served in request order.
        responses = [
            {
                "collection": [
                    {"title": "Paper 1", "version": "1", "authors": "Author 1"}
                ]
            },
            {
                "collection": [
                    {"title": "Paper 2", "version": "2", "authors": "Author 2"}
                ]
            },
        ]

        mock_responses = []
        for response in responses:
            mock_resp = Mock()
            mock_resp.json.return_value = response
            mock_resp.raise_for_status = Mock()
            mock_responses.append(mock_resp)

        # A list side_effect hands out one mock response per requests.get call.
        mock_get.side_effect = mock_responses

        identifiers = ["10.1101/2023.01.01.111111", "10.1101/2023.01.01.222222"]
        results = {}

        for identifier in identifiers:
            metadata = self.downloader.fetch_metadata(identifier)
            _ = self.downloader.construct_pdf_url(
                metadata, identifier
            )  # ensure path covered
            paper_data = self.downloader.extract_paper_metadata(
                metadata, identifier, None
            )
            results[identifier] = paper_data

        # Verify both papers were processed
        self.assertEqual(len(results), 2)
        self.assertEqual(results["10.1101/2023.01.01.111111"]["Title"], "Paper 1")
        self.assertEqual(results["10.1101/2023.01.01.222222"]["Title"], "Paper 2")

        # Verify the API was hit exactly once per identifier, in order, with
        # the right URL. Comparing whole lists also catches duplicated or
        # swapped requests, which a per-item membership check would miss.
        self.assertEqual(mock_get.call_count, 2)
        expected_calls = [
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.111111/na/json",
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.222222/na/json",
        ]
        actual_urls = [call.args[0] for call in mock_get.call_args_list]
        self.assertEqual(actual_urls, expected_calls)
|
473
|
+
|
474
|
+
|
475
|
+
class TestMedrxivSpecialCases(unittest.TestCase):
    """Tests for special cases and edge conditions."""

    def setUp(self):
        """Set up a mocked config and the downloader shim."""
        self.mock_config = Mock()
        self.mock_config.api_url = "https://api.medrxiv.org/details"
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192
        self.mock_config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        self.mock_config.default_version = "1"

        self.downloader = MedrxivDownloaderTestShim(self.mock_config)

    def test_filename_generation_special_characters(self):
        """Test filename generation with special characters in DOI."""
        doi_with_special_chars = "10.1101/2023.01.01.123456/special-chars_test"

        result = self.downloader.get_default_filename(doi_with_special_chars)

        # Dots and slashes become underscores; hyphens and underscores survive.
        self.assertEqual(result, "10_1101_2023_01_01_123456_special-chars_test.pdf")

    def test_version_handling_edge_cases(self):
        """Test PDF URL construction with various version formats."""
        test_cases = [
            ({"collection": [{"version": ""}]}, "v.full.pdf"),  # Empty version
            ({"collection": [{"version": None}]}, "vNone.full.pdf"),  # None version
            ({"collection": [{}]}, "v1.full.pdf"),  # Missing version key defaults to 1
        ]

        for metadata, expected_suffix in test_cases:
            # subTest reports every failing case individually instead of
            # stopping at the first one.
            with self.subTest(metadata=metadata, expected_suffix=expected_suffix):
                result = self.downloader.construct_pdf_url(metadata, "10.1101/test")
                self.assertTrue(
                    result.endswith(expected_suffix),
                    f"{result!r} does not end with {expected_suffix!r}",
                )

    def test_metadata_extraction_unicode_handling(self):
        """Test metadata extraction with Unicode characters."""
        metadata = {
            "collection": [
                {
                    "title": "Título com acentos é símbolos especiais",
                    "authors": "José María; François Müller",
                    "abstract": "Resumo com çaracteres especiais ñ símbolos",
                    "date": "2023-01-01",
                    "category": "Médecine",
                    "version": "1",
                }
            ]
        }

        result = self.downloader.extract_paper_metadata(metadata, "10.1101/test", None)

        # Non-ASCII text must come back unchanged.
        self.assertEqual(result["Title"], "Título com acentos é símbolos especiais")
        self.assertEqual(result["Authors"], ["José María", "François Müller"])
        self.assertEqual(
            result["Abstract"], "Resumo com çaracteres especiais ñ símbolos"
        )
|
@@ -13,8 +13,9 @@ def mock_hydra_fixture():
|
|
13
13
|
"""Mocks Hydra configuration for tests."""
|
14
14
|
with mock.patch("hydra.initialize"), mock.patch("hydra.compose") as mock_compose:
|
15
15
|
cfg_mock = mock.MagicMock()
|
16
|
-
cfg_mock.agents.talk2scholars.
|
17
|
-
|
16
|
+
cfg_mock.agents.talk2scholars.paper_download_agent.paper_download_agent = (
|
17
|
+
"Test prompt"
|
18
|
+
)
|
18
19
|
mock_compose.return_value = cfg_mock
|
19
20
|
yield mock_compose
|
20
21
|
|
@@ -23,13 +24,12 @@ def mock_hydra_fixture():
|
|
23
24
|
def mock_tools_fixture():
|
24
25
|
"""Mocks paper download tools to prevent real HTTP calls."""
|
25
26
|
with mock.patch(
|
26
|
-
"aiagents4pharma.talk2scholars.tools.paper_download."
|
27
|
-
|
28
|
-
|
29
|
-
mock_download_arxiv_paper.return_value = {
|
27
|
+
"aiagents4pharma.talk2scholars.tools.paper_download.paper_downloader.download_papers"
|
28
|
+
) as mock_download_papers:
|
29
|
+
mock_download_papers.return_value = {
|
30
30
|
"article_data": {"dummy_key": "dummy_value"}
|
31
31
|
}
|
32
|
-
yield [
|
32
|
+
yield [mock_download_papers]
|
33
33
|
|
34
34
|
|
35
35
|
@pytest.mark.usefixtures("mock_hydra_fixture")
|
@@ -83,10 +83,12 @@ def test_paper_download_agent_invocation():
|
|
83
83
|
assert "article_data" in result
|
84
84
|
|
85
85
|
|
86
|
-
def test_paper_download_agent_tools_assignment(
|
87
|
-
|
86
|
+
def test_paper_download_agent_tools_assignment(
|
87
|
+
request,
|
88
|
+
):
|
89
|
+
"""Checks correct tool assignment (download_papers tool)."""
|
88
90
|
thread_id = "test_thread_paper_dl"
|
89
|
-
|
91
|
+
request.getfixturevalue("mock_tools_fixture")
|
90
92
|
llm_mock = mock.Mock(spec=BaseChatModel)
|
91
93
|
|
92
94
|
with (
|
@@ -100,12 +102,20 @@ def test_paper_download_agent_tools_assignment(request): # Keep fixture name
|
|
100
102
|
mock_agent = mock.Mock()
|
101
103
|
mock_create_agent.return_value = mock_agent
|
102
104
|
mock_tool_instance = mock.Mock()
|
103
|
-
mock_tool_instance.tools = mock_tools if mock_tools else []
|
104
105
|
mock_toolnode.return_value = mock_tool_instance
|
105
106
|
|
106
107
|
get_app(thread_id, llm_mock)
|
108
|
+
# Verify ToolNode was called with download_papers function
|
107
109
|
assert mock_toolnode.called
|
108
|
-
|
110
|
+
# Check that ToolNode was called with a list containing the download_papers tool
|
111
|
+
call_args = mock_toolnode.call_args[0][
|
112
|
+
0
|
113
|
+
] # Get first positional argument (the tools list)
|
114
|
+
assert len(call_args) == 1
|
115
|
+
# The tool should be a StructuredTool with name 'download_papers'
|
116
|
+
tool = call_args[0]
|
117
|
+
assert hasattr(tool, "name")
|
118
|
+
assert tool.name == "download_papers"
|
109
119
|
|
110
120
|
|
111
121
|
def test_paper_download_agent_hydra_failure():
|