aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1067 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for PubmedDownloader.
|
3
|
+
Tests PMID to PMCID conversion, XML parsing, and PDF URL extraction
|
4
|
+
from multiple sources. Uses a public shim to avoid accessing protected
|
5
|
+
members in tests.
|
6
|
+
"""
|
7
|
+
|
8
|
+
# pylint: disable=too-many-lines
|
9
|
+
import unittest
|
10
|
+
from types import SimpleNamespace
|
11
|
+
from unittest.mock import Mock, patch
|
12
|
+
|
13
|
+
import requests
|
14
|
+
|
15
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.utils.pubmed_downloader import (
|
16
|
+
BasePaperDownloader,
|
17
|
+
PubmedDownloader,
|
18
|
+
)
|
19
|
+
|
20
|
+
|
21
|
+
class PubmedDownloaderTestShim(PubmedDownloader):
    """Test-only subclass that re-exports protected helpers under public names.

    Each wrapper delegates straight to one underscore-prefixed method of the
    parent class, so the test cases can call and patch those helpers without
    touching protected members directly.
    """

    # Opt out of pytest collection: this is a helper class, not a test case.
    __test__ = False

    # --- public aliases for the individual PDF-source helpers ---

    def try_oa_api_public(self, pmcid: str) -> str:
        """Forward to ``_try_oa_api`` and return whatever it returns."""
        return self._try_oa_api(pmcid)

    def try_europe_pmc_public(self, pmcid: str) -> str:
        """Forward to ``_try_europe_pmc`` and return whatever it returns."""
        return self._try_europe_pmc(pmcid)

    def try_pmc_page_scraping_public(self, pmcid: str) -> str:
        """Forward to ``_try_pmc_page_scraping`` and return whatever it returns."""
        return self._try_pmc_page_scraping(pmcid)

    def try_direct_pmc_url_public(self, pmcid: str) -> str:
        """Forward to ``_try_direct_pmc_url`` and return whatever it returns."""
        return self._try_direct_pmc_url(pmcid)

    def fetch_pdf_url_with_fallbacks_public(self, pmcid: str) -> str:
        """Return the first truthy URL produced by the public wrappers.

        Sources are consulted in this order: OA API, Europe PMC, PMC page
        scraping, direct PMC URL. Evaluation stops at the first truthy
        result; "" is returned when every source comes back empty.
        NOTE(review): intended to mirror the production fallback order --
        confirm against ``_fetch_pdf_url_with_fallbacks``.
        """
        return (
            self.try_oa_api_public(pmcid)
            or self.try_europe_pmc_public(pmcid)
            or self.try_pmc_page_scraping_public(pmcid)
            or self.try_direct_pmc_url_public(pmcid)
            or ""
        )

    # IMPORTANT: this override routes through the shim's public chain so that
    # tests can patch the individual ``*_public`` steps.
    def construct_pdf_url(self, metadata, identifier):  # same signature
        """Resolve a PDF URL for the first record in ``metadata``.

        Returns "" when ``metadata`` has no (or empty) "records", or when the
        first record's "pmcid" is missing, empty, or "N/A". Otherwise the
        PMCID is handed to ``fetch_pdf_url_with_fallbacks_public``.
        ``identifier`` is accepted for signature compatibility and not used.
        """
        records = metadata.get("records")
        if not records:
            return ""
        pmcid = records[0].get("pmcid", "")
        if pmcid and pmcid != "N/A":
            return self.fetch_pdf_url_with_fallbacks_public(pmcid)
        return ""

    # --- public aliases for identifier helpers and the production chain ---

    def get_paper_identifier_info_public(self, paper):
        """Forward to ``_get_paper_identifier_info`` and return its result."""
        return self._get_paper_identifier_info(paper)

    def add_service_identifier_public(self, entry, identifier):
        """Forward to ``_add_service_identifier`` and return its result."""
        return self._add_service_identifier(entry, identifier)

    def fetch_pdf_url_with_fallbacks_production(self, pmcid: str) -> str:
        """Forward to the parent's ``_fetch_pdf_url_with_fallbacks``."""
        return self._fetch_pdf_url_with_fallbacks(pmcid)
|
78
|
+
|
79
|
+
|
80
|
+
class TestPubmedDownloaderBasics(unittest.TestCase):
    """Metadata fetching and OA API parsing, exercised through the shim."""

    def setUp(self):
        """Create a shim downloader with every endpoint URL populated."""
        config = SimpleNamespace(
            id_converter_url="https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            oa_api_url="https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            europe_pmc_base_url="https://www.ebi.ac.uk/europepmc/webservices/rest",
            pmc_page_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            direct_pmc_pdf_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            ftp_base_url="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            https_base_url="https://www.ncbi.nlm.nih.gov/pmc",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    def test_initialization(self):
        """The configured converter and OA API URLs are exposed as attributes."""
        downloader = self.downloader
        self.assertIn("idconv", downloader.id_converter_url)
        self.assertIn("oa.fcgi", downloader.oa_api_url)

    @patch("requests.get")
    def test_fetch_metadata_success(self, mock_get):
        """A successful lookup returns the records, including the PMCID."""
        payload = {
            "records": [{"pmid": "12345678", "pmcid": "PMC123456", "doi": "10.1/x"}]
        }
        mock_get.return_value = Mock(
            raise_for_status=Mock(), **{"json.return_value": payload}
        )

        metadata = self.downloader.fetch_metadata("12345678")

        mock_get.assert_called_once()
        self.assertIn("records", metadata)
        self.assertEqual(metadata["records"][0]["pmcid"], "PMC123456")

    @patch("requests.get")
    def test_fetch_metadata_no_records(self, mock_get):
        """An empty "records" list makes fetch_metadata raise RuntimeError."""
        mock_get.return_value = Mock(
            raise_for_status=Mock(), **{"json.return_value": {"records": []}}
        )
        with self.assertRaises(RuntimeError):
            self.downloader.fetch_metadata("12345678")

    @patch("requests.get")
    def test_fetch_metadata_network_error(self, mock_get):
        """A RequestException from requests.get propagates to the caller."""
        mock_get.side_effect = requests.RequestException("down")
        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("12345678")

    # ---- OA API paths (cover lines ~77–87, 99–122) ----

    @patch("requests.get")
    def test_oa_api_xml_error_node_returns_empty(self, mock_get):
        """An <error> node in the OA response results in an empty string."""
        error_xml = (
            '<?xml version="1.0"?><OA><error code="idDoesNotExist">'
            "Invalid PMC ID</error></OA>"
        )
        mock_get.return_value = Mock(text=error_xml, raise_for_status=Mock())

        pdf_url = self.downloader.try_oa_api_public("PMC999999")

        self.assertEqual(pdf_url, "")

    @patch("requests.get")
    def test_oa_api_pdf_link_success(self, mock_get):
        """An https <link format="pdf"> href is returned by the OA helper."""
        link_xml = (
            '<?xml version="1.0"?><OA><records><record>'
            '<link format="pdf" '
            'href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1/pdf/a.pdf"/>'
            "</record></records></OA>"
        )
        mock_get.return_value = Mock(text=link_xml, raise_for_status=Mock())

        pdf_url = self.downloader.try_oa_api_public("PMC1")

        self.assertTrue(pdf_url.endswith("/PMC1/pdf/a.pdf"))

    @patch("requests.get")
    def test_oa_api_ftp_link_converts_to_https(self, mock_get):
        """An ftp:// href comes back rewritten onto the https base URL."""
        ftp_xml = (
            '<?xml version="1.0"?><OA><records><record>'
            '<link format="pdf" '
            'href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/a/b/c.pdf"/>'
            "</record></records></OA>"
        )
        mock_get.return_value = Mock(text=ftp_xml, raise_for_status=Mock())

        pdf_url = self.downloader.try_oa_api_public("PMC2")

        self.assertTrue(pdf_url.startswith("https://www.ncbi.nlm.nih.gov/pmc"))
        self.assertTrue(pdf_url.endswith("c.pdf"))

    @patch("requests.get")
    def test_oa_api_network_exception_returns_empty(self, mock_get):
        """A RequestException during the OA call yields an empty string."""
        mock_get.side_effect = requests.RequestException("net")

        pdf_url = self.downloader.try_oa_api_public("PMC3")

        self.assertEqual(pdf_url, "")
|
190
|
+
|
191
|
+
|
192
|
+
class TestPubmedDownloaderOAAPI(unittest.TestCase):
    """OA API lookups, including rewriting of FTP links to HTTPS."""

    def setUp(self):
        """Create a downloader with only the OA API and FTP/HTTPS bases set."""
        config = SimpleNamespace(
            id_converter_url="",
            oa_api_url="https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            europe_pmc_base_url="",
            pmc_page_base_url="",
            direct_pmc_pdf_base_url="",
            ftp_base_url="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            https_base_url="https://www.ncbi.nlm.nih.gov/pmc",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    @patch("requests.get")
    def test_try_oa_api_success(self, mock_get):
        """The OA API is queried by PMCID and the PDF href is returned."""
        oa_xml = """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<records>
<record>
<link format="pdf"
href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"/>
</record>
</records>
</OA>"""
        mock_get.return_value = Mock(text=oa_xml, raise_for_status=Mock())

        pdf_url = self.downloader.try_oa_api_public("PMC123456")

        query_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC123456"
        mock_get.assert_called_once_with(query_url, timeout=30)
        self.assertEqual(
            pdf_url,
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf",
        )
        self.assertIn("PMC123456", pdf_url)

    @patch("requests.get")
    def test_try_oa_api_error_response(self, mock_get):
        """An <error> element in the OA payload produces an empty string."""
        error_xml = """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<error code="idDoesNotExist">Invalid PMC ID</error>
</OA>"""
        mock_get.return_value = Mock(text=error_xml, raise_for_status=Mock())

        pdf_url = self.downloader.try_oa_api_public("PMC123456")

        self.assertEqual(pdf_url, "")

    @patch("requests.get")
    def test_try_oa_api_network_error(self, mock_get):
        """A RequestException from requests.get produces an empty string."""
        mock_get.side_effect = requests.RequestException("Network error")

        pdf_url = self.downloader.try_oa_api_public("PMC123456")

        self.assertEqual(pdf_url, "")

    def test_ftp_to_https_conversion(self):
        """An ftp:// PDF link is returned on the HTTPS base, not as FTP."""
        ftp_xml = """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<records>
<record>
<link format="pdf"
href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/test.pdf"/>
</record>
</records>
</OA>"""
        with patch("requests.get") as mock_get:
            mock_get.return_value = Mock(text=ftp_xml, raise_for_status=Mock())
            pdf_url = self.downloader.try_oa_api_public("PMC123456")

        self.assertTrue(pdf_url.startswith("https://www.ncbi.nlm.nih.gov/pmc"))
        self.assertNotIn("ftp://", pdf_url)
        self.assertIn("test.pdf", pdf_url)
|
275
|
+
|
276
|
+
|
277
|
+
class TestPubmedDownloaderEuropePMC(unittest.TestCase):
    """How the Europe PMC helper reacts to the outcome of its HEAD probe."""

    def setUp(self):
        """Create a downloader where only the Europe PMC base URL is set."""
        config = SimpleNamespace(
            id_converter_url="",
            oa_api_url="",
            europe_pmc_base_url="https://www.ebi.ac.uk/europepmc/webservices/rest",
            pmc_page_base_url="",
            direct_pmc_pdf_base_url="",
            ftp_base_url="",
            https_base_url="",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    @patch("requests.head")
    def test_try_europe_pmc_success(self, mock_head):
        """A 200 probe returns the very URL that was probed."""
        mock_head.return_value = Mock(status_code=200)

        pdf_url = self.downloader.try_europe_pmc_public("PMC123456")

        probe_url = (
            "https://www.ebi.ac.uk/europepmc/webservices/rest"
            "?accid=PMC123456&blobtype=pdf"
        )
        mock_head.assert_called_once_with(probe_url, timeout=30)
        self.assertEqual(pdf_url, probe_url)

    @patch("requests.head")
    def test_try_europe_pmc_not_found(self, mock_head):
        """A 404 probe returns an empty string."""
        mock_head.return_value = Mock(status_code=404)

        pdf_url = self.downloader.try_europe_pmc_public("PMC123456")

        self.assertEqual(pdf_url, "")

    @patch("requests.head")
    def test_try_europe_pmc_network_error(self, mock_head):
        """A RequestException during the probe returns an empty string."""
        mock_head.side_effect = requests.RequestException("Network error")

        pdf_url = self.downloader.try_europe_pmc_public("PMC123456")

        self.assertEqual(pdf_url, "")
|
322
|
+
|
323
|
+
|
324
|
+
class TestPubmedDownloaderPMCScrape(unittest.TestCase):
    """Extraction of the PDF URL from a fetched PMC article page."""

    def setUp(self):
        """Create a downloader where only the PMC page base URL is set."""
        config = SimpleNamespace(
            id_converter_url="",
            oa_api_url="",
            europe_pmc_base_url="",
            pmc_page_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            direct_pmc_pdf_base_url="",
            ftp_base_url="",
            https_base_url="",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    @patch("requests.get")
    def test_try_pmc_page_scraping_success(self, mock_get):
        """The citation_pdf_url meta tag's content is returned as the URL."""
        page_html = (
            '<html><head><meta name="citation_pdf_url" '
            'content="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf">'
            "</head></html>"
        )
        mock_get.return_value = Mock(
            content=page_html.encode(), raise_for_status=Mock()
        )

        pdf_url = self.downloader.try_pmc_page_scraping_public("PMC123456")

        # The page must be requested with the configured user agent and timeout.
        mock_get.assert_called_once_with(
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/",
            headers={"User-Agent": "Mozilla/5.0 (compatible; test-agent)"},
            timeout=30,
        )
        self.assertEqual(
            pdf_url, "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"
        )

    @patch("requests.get")
    def test_try_pmc_page_scraping_no_pdf(self, mock_get):
        """A page without the PDF meta tag yields an empty string."""
        empty_page = "<html><head></head></html>"
        mock_get.return_value = Mock(
            content=empty_page.encode(), raise_for_status=Mock()
        )

        pdf_url = self.downloader.try_pmc_page_scraping_public("PMC123456")

        self.assertEqual(pdf_url, "")

    @patch("requests.get")
    def test_try_pmc_page_scraping_network_error(self, mock_get):
        """A RequestException while fetching the page yields an empty string."""
        mock_get.side_effect = requests.RequestException("Network error")

        pdf_url = self.downloader.try_pmc_page_scraping_public("PMC123456")

        self.assertEqual(pdf_url, "")
|
380
|
+
|
381
|
+
|
382
|
+
class TestPubmedDownloaderDirectPMC(unittest.TestCase):
    """Probing of the direct PMC ``/pdf/`` URL."""

    def setUp(self):
        """Create a downloader where only the direct PMC PDF base URL is set."""
        config = SimpleNamespace(
            id_converter_url="",
            oa_api_url="",
            europe_pmc_base_url="",
            pmc_page_base_url="",
            direct_pmc_pdf_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            ftp_base_url="",
            https_base_url="",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    @patch("requests.head")
    def test_try_direct_pmc_url_success(self, mock_head):
        """A 200 probe returns the direct PDF URL that was probed."""
        mock_head.return_value = Mock(status_code=200)

        pdf_url = self.downloader.try_direct_pmc_url_public("PMC123456")

        direct_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/"
        mock_head.assert_called_once_with(direct_url, timeout=30)
        self.assertEqual(pdf_url, direct_url)

    @patch("requests.head")
    def test_try_direct_pmc_url_not_found(self, mock_head):
        """A 404 probe returns an empty string."""
        mock_head.return_value = Mock(status_code=404)

        pdf_url = self.downloader.try_direct_pmc_url_public("PMC123456")

        self.assertEqual(pdf_url, "")

    @patch("requests.head")
    def test_try_direct_pmc_url_exception(self, mock_head):
        """A RequestException returns "" after exactly one probe attempt."""
        mock_head.side_effect = requests.RequestException("Network error")

        pdf_url = self.downloader.try_direct_pmc_url_public("PMC123456")

        self.assertEqual(pdf_url, "")
        direct_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/"
        mock_head.assert_called_once_with(direct_url, timeout=30)
|
427
|
+
|
428
|
+
|
429
|
+
class TestPubmedDownloaderConstructAndFallbacks(unittest.TestCase):
    """URL construction and the fallback chain, driven via public wrappers."""

    def setUp(self):
        """Create a shim downloader with every endpoint URL populated."""
        config = SimpleNamespace(
            id_converter_url="https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            oa_api_url="https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            europe_pmc_base_url="https://www.ebi.ac.uk/europepmc/webservices/rest",
            pmc_page_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            direct_pmc_pdf_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            ftp_base_url="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            https_base_url="https://www.ncbi.nlm.nih.gov/pmc",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    def _stub(self, method_name, **mock_kwargs):
        """Return a patcher replacing ``method_name`` on this test's downloader."""
        return patch.object(self.downloader, method_name, **mock_kwargs)

    def test_construct_pdf_url_success(self):
        """construct_pdf_url hands the record's PMCID to the public chain."""
        metadata = {"records": [{"pmcid": "PMC123456", "doi": "10.1234/test"}]}
        with self._stub(
            "fetch_pdf_url_with_fallbacks_public", return_value="http://test.pdf"
        ) as chain_mock:
            pdf_url = self.downloader.construct_pdf_url(metadata, "12345678")

        self.assertEqual(pdf_url, "http://test.pdf")
        chain_mock.assert_called_once_with("PMC123456")

    def test_construct_pdf_url_no_records(self):
        """Metadata without a "records" key produces an empty string."""
        pdf_url = self.downloader.construct_pdf_url({}, "x")
        self.assertEqual(pdf_url, "")

    def test_construct_pdf_url_no_pmcid(self):
        """A record whose PMCID is "N/A" produces an empty string."""
        metadata = {"records": [{"pmcid": "N/A", "doi": "10.1/x"}]}
        pdf_url = self.downloader.construct_pdf_url(metadata, "x")
        self.assertEqual(pdf_url, "")

    def test_fetch_pdf_url_with_fallbacks_europe_pmc_success(self):
        """The chain stops at Europe PMC once it returns a URL."""
        with (
            self._stub("try_oa_api_public", return_value="") as oa_mock,
            self._stub(
                "try_europe_pmc_public", return_value="http://eu.pdf"
            ) as europe_mock,
            self._stub("try_pmc_page_scraping_public") as scrape_mock,
            self._stub("try_direct_pmc_url_public") as direct_mock,
        ):
            pdf_url = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(pdf_url, "http://eu.pdf")
        oa_mock.assert_called_once_with("PMC123456")
        europe_mock.assert_called_once_with("PMC123456")
        # Later sources must not be consulted after a hit.
        scrape_mock.assert_not_called()
        direct_mock.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_multiple_sources(self):
        """Page scraping supplies the URL after OA and Europe PMC come up empty."""
        with (
            self._stub("try_oa_api_public", return_value="") as oa_mock,
            self._stub("try_europe_pmc_public", return_value="") as europe_mock,
            self._stub(
                "try_pmc_page_scraping_public", return_value="http://test.pdf"
            ) as scrape_mock,
            self._stub("try_direct_pmc_url_public", return_value="") as direct_mock,
        ):
            pdf_url = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(pdf_url, "http://test.pdf")
        oa_mock.assert_called_once_with("PMC123456")
        europe_mock.assert_called_once_with("PMC123456")
        scrape_mock.assert_called_once_with("PMC123456")
        direct_mock.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_direct_pmc_success(self):
        """The direct PMC URL is used when the three earlier sources are empty."""
        with (
            self._stub("try_oa_api_public", return_value="") as oa_mock,
            self._stub("try_europe_pmc_public", return_value="") as europe_mock,
            self._stub("try_pmc_page_scraping_public", return_value="") as scrape_mock,
            self._stub(
                "try_direct_pmc_url_public", return_value="http://direct.pdf"
            ) as direct_mock,
        ):
            pdf_url = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(pdf_url, "http://direct.pdf")
        oa_mock.assert_called_once_with("PMC123456")
        europe_mock.assert_called_once_with("PMC123456")
        scrape_mock.assert_called_once_with("PMC123456")
        direct_mock.assert_called_once_with("PMC123456")

    def test_fetch_pdf_url_with_fallbacks_all_fail(self):
        """Every source is tried once and "" is returned when all are empty."""
        with (
            self._stub("try_oa_api_public", return_value="") as oa_mock,
            self._stub("try_europe_pmc_public", return_value="") as europe_mock,
            self._stub("try_pmc_page_scraping_public", return_value="") as scrape_mock,
            self._stub("try_direct_pmc_url_public", return_value="") as direct_mock,
        ):
            pdf_url = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(pdf_url, "")
        oa_mock.assert_called_once_with("PMC123456")
        europe_mock.assert_called_once_with("PMC123456")
        scrape_mock.assert_called_once_with("PMC123456")
        direct_mock.assert_called_once_with("PMC123456")

    def test_identifier_helper_wrappers(self):
        """Exercise the identifier-info and service-identifier wrappers."""
        paper = {"PMID": "12345678", "PMCID": "PMC9"}
        description = self.downloader.get_paper_identifier_info_public(paper)
        self.assertIn("PMID: 12345678", description)
        self.assertIn("PMCID: PMC9", description)

        # The helper fills the passed-in dict in place.
        record = {}
        self.downloader.add_service_identifier_public(record, "11122233")
        expected_record = {
            "PMID": "11122233",
            "PMCID": "N/A",
            "DOI": "N/A",
            "Journal": "N/A",
        }
        self.assertEqual(record, expected_record)
|
565
|
+
|
566
|
+
|
567
|
+
class TestPubmedDownloaderIntegration(unittest.TestCase):
    """End-to-end flows: fetch metadata, then resolve the PDF URL."""

    def setUp(self):
        """Create a shim downloader with every endpoint URL populated."""
        config = SimpleNamespace(
            id_converter_url="https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            oa_api_url="https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            europe_pmc_base_url="https://www.ebi.ac.uk/europepmc/webservices/rest",
            pmc_page_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            direct_pmc_pdf_base_url="https://www.ncbi.nlm.nih.gov/pmc/articles",
            ftp_base_url="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            https_base_url="https://www.ncbi.nlm.nih.gov/pmc",
            user_agent="Mozilla/5.0 (compatible; test-agent)",
            request_timeout=30,
            chunk_size=8192,
        )
        self.downloader = PubmedDownloaderTestShim(config)

    @patch("requests.get")
    def test_full_workflow_pmid_to_pdf(self, mock_get):
        """A PMID resolves to a PDF URL using one ID lookup and one OA call."""
        id_payload = {
            "records": [
                {"pmid": "12345678", "pmcid": "PMC123456", "doi": "10.1234/test"}
            ]
        }
        metadata_response = Mock(
            raise_for_status=Mock(), **{"json.return_value": id_payload}
        )
        oa_xml = """<?xml version="1.0" encoding="UTF-8"?>
<OA><records><record>
<link format="pdf"
href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"/>
</record></records></OA>"""
        oa_response = Mock(text=oa_xml, raise_for_status=Mock())

        # URL fragment -> canned response; checked in insertion order.
        routes = {"idconv": metadata_response, "oa.fcgi": oa_response}

        def route_request(url, *_, **__):
            for fragment, response in routes.items():
                if fragment in url:
                    return response
            return None

        mock_get.side_effect = route_request

        pmid = "12345678"
        metadata = self.downloader.fetch_metadata(pmid)
        pdf_url = self.downloader.construct_pdf_url(metadata, pmid)

        first_record = metadata["records"][0]
        self.assertEqual(first_record["pmid"], "12345678")
        self.assertEqual(first_record["pmcid"], "PMC123456")
        self.assertIn("PMC123456", pdf_url)
        self.assertTrue(pdf_url.startswith("https://"))
        self.assertEqual(mock_get.call_count, 2)
        id_call, oa_call = mock_get.call_args_list
        self.assertIn("idconv", id_call.args[0])
        self.assertIn("oa.fcgi", oa_call.args[0])

        # Also exercise the router's fall-through branch for unknown URLs.
        self.assertIsNone(route_request("https://unknown-api.com/test"))

    @patch("requests.get")
    def test_workflow_with_fallback_sources(self, mock_get):
        """Page scraping supplies the URL when OA errors and the HEAD probe 404s."""
        id_payload = {
            "records": [
                {"pmid": "12345678", "pmcid": "PMC123456", "doi": "10.1234/test"}
            ]
        }
        metadata_response = Mock(
            raise_for_status=Mock(), **{"json.return_value": id_payload}
        )
        oa_error_xml = """<?xml version="1.0" encoding="UTF-8"?>
<OA><error code="idDoesNotExist">Invalid PMC ID</error></OA>"""
        oa_response = Mock(text=oa_error_xml, raise_for_status=Mock())
        page_html = (
            '<html><head><meta name="citation_pdf_url" '
            'content="https://www.ncbi.nlm.nih.gov/pmc/articles/'
            'PMC123456/pdf/fallback.pdf"></head></html>'
        )
        scrape_response = Mock(content=page_html.encode(), raise_for_status=Mock())

        # requests.get is consumed in call order: ID lookup, OA API, page fetch.
        mock_get.side_effect = [metadata_response, oa_response, scrape_response]

        with patch("requests.head") as mock_head:
            mock_head.return_value.status_code = 404
            pmid = "12345678"
            metadata = self.downloader.fetch_metadata(pmid)
            pdf_url = self.downloader.construct_pdf_url(metadata, pmid)

        self.assertEqual(
            pdf_url,
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/fallback.pdf",
        )
        self.assertEqual(mock_get.call_count, 3)
        mock_head.assert_called_once()
|
668
|
+
|
669
|
+
|
670
|
+
class TestPubmedDownloaderOAAPINoLink(unittest.TestCase):
    """OA API replies that parse cleanly but advertise no PDF link."""

    def setUp(self):
        """Build a shim downloader; only the OA, FTP and HTTPS URLs are set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @patch("requests.get")
    def test_oa_api_ok_but_no_pdf_link_returns_empty(self, mock_get):
        """A reply whose only ``<link>`` has format ``tgz`` yields an empty string."""
        tgz_only_reply = Mock(raise_for_status=Mock())
        tgz_only_reply.text = """<?xml version="1.0"?>
<OA><records><record><link format="tgz" href="https://x/y.tgz"/></record></records></OA>"""
        mock_get.return_value = tgz_only_reply

        self.assertEqual(self.downloader.try_oa_api_public("PMCNOPOINTER"), "")
|
699
|
+
|
700
|
+
|
701
|
+
class TestPubmedDownloaderExtractMetadata(unittest.TestCase):
    """Behaviour of ``extract_paper_metadata`` for each kind of input."""

    def setUp(self):
        """Build a shim downloader with blank URLs; these tests pass metadata in directly."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "",
            "https_base_url": "",
            "user_agent": "ua",
            "request_timeout": 5,
            "chunk_size": 1024,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    def test_extract_metadata_raises_when_no_records(self):
        """Metadata without any records is rejected with ``RuntimeError``."""
        self.assertRaises(
            RuntimeError, self.downloader.extract_paper_metadata, {}, "123", None
        )

    def test_extract_metadata_with_pdf_result(self):
        """A ``(path, filename)`` PDF result is reflected in the output fields."""
        record_set = {"records": [{"pmcid": "PMC1", "doi": "10.1/x"}]}
        downloaded = ("/tmp/file.pdf", "custom.pdf")

        out = self.downloader.extract_paper_metadata(
            record_set, "12345678", downloaded
        )

        expected_fields = {
            "access_type": "open_access_downloaded",
            "URL": "/tmp/file.pdf",
            "pdf_url": "/tmp/file.pdf",
            "filename": "custom.pdf",
            "PMCID": "PMC1",
            "PMID": "12345678",
        }
        for field, expected in expected_fields.items():
            self.assertEqual(out[field], expected)

    def test_extract_metadata_without_pdf_with_pmcid(self):
        """No PDF but a PMCID present: abstract-only entry with blank URLs."""
        record_set = {"records": [{"pmcid": "PMC9", "doi": "10.1/x"}]}

        out = self.downloader.extract_paper_metadata(record_set, "42", None)

        expected_fields = {
            "access_type": "abstract_only",
            "filename": "pmid_42.pdf",
            "URL": "",
            "pdf_url": "",
        }
        for field, expected in expected_fields.items():
            self.assertEqual(out[field], expected)

    def test_extract_metadata_without_pdf_no_pmcid(self):
        """No PDF and a PMCID of ``"N/A"``: the entry is flagged ``no_pmcid``."""
        record_set = {"records": [{"pmcid": "N/A", "doi": "10.1/x"}]}

        out = self.downloader.extract_paper_metadata(record_set, "42", None)

        self.assertEqual(out["access_type"], "no_pmcid")
        self.assertEqual(out["filename"], "pmid_42.pdf")
|
752
|
+
|
753
|
+
|
754
|
+
class TestPubmedDownloaderHelpers(unittest.TestCase):
    """Naming helpers, snippet handling and identifier formatting."""

    def setUp(self):
        """Build a shim downloader with blank URLs and small limits."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "",
            "https_base_url": "",
            "user_agent": "ua",
            "request_timeout": 5,
            "chunk_size": 1024,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    def test_service_and_identifier_names_and_default_filename(self):
        """Service name, identifier name and default filename are as expected."""
        downloader = self.downloader
        self.assertEqual(downloader.get_service_name(), "PubMed")
        self.assertEqual(downloader.get_identifier_name(), "PMID")
        self.assertEqual(downloader.get_default_filename("777"), "pmid_777.pdf")

    def test_get_snippet_placeholders_return_empty(self):
        """Empty and placeholder abstracts all produce an empty snippet."""
        placeholders = ("", "N/A", "Abstract available in PubMed")
        for abstract in placeholders:
            self.assertEqual(self.downloader.get_snippet(abstract), "")

    def test_get_snippet_non_placeholder_delegates_to_base(self):
        """A real abstract is handed to ``BasePaperDownloader.get_snippet``."""
        with patch.object(
            BasePaperDownloader, "get_snippet", return_value="SNIP"
        ) as base_snippet:
            snippet = self.downloader.get_snippet("Real abstract text")

        # The mock keeps its call record after the patch is undone.
        base_snippet.assert_called_once_with("Real abstract text")
        self.assertEqual(snippet, "SNIP")

    def test_get_paper_identifier_info_without_pmcid_line(self):
        """With PMCID ``"N/A"`` the info shows the PMID and no ``PMCID:`` text."""
        paper = {"PMID": "999", "PMCID": "N/A"}

        info = self.downloader.get_paper_identifier_info_public(paper)

        self.assertIn("(PMID: 999)", info)
        self.assertNotIn("PMCID:", info)
|
801
|
+
|
802
|
+
|
803
|
+
class TestPubmedDownloaderMissingLineCoverage(unittest.TestCase):
    """Guard clauses of ``construct_pdf_url`` and the PDF-URL fallback chain.

    Written to reach previously uncovered lines 77-87 and 99-122.
    """

    # Import path of the logger object replaced in the logging assertions.
    _LOGGER_TARGET = (
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "pubmed_downloader.logger"
    )
    # PMCID handed to the fallback chain in every strategy test.
    _PMCID = "PMC123456"

    def setUp(self):
        """Build a shim downloader configured with every service URL."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    def _start_patch(self, patcher):
        """Start ``patcher``, undo it at test cleanup, and return its mock."""
        mock = patcher.start()
        self.addCleanup(patcher.stop)
        return mock

    def _stub_strategy(self, method_name, **mock_config):
        """Replace one strategy method on this test's downloader with a mock."""
        return self._start_patch(
            patch.object(self.downloader, method_name, **mock_config)
        )

    def _resolve_pdf_url(self):
        """Run the fallback chain through the shim's public wrapper."""
        return self.downloader.fetch_pdf_url_with_fallbacks_production(self._PMCID)

    def test_construct_pdf_url_empty_records_list(self):
        """An empty ``records`` list gives an empty URL (targets lines 77-78)."""
        self.assertEqual(
            self.downloader.construct_pdf_url({"records": []}, "12345678"), ""
        )

    def test_construct_pdf_url_missing_records_key(self):
        """Metadata lacking ``records`` gives an empty URL (targets lines 77-78)."""
        self.assertEqual(
            self.downloader.construct_pdf_url({"other_key": "value"}, "12345678"),
            "",
        )

    def test_construct_pdf_url_empty_pmcid_string(self):
        """An empty ``pmcid`` value gives an empty URL (targets lines 83-85)."""
        record_set = {"records": [{"pmcid": "", "doi": "10.1/x"}]}
        self.assertEqual(self.downloader.construct_pdf_url(record_set, "12345678"), "")

    def test_construct_pdf_url_missing_pmcid_key(self):
        """A record without ``pmcid`` gives an empty URL (targets lines 81, 83-85)."""
        record_set = {"records": [{"doi": "10.1/x"}]}
        self.assertEqual(self.downloader.construct_pdf_url(record_set, "12345678"), "")

    @patch.object(PubmedDownloaderTestShim, "_fetch_pdf_url_with_fallbacks")
    def test_fetch_pdf_url_with_fallbacks_logging_and_return(self, mock_fallbacks):
        """The public wrapper forwards the PMCID and returns the chain's value.

        ``_fetch_pdf_url_with_fallbacks`` is mocked on the shim class, so only
        the forwarding through the wrapper is checked here.
        """
        mock_fallbacks.return_value = "http://test.pdf"

        result = self._resolve_pdf_url()

        mock_fallbacks.assert_called_once_with("PMC123456")
        self.assertEqual(result, "http://test.pdf")

    def test_fetch_pdf_url_with_fallbacks_all_fail_with_logging(self):
        """All four strategies return "": result is "" and one warning is logged."""
        for strategy in (
            "_try_oa_api",
            "_try_europe_pmc",
            "_try_pmc_page_scraping",
            "_try_direct_pmc_url",
        ):
            self._stub_strategy(strategy, return_value="")
        mock_logger = self._start_patch(patch(self._LOGGER_TARGET))

        result = self._resolve_pdf_url()

        self.assertEqual(result, "")
        mock_logger.warning.assert_called_once_with(
            "All PDF URL strategies failed for PMCID: %s", "PMC123456"
        )

    def test_fetch_pdf_url_with_fallbacks_oa_api_success_early_return(self):
        """An OA API hit is returned at once; no later strategy is called."""
        mock_oa = self._stub_strategy("_try_oa_api", return_value="http://oa.pdf")
        mock_eu = self._stub_strategy("_try_europe_pmc")
        mock_scr = self._stub_strategy("_try_pmc_page_scraping")
        mock_dir = self._stub_strategy("_try_direct_pmc_url")
        mock_logger = self._start_patch(patch(self._LOGGER_TARGET))

        result = self._resolve_pdf_url()

        self.assertEqual(result, "http://oa.pdf")
        mock_oa.assert_called_once_with("PMC123456")
        for untouched in (mock_eu, mock_scr, mock_dir):
            untouched.assert_not_called()
        # assert_called_with compares against the most recent info() call.
        mock_logger.info.assert_called_with(
            "Fetching PDF URL for PMCID: %s", "PMC123456"
        )

    def test_fetch_pdf_url_with_fallbacks_europe_pmc_success_after_oa_fail(self):
        """Europe PMC supplies the URL once the OA API has returned ""."""
        mock_oa = self._stub_strategy("_try_oa_api", return_value="")
        mock_eu = self._stub_strategy("_try_europe_pmc", return_value="http://eu.pdf")
        mock_scr = self._stub_strategy("_try_pmc_page_scraping")
        mock_dir = self._stub_strategy("_try_direct_pmc_url")

        result = self._resolve_pdf_url()

        self.assertEqual(result, "http://eu.pdf")
        for attempted in (mock_oa, mock_eu):
            attempted.assert_called_once_with("PMC123456")
        for untouched in (mock_scr, mock_dir):
            untouched.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_pmc_scraping_success_after_previous_fail(
        self,
    ):
        """Page scraping supplies the URL after the first two strategies return ""."""
        mock_oa = self._stub_strategy("_try_oa_api", return_value="")
        mock_eu = self._stub_strategy("_try_europe_pmc", return_value="")
        mock_scr = self._stub_strategy(
            "_try_pmc_page_scraping", return_value="http://scr.pdf"
        )
        mock_dir = self._stub_strategy("_try_direct_pmc_url")

        result = self._resolve_pdf_url()

        self.assertEqual(result, "http://scr.pdf")
        for attempted in (mock_oa, mock_eu, mock_scr):
            attempted.assert_called_once_with("PMC123456")
        mock_dir.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_direct_pmc_success_last_resort(self):
        """The direct PMC URL is used after the other three strategies return ""."""
        mock_oa = self._stub_strategy("_try_oa_api", return_value="")
        mock_eu = self._stub_strategy("_try_europe_pmc", return_value="")
        mock_scr = self._stub_strategy("_try_pmc_page_scraping", return_value="")
        mock_dir = self._stub_strategy(
            "_try_direct_pmc_url", return_value="http://dir.pdf"
        )

        result = self._resolve_pdf_url()

        self.assertEqual(result, "http://dir.pdf")
        for attempted in (mock_oa, mock_eu, mock_scr, mock_dir):
            attempted.assert_called_once_with("PMC123456")
|
978
|
+
|
979
|
+
|
980
|
+
class TestPubmedDownloaderProductionConstructPdfUrl(unittest.TestCase):
|
981
|
+
"""Test production construct_pdf_url method to hit the actual lines 77-87."""
|
982
|
+
|
983
|
+
def setUp(self):
    """Build a ``PubmedDownloader`` configured with every service URL."""
    settings = {
        "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
        "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
        "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
        "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
        "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
        "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
        "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
        "user_agent": "Mozilla/5.0 (compatible; test-agent)",
        "request_timeout": 30,
        "chunk_size": 8192,
    }
    # Deliberately the production class rather than the test shim.
    self.downloader = PubmedDownloader(SimpleNamespace(**settings))
|
998
|
+
|
999
|
+
def test_production_construct_pdf_url_no_records_key(self):
    """Metadata lacking ``records`` gives an empty URL (targets lines 77-78)."""
    self.assertEqual(
        self.downloader.construct_pdf_url({"other_key": "value"}, "12345678"),
        "",
    )
|
1004
|
+
|
1005
|
+
def test_production_construct_pdf_url_empty_records_list(self):
    """An empty ``records`` list gives an empty URL (targets lines 77-78)."""
    self.assertEqual(
        self.downloader.construct_pdf_url({"records": []}, "12345678"),
        "",
    )
|
1010
|
+
|
1011
|
+
def test_production_construct_pdf_url_missing_pmcid_key(self):
    """A record without ``pmcid`` gives "" and logs the no-PMCID message once."""
    logger_target = (
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "pubmed_downloader.logger"
    )
    record_set = {"records": [{"doi": "10.1/x"}]}

    with patch(logger_target) as mock_logger:
        pdf_url = self.downloader.construct_pdf_url(record_set, "12345678")

    self.assertEqual(pdf_url, "")
    # The mock keeps its call record after the patch is undone.
    mock_logger.info.assert_called_once_with(
        "No PMCID available for PDF fetch: PMID %s", "12345678"
    )
|
1025
|
+
|
1026
|
+
def test_production_construct_pdf_url_empty_pmcid(self):
    """An empty ``pmcid`` gives "" and logs the no-PMCID message once."""
    logger_target = (
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "pubmed_downloader.logger"
    )
    record_set = {"records": [{"pmcid": "", "doi": "10.1/x"}]}

    with patch(logger_target) as mock_logger:
        pdf_url = self.downloader.construct_pdf_url(record_set, "12345678")

    self.assertEqual(pdf_url, "")
    # The mock keeps its call record after the patch is undone.
    mock_logger.info.assert_called_once_with(
        "No PMCID available for PDF fetch: PMID %s", "12345678"
    )
|
1040
|
+
|
1041
|
+
def test_production_construct_pdf_url_na_pmcid(self):
    """A ``pmcid`` of ``"N/A"`` gives "" and logs the no-PMCID message once."""
    logger_target = (
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "pubmed_downloader.logger"
    )
    record_set = {"records": [{"pmcid": "N/A", "doi": "10.1/x"}]}

    with patch(logger_target) as mock_logger:
        pdf_url = self.downloader.construct_pdf_url(record_set, "12345678")

    self.assertEqual(pdf_url, "")
    # The mock keeps its call record after the patch is undone.
    mock_logger.info.assert_called_once_with(
        "No PMCID available for PDF fetch: PMID %s", "12345678"
    )
|
1055
|
+
|
1056
|
+
def test_production_construct_pdf_url_valid_pmcid_calls_fallbacks(self):
    """A usable PMCID is passed to the fallback chain and its URL returned."""
    record_set = {"records": [{"pmcid": "PMC123456", "doi": "10.1/x"}]}
    fallback_patch = patch.object(
        self.downloader,
        "_fetch_pdf_url_with_fallbacks",
        return_value="http://test.pdf",
    )

    with fallback_patch as mock_fallbacks:
        pdf_url = self.downloader.construct_pdf_url(record_set, "12345678")

    self.assertEqual(pdf_url, "http://test.pdf")
    # The mock keeps its call record after the patch is undone.
    mock_fallbacks.assert_called_once_with("PMC123456")
|