aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1067 @@
1
+ """
2
+ Unit tests for PubmedDownloader.
3
+ Tests PMID to PMCID conversion, XML parsing, and PDF URL extraction
4
+ from multiple sources. Uses a public shim to avoid accessing protected
5
+ members in tests.
6
+ """
7
+
8
+ # pylint: disable=too-many-lines
9
+ import unittest
10
+ from types import SimpleNamespace
11
+ from unittest.mock import Mock, patch
12
+
13
+ import requests
14
+
15
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.pubmed_downloader import (
16
+ BasePaperDownloader,
17
+ PubmedDownloader,
18
+ )
19
+
20
+
21
class PubmedDownloaderTestShim(PubmedDownloader):
    """Test-only subclass that re-exports protected helpers under public names.

    Every ``*_public`` / ``*_production`` method delegates straight to the
    underscore-prefixed method on the parent class, so the tests never have
    to touch protected members directly.
    """

    __test__ = False  # prevent pytest from collecting it as a test

    # ---- public delegates for the individual PDF-URL sources ----

    def try_oa_api_public(self, pmcid: str) -> str:
        """Delegate to ``_try_oa_api``."""
        return self._try_oa_api(pmcid)

    def try_europe_pmc_public(self, pmcid: str) -> str:
        """Delegate to ``_try_europe_pmc``."""
        return self._try_europe_pmc(pmcid)

    def try_pmc_page_scraping_public(self, pmcid: str) -> str:
        """Delegate to ``_try_pmc_page_scraping``."""
        return self._try_pmc_page_scraping(pmcid)

    def try_direct_pmc_url_public(self, pmcid: str) -> str:
        """Delegate to ``_try_direct_pmc_url``."""
        return self._try_direct_pmc_url(pmcid)

    def fetch_pdf_url_with_fallbacks_public(self, pmcid: str) -> str:
        """Return the first non-empty URL from the public wrappers, else ``""``.

        The wrappers are consulted lazily in the order OA API, Europe PMC,
        PMC page scraping, direct PMC URL; later sources are not called once
        an earlier one yields a truthy result.
        """
        lookups = (
            self.try_oa_api_public,
            self.try_europe_pmc_public,
            self.try_pmc_page_scraping_public,
            self.try_direct_pmc_url_public,
        )
        # Generator keeps evaluation lazy: each lookup runs only when reached.
        candidates = (lookup(pmcid) for lookup in lookups)
        return next((found for found in candidates if found), "")

    # IMPORTANT: override to use the shim's public chain so tests can patch it
    def construct_pdf_url(self, metadata, identifier):  # same signature
        """Resolve a PDF URL through the shim's public fallback chain.

        Returns ``""`` when *metadata* has no (or empty) ``"records"``, or when
        the first record's ``"pmcid"`` is missing, falsy, or ``"N/A"``.
        """
        records = metadata["records"] if "records" in metadata else None
        if not records:
            return ""
        pmcid = records[0].get("pmcid", "")
        if pmcid and pmcid != "N/A":
            return self.fetch_pdf_url_with_fallbacks_public(pmcid)
        return ""

    # ---- public delegates for identifier helpers and the production chain ----

    def get_paper_identifier_info_public(self, paper):
        """Delegate to ``_get_paper_identifier_info``."""
        return self._get_paper_identifier_info(paper)

    def add_service_identifier_public(self, entry, identifier):
        """Delegate to ``_add_service_identifier``."""
        return self._add_service_identifier(entry, identifier)

    def fetch_pdf_url_with_fallbacks_production(self, pmcid: str) -> str:
        """Delegate to the production ``_fetch_pdf_url_with_fallbacks``."""
        return self._fetch_pdf_url_with_fallbacks(pmcid)
78
+
79
+
80
class TestPubmedDownloaderBasics(unittest.TestCase):
    """Metadata lookup and OA API parsing checks that run the production code."""

    def setUp(self):
        """Create a shim downloader backed by a fully populated config."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @staticmethod
    def _json_reply(payload):
        """Return a mock response whose ``json()`` yields *payload*."""
        reply = Mock()
        reply.json.return_value = payload
        reply.raise_for_status = Mock()
        return reply

    @staticmethod
    def _xml_reply(body):
        """Return a mock response whose ``text`` is *body*."""
        reply = Mock()
        reply.text = body
        reply.raise_for_status = Mock()
        return reply

    def test_initialization(self):
        """Config URLs are exposed as attributes on the downloader."""
        self.assertIn("idconv", self.downloader.id_converter_url)
        self.assertIn("oa.fcgi", self.downloader.oa_api_url)

    @patch("requests.get")
    def test_fetch_metadata_success(self, mock_get):
        """A populated records list is returned unchanged after one GET."""
        mock_get.return_value = self._json_reply(
            {"records": [{"pmid": "12345678", "pmcid": "PMC123456", "doi": "10.1/x"}]}
        )

        metadata = self.downloader.fetch_metadata("12345678")

        mock_get.assert_called_once()
        self.assertIn("records", metadata)
        self.assertEqual(metadata["records"][0]["pmcid"], "PMC123456")

    @patch("requests.get")
    def test_fetch_metadata_no_records(self, mock_get):
        """An empty records list makes fetch_metadata raise RuntimeError."""
        mock_get.return_value = self._json_reply({"records": []})
        with self.assertRaises(RuntimeError):
            self.downloader.fetch_metadata("12345678")

    @patch("requests.get")
    def test_fetch_metadata_network_error(self, mock_get):
        """A RequestException from requests.get propagates to the caller."""
        mock_get.side_effect = requests.RequestException("down")
        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("12345678")

    # ---- OA API paths (cover lines ~77–87, 99–122) ----

    @patch("requests.get")
    def test_oa_api_xml_error_node_returns_empty(self, mock_get):
        """An <error> node in the OA response yields an empty string."""
        mock_get.return_value = self._xml_reply(
            '<?xml version="1.0"?><OA><error code="idDoesNotExist">'
            "Invalid PMC ID</error></OA>"
        )

        url = self.downloader.try_oa_api_public("PMC999999")

        self.assertEqual(url, "")

    @patch("requests.get")
    def test_oa_api_pdf_link_success(self, mock_get):
        """A <link format='pdf'> with an https href is returned as the URL."""
        mock_get.return_value = self._xml_reply(
            '<?xml version="1.0"?><OA><records><record>'
            '<link format="pdf" '
            'href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1/pdf/a.pdf"/>'
            "</record></records></OA>"
        )

        url = self.downloader.try_oa_api_public("PMC1")

        self.assertTrue(url.endswith("/PMC1/pdf/a.pdf"))

    @patch("requests.get")
    def test_oa_api_ftp_link_converts_to_https(self, mock_get):
        """An ftp:// href comes back rewritten onto the https base URL."""
        mock_get.return_value = self._xml_reply(
            '<?xml version="1.0"?><OA><records><record>'
            '<link format="pdf" '
            'href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/a/b/c.pdf"/>'
            "</record></records></OA>"
        )

        url = self.downloader.try_oa_api_public("PMC2")

        self.assertTrue(url.startswith("https://www.ncbi.nlm.nih.gov/pmc"))
        self.assertTrue(url.endswith("c.pdf"))

    @patch("requests.get")
    def test_oa_api_network_exception_returns_empty(self, mock_get):
        """A RequestException during the OA call yields an empty string."""
        mock_get.side_effect = requests.RequestException("net")
        url = self.downloader.try_oa_api_public("PMC3")
        self.assertEqual(url, "")
190
+
191
+
192
class TestPubmedDownloaderOAAPI(unittest.TestCase):
    """OA API lookups, including rewriting FTP links to HTTPS."""

    def setUp(self):
        """Create a shim downloader with only the OA/FTP/HTTPS URLs set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @staticmethod
    def _xml_reply(body):
        """Return a mock response whose ``text`` is *body*."""
        reply = Mock()
        reply.text = body
        reply.raise_for_status = Mock()
        return reply

    @patch("requests.get")
    def test_try_oa_api_success(self, mock_get):
        """The href of the pdf link is returned and the OA URL is queried once."""
        pdf_href = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"
        mock_get.return_value = self._xml_reply(
            """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<records>
<record>
<link format="pdf"
href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"/>
</record>
</records>
</OA>"""
        )

        found = self.downloader.try_oa_api_public("PMC123456")

        mock_get.assert_called_once_with(
            "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC123456",
            timeout=30,
        )
        self.assertEqual(found, pdf_href)
        self.assertIn("PMC123456", found)

    @patch("requests.get")
    def test_try_oa_api_error_response(self, mock_get):
        """An <error> element in the response yields an empty string."""
        mock_get.return_value = self._xml_reply(
            """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<error code="idDoesNotExist">Invalid PMC ID</error>
</OA>"""
        )
        self.assertEqual(self.downloader.try_oa_api_public("PMC123456"), "")

    @patch("requests.get")
    def test_try_oa_api_network_error(self, mock_get):
        """A RequestException during the lookup yields an empty string."""
        mock_get.side_effect = requests.RequestException("Network error")
        self.assertEqual(self.downloader.try_oa_api_public("PMC123456"), "")

    @patch("requests.get")
    def test_ftp_to_https_conversion(self, mock_get):
        """An ftp:// pdf link is returned as an https:// URL for the same file."""
        mock_get.return_value = self._xml_reply(
            """<?xml version="1.0" encoding="UTF-8"?>
<OA>
<records>
<record>
<link format="pdf"
href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/test.pdf"/>
</record>
</records>
</OA>"""
        )

        found = self.downloader.try_oa_api_public("PMC123456")

        self.assertTrue(found.startswith("https://www.ncbi.nlm.nih.gov/pmc"))
        self.assertNotIn("ftp://", found)
        self.assertIn("test.pdf", found)
275
+
276
+
277
class TestPubmedDownloaderEuropePMC(unittest.TestCase):
    """Availability probing of Europe PMC PDF links via HEAD requests."""

    def setUp(self):
        """Create a shim downloader with only the Europe PMC base URL set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "",
            "https_base_url": "",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @patch("requests.head")
    def test_try_europe_pmc_success(self, mock_head):
        """A 200 response returns the probed URL, requested once with the timeout."""
        mock_head.return_value = Mock(status_code=200)
        probe_url = (
            "https://www.ebi.ac.uk/europepmc/webservices/rest"
            "?accid=PMC123456&blobtype=pdf"
        )

        found = self.downloader.try_europe_pmc_public("PMC123456")

        mock_head.assert_called_once_with(probe_url, timeout=30)
        self.assertEqual(found, probe_url)

    @patch("requests.head")
    def test_try_europe_pmc_not_found(self, mock_head):
        """A 404 response yields an empty string."""
        mock_head.return_value = Mock(status_code=404)
        found = self.downloader.try_europe_pmc_public("PMC123456")
        self.assertEqual(found, "")

    @patch("requests.head")
    def test_try_europe_pmc_network_error(self, mock_head):
        """A RequestException during the probe yields an empty string."""
        mock_head.side_effect = requests.RequestException("Network error")
        found = self.downloader.try_europe_pmc_public("PMC123456")
        self.assertEqual(found, "")
322
+
323
+
324
class TestPubmedDownloaderPMCScrape(unittest.TestCase):
    """Extraction of the PDF URL from a PMC article page."""

    def setUp(self):
        """Create a shim downloader with only the PMC page base URL set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "",
            "https_base_url": "",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @staticmethod
    def _page_reply(markup):
        """Return a mock response whose ``content`` is *markup* encoded to bytes."""
        reply = Mock()
        reply.content = markup.encode()
        reply.raise_for_status = Mock()
        return reply

    @patch("requests.get")
    def test_try_pmc_page_scraping_success(self, mock_get):
        """The citation_pdf_url meta tag's content is returned as the URL."""
        pdf_href = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"
        mock_get.return_value = self._page_reply(
            '<html><head><meta name="citation_pdf_url" '
            'content="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf">'
            "</head></html>"
        )

        found = self.downloader.try_pmc_page_scraping_public("PMC123456")

        # The article page must be fetched once, with the configured UA and timeout.
        mock_get.assert_called_once_with(
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/",
            headers={"User-Agent": "Mozilla/5.0 (compatible; test-agent)"},
            timeout=30,
        )
        self.assertEqual(found, pdf_href)

    @patch("requests.get")
    def test_try_pmc_page_scraping_no_pdf(self, mock_get):
        """A page without the meta tag yields an empty string."""
        mock_get.return_value = self._page_reply("<html><head></head></html>")
        found = self.downloader.try_pmc_page_scraping_public("PMC123456")
        self.assertEqual(found, "")

    @patch("requests.get")
    def test_try_pmc_page_scraping_network_error(self, mock_get):
        """A RequestException while fetching the page yields an empty string."""
        mock_get.side_effect = requests.RequestException("Network error")
        found = self.downloader.try_pmc_page_scraping_public("PMC123456")
        self.assertEqual(found, "")
380
+
381
+
382
class TestPubmedDownloaderDirectPMC(unittest.TestCase):
    """Probing of the direct PMC ``/pdf/`` URL via HEAD requests."""

    PDF_URL = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/"

    def setUp(self):
        """Create a shim downloader with only the direct PMC PDF base URL set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "",
            "https_base_url": "",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @patch("requests.head")
    def test_try_direct_pmc_url_success(self, mock_head):
        """A 200 response returns the probed URL, requested once with the timeout."""
        mock_head.return_value = Mock(status_code=200)

        found = self.downloader.try_direct_pmc_url_public("PMC123456")

        mock_head.assert_called_once_with(self.PDF_URL, timeout=30)
        self.assertEqual(found, self.PDF_URL)

    @patch("requests.head")
    def test_try_direct_pmc_url_not_found(self, mock_head):
        """A 404 response yields an empty string."""
        mock_head.return_value = Mock(status_code=404)
        found = self.downloader.try_direct_pmc_url_public("PMC123456")
        self.assertEqual(found, "")

    @patch("requests.head")
    def test_try_direct_pmc_url_exception(self, mock_head):
        """A RequestException yields an empty string after exactly one probe."""
        mock_head.side_effect = requests.RequestException("Network error")

        found = self.downloader.try_direct_pmc_url_public("PMC123456")

        self.assertEqual(found, "")
        mock_head.assert_called_once_with(self.PDF_URL, timeout=30)
427
+
428
+
429
class TestPubmedDownloaderConstructAndFallbacks(unittest.TestCase):
    """construct_pdf_url and the fallback chain, driven through public wrappers."""

    # Public wrapper names in the order the shim's fallback chain consults them.
    CHAIN = (
        "try_oa_api_public",
        "try_europe_pmc_public",
        "try_pmc_page_scraping_public",
        "try_direct_pmc_url_public",
    )

    def setUp(self):
        """Create a shim downloader backed by a fully populated config."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    def _stub_chain(self, oa, europe, scrape, direct):
        """Patch the four public wrappers to return the given values.

        The patches stay active until test teardown (undone via addCleanup).
        Returns the four mocks in fallback order.
        """
        stubs = []
        for attr, value in zip(self.CHAIN, (oa, europe, scrape, direct)):
            patcher = patch.object(self.downloader, attr, return_value=value)
            stubs.append(patcher.start())
            self.addCleanup(patcher.stop)
        return stubs

    def test_construct_pdf_url_success(self):
        """construct_pdf_url hands the record's PMCID to the public fallback chain."""
        metadata = {"records": [{"pmcid": "PMC123456", "doi": "10.1234/test"}]}
        with patch.object(
            self.downloader,
            "fetch_pdf_url_with_fallbacks_public",
            return_value="http://test.pdf",
        ) as chain:
            found = self.downloader.construct_pdf_url(metadata, "12345678")

        self.assertEqual(found, "http://test.pdf")
        chain.assert_called_once_with("PMC123456")

    def test_construct_pdf_url_no_records(self):
        """Metadata without a records key yields an empty string."""
        found = self.downloader.construct_pdf_url({}, "x")
        self.assertEqual(found, "")

    def test_construct_pdf_url_no_pmcid(self):
        """A record whose pmcid is "N/A" yields an empty string."""
        metadata = {"records": [{"pmcid": "N/A", "doi": "10.1/x"}]}
        found = self.downloader.construct_pdf_url(metadata, "x")
        self.assertEqual(found, "")

    def test_fetch_pdf_url_with_fallbacks_europe_pmc_success(self):
        """Europe PMC answering stops the chain before scraping and direct PMC."""
        oa, europe, scrape, direct = self._stub_chain("", "http://eu.pdf", "", "")

        found = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(found, "http://eu.pdf")
        oa.assert_called_once_with("PMC123456")
        europe.assert_called_once_with("PMC123456")
        scrape.assert_not_called()
        direct.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_multiple_sources(self):
        """Page scraping answers after OA and Europe PMC both come back empty."""
        oa, europe, scrape, direct = self._stub_chain("", "", "http://test.pdf", "")

        found = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(found, "http://test.pdf")
        for consulted in (oa, europe, scrape):
            consulted.assert_called_once_with("PMC123456")
        direct.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_direct_pmc_success(self):
        """The direct PMC URL answers after the first three sources come back empty."""
        stubs = self._stub_chain("", "", "", "http://direct.pdf")

        found = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(found, "http://direct.pdf")
        for consulted in stubs:
            consulted.assert_called_once_with("PMC123456")

    def test_fetch_pdf_url_with_fallbacks_all_fail(self):
        """Every source is tried once and the result is an empty string."""
        stubs = self._stub_chain("", "", "", "")

        found = self.downloader.fetch_pdf_url_with_fallbacks_public("PMC123456")

        self.assertEqual(found, "")
        for consulted in stubs:
            consulted.assert_called_once_with("PMC123456")

    def test_identifier_helper_wrappers(self):
        """Covers _get_paper_identifier_info and _add_service_identifier via wrappers."""
        summary = self.downloader.get_paper_identifier_info_public(
            {"PMID": "12345678", "PMCID": "PMC9"}
        )
        self.assertIn("PMID: 12345678", summary)
        self.assertIn("PMCID: PMC9", summary)

        # The helper fills the passed-in dict in place.
        entry = {}
        self.downloader.add_service_identifier_public(entry, "11122233")
        expected_entry = {
            "PMID": "11122233",
            "PMCID": "N/A",
            "DOI": "N/A",
            "Journal": "N/A",
        }
        self.assertEqual(entry, expected_entry)
565
+
566
+
567
class TestPubmedDownloaderIntegration(unittest.TestCase):
    """End-to-end PMID -> PDF URL flows with only the HTTP layer mocked."""

    def setUp(self):
        """Create a shim downloader backed by a fully populated config."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @staticmethod
    def _metadata_reply():
        """Return a mock ID-converter response holding a single record."""
        reply = Mock()
        reply.json.return_value = {
            "records": [
                {"pmid": "12345678", "pmcid": "PMC123456", "doi": "10.1234/test"}
            ]
        }
        reply.raise_for_status = Mock()
        return reply

    @patch("requests.get")
    def test_full_workflow_pmid_to_pdf(self, mock_get):
        """Metadata lookup followed by an OA API hit resolves the PDF URL."""
        metadata_response = self._metadata_reply()

        oa_response = Mock()
        oa_response.text = """<?xml version="1.0" encoding="UTF-8"?>
<OA><records><record>
<link format="pdf"
href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/test.pdf"/>
</record></records></OA>"""
        oa_response.raise_for_status = Mock()

        # URL marker -> canned reply; checked in insertion order.
        routes = {"idconv": metadata_response, "oa.fcgi": oa_response}

        def get_side_effect(url, *_, **__):
            for marker, reply in routes.items():
                if marker in url:
                    return reply
            return None

        mock_get.side_effect = get_side_effect

        identifier = "12345678"
        metadata = self.downloader.fetch_metadata(identifier)
        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)

        record = metadata["records"][0]
        self.assertEqual(record["pmid"], "12345678")
        self.assertEqual(record["pmcid"], "PMC123456")
        self.assertIn("PMC123456", pdf_url)
        self.assertTrue(pdf_url.startswith("https://"))

        # Exactly two GETs: the ID converter first, then the OA API.
        self.assertEqual(mock_get.call_count, 2)
        first_call, second_call = mock_get.call_args_list
        self.assertIn("idconv", first_call.args[0])
        self.assertIn("oa.fcgi", second_call.args[0])

        # Test the None return path in get_side_effect
        self.assertIsNone(get_side_effect("https://unknown-api.com/test"))

    @patch("requests.head")
    @patch("requests.get")
    def test_workflow_with_fallback_sources(self, mock_get, mock_head):
        """OA error and a 404 HEAD probe fall through to page scraping."""
        metadata_response = self._metadata_reply()

        oa_response = Mock()
        oa_response.text = """<?xml version="1.0" encoding="UTF-8"?>
<OA><error code="idDoesNotExist">Invalid PMC ID</error></OA>"""
        oa_response.raise_for_status = Mock()

        scrape_response = Mock()
        page_markup = (
            '<html><head><meta name="citation_pdf_url" '
            'content="https://www.ncbi.nlm.nih.gov/pmc/articles/'
            'PMC123456/pdf/fallback.pdf"></head></html>'
        )
        scrape_response.content = page_markup.encode()
        scrape_response.raise_for_status = Mock()

        # GETs are answered in call order: metadata, OA API, article page.
        mock_get.side_effect = [metadata_response, oa_response, scrape_response]
        mock_head.return_value.status_code = 404

        identifier = "12345678"
        metadata = self.downloader.fetch_metadata(identifier)
        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)

        self.assertEqual(
            pdf_url,
            "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC123456/pdf/fallback.pdf",
        )
        self.assertEqual(mock_get.call_count, 3)
        mock_head.assert_called_once()
668
+
669
+
670
class TestPubmedDownloaderOAAPINoLink(unittest.TestCase):
    """OA API responses that parse fine but carry no PDF link."""

    def setUp(self):
        """Create a shim downloader with only the OA/FTP/HTTPS URLs set."""
        settings = {
            "id_converter_url": "",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "",
            "pmc_page_base_url": "",
            "direct_pmc_pdf_base_url": "",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    @patch("requests.get")
    def test_oa_api_ok_but_no_pdf_link_returns_empty(self, mock_get):
        """A record offering only a tgz link yields an empty string."""
        reply = Mock()
        reply.text = """<?xml version="1.0"?>
<OA><records><record><link format="tgz" href="https://x/y.tgz"/></record></records></OA>"""
        reply.raise_for_status = Mock()
        mock_get.return_value = reply

        found = self.downloader.try_oa_api_public("PMCNOPOINTER")

        self.assertEqual(found, "")
699
+
700
+
701
class TestPubmedDownloaderExtractMetadata(unittest.TestCase):
    """Check the dictionaries produced by ``extract_paper_metadata``."""

    def setUp(self):
        """Create a shim downloader from a config with every URL left blank."""
        blank_urls = dict.fromkeys(
            (
                "id_converter_url",
                "oa_api_url",
                "europe_pmc_base_url",
                "pmc_page_base_url",
                "direct_pmc_pdf_base_url",
                "ftp_base_url",
                "https_base_url",
            ),
            "",
        )
        settings = SimpleNamespace(
            **blank_urls, user_agent="ua", request_timeout=5, chunk_size=1024
        )
        self.downloader = PubmedDownloaderTestShim(settings)

    def test_extract_metadata_raises_when_no_records(self):
        """Metadata without any records must raise ``RuntimeError``."""
        self.assertRaises(
            RuntimeError, self.downloader.extract_paper_metadata, {}, "123", None
        )

    def test_extract_metadata_with_pdf_result(self):
        """A ``(path, filename)`` PDF result is reflected in the extracted fields."""
        records = {"records": [{"pmcid": "PMC1", "doi": "10.1/x"}]}
        downloaded = ("/tmp/file.pdf", "custom.pdf")

        extracted = self.downloader.extract_paper_metadata(
            records, "12345678", downloaded
        )

        expected = {
            "access_type": "open_access_downloaded",
            "URL": "/tmp/file.pdf",
            "pdf_url": "/tmp/file.pdf",
            "filename": "custom.pdf",
            "PMCID": "PMC1",
            "PMID": "12345678",
        }
        self.assertEqual({key: extracted[key] for key in expected}, expected)

    def test_extract_metadata_without_pdf_with_pmcid(self):
        """No PDF but a real PMCID gives an abstract-only entry with empty URLs."""
        records = {"records": [{"pmcid": "PMC9", "doi": "10.1/x"}]}

        extracted = self.downloader.extract_paper_metadata(records, "42", None)

        expected = {
            "access_type": "abstract_only",
            "filename": "pmid_42.pdf",
            "URL": "",
            "pdf_url": "",
        }
        self.assertEqual({key: extracted[key] for key in expected}, expected)

    def test_extract_metadata_without_pdf_no_pmcid(self):
        """No PDF and an ``N/A`` PMCID is reported with access type ``no_pmcid``."""
        records = {"records": [{"pmcid": "N/A", "doi": "10.1/x"}]}

        extracted = self.downloader.extract_paper_metadata(records, "42", None)

        expected = {"access_type": "no_pmcid", "filename": "pmid_42.pdf"}
        self.assertEqual({key: extracted[key] for key in expected}, expected)
752
+
753
+
754
class TestPubmedDownloaderHelpers(unittest.TestCase):
    """Cover the small naming, snippet and identifier-formatting helpers."""

    def setUp(self):
        """Create a shim downloader from a config with every URL left blank."""
        blank_urls = dict.fromkeys(
            (
                "id_converter_url",
                "oa_api_url",
                "europe_pmc_base_url",
                "pmc_page_base_url",
                "direct_pmc_pdf_base_url",
                "ftp_base_url",
                "https_base_url",
            ),
            "",
        )
        settings = SimpleNamespace(
            **blank_urls, user_agent="ua", request_timeout=5, chunk_size=1024
        )
        self.downloader = PubmedDownloaderTestShim(settings)

    def test_service_and_identifier_names_and_default_filename(self):
        """Service name, identifier name and default filename have fixed values."""
        downloader = self.downloader
        self.assertEqual(downloader.get_service_name(), "PubMed")
        self.assertEqual(downloader.get_identifier_name(), "PMID")
        self.assertEqual(downloader.get_default_filename("777"), "pmid_777.pdf")

    def test_get_snippet_placeholders_return_empty(self):
        """Empty or placeholder abstracts produce an empty snippet."""
        for placeholder in ("", "N/A", "Abstract available in PubMed"):
            self.assertEqual(self.downloader.get_snippet(placeholder), "")

    def test_get_snippet_non_placeholder_delegates_to_base(self):
        """A real abstract is forwarded to ``BasePaperDownloader.get_snippet``."""
        abstract = "Real abstract text"
        with patch.object(
            BasePaperDownloader, "get_snippet", return_value="SNIP"
        ) as base_snippet:
            snippet = self.downloader.get_snippet(abstract)

        base_snippet.assert_called_once_with(abstract)
        self.assertEqual(snippet, "SNIP")

    def test_get_paper_identifier_info_without_pmcid_line(self):
        """An ``N/A`` PMCID is left out of the formatted identifier info."""
        paper = {"PMID": "999", "PMCID": "N/A"}

        formatted = self.downloader.get_paper_identifier_info_public(paper)

        self.assertIn("(PMID: 999)", formatted)
        self.assertNotIn("PMCID:", formatted)
801
+
802
+
803
class TestPubmedDownloaderMissingLineCoverage(unittest.TestCase):
    """Cover ``construct_pdf_url`` early exits and the PDF URL fallback chain.

    The first group of tests feeds ``construct_pdf_url`` metadata that has no
    usable PMCID and expects an empty URL. The second group replaces the four
    fallback strategies with mocks and checks that
    ``fetch_pdf_url_with_fallbacks_production`` tries them in order and stops
    at the first one that returns a URL.
    """

    # Dotted path of the module-level logger replaced in the logging tests.
    _LOGGER_TARGET = (
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "pubmed_downloader.logger"
    )
    # Fallback strategies, in the order these tests expect them to be attempted.
    _STRATEGIES = (
        "_try_oa_api",
        "_try_europe_pmc",
        "_try_pmc_page_scraping",
        "_try_direct_pmc_url",
    )

    def setUp(self):
        """Create a shim downloader configured with realistic endpoint URLs."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        self.downloader = PubmedDownloaderTestShim(SimpleNamespace(**settings))

    def _start(self, patcher):
        """Start ``patcher`` and register its ``stop`` to run after the test."""
        started = patcher.start()
        self.addCleanup(patcher.stop)
        return started

    def _mock_strategies(self, *results):
        """Replace every fallback strategy on the downloader with a mock.

        Args:
            *results: Return values for the leading strategies, given in
                ``_STRATEGIES`` order. Strategies beyond ``len(results)`` are
                patched with a default mock that has no configured return
                value; the tests expect those never to be called.

        Returns:
            The four started mocks, in ``_STRATEGIES`` order.
        """
        mocks = []
        for position, name in enumerate(self._STRATEGIES):
            options = (
                {"return_value": results[position]}
                if position < len(results)
                else {}
            )
            mocks.append(
                self._start(patch.object(self.downloader, name, **options))
            )
        return mocks

    def _fetch(self):
        """Run the fallback chain through the shim's public wrapper."""
        return self.downloader.fetch_pdf_url_with_fallbacks_production("PMC123456")

    def test_construct_pdf_url_empty_records_list(self):
        """An empty ``records`` list yields an empty URL."""
        url = self.downloader.construct_pdf_url({"records": []}, "12345678")
        self.assertEqual(url, "")

    def test_construct_pdf_url_missing_records_key(self):
        """Metadata without a ``records`` key yields an empty URL."""
        url = self.downloader.construct_pdf_url({"other_key": "value"}, "12345678")
        self.assertEqual(url, "")

    def test_construct_pdf_url_empty_pmcid_string(self):
        """A record whose ``pmcid`` is an empty string yields an empty URL."""
        metadata = {"records": [{"pmcid": "", "doi": "10.1/x"}]}
        self.assertEqual(self.downloader.construct_pdf_url(metadata, "12345678"), "")

    def test_construct_pdf_url_missing_pmcid_key(self):
        """A record without a ``pmcid`` key yields an empty URL."""
        metadata = {"records": [{"doi": "10.1/x"}]}
        self.assertEqual(self.downloader.construct_pdf_url(metadata, "12345678"), "")

    @patch.object(PubmedDownloaderTestShim, "_fetch_pdf_url_with_fallbacks")
    def test_fetch_pdf_url_with_fallbacks_logging_and_return(self, mock_fallbacks):
        """The public wrapper passes the PMCID through and returns the result."""
        # NOTE(review): the method named in this test is itself replaced by a
        # mock here, so only the wrapper's dispatch is exercised. Confirm
        # against the shim that this is the intended scope.
        mock_fallbacks.return_value = "http://test.pdf"

        result = self._fetch()

        mock_fallbacks.assert_called_once_with("PMC123456")
        self.assertEqual(result, "http://test.pdf")

    def test_fetch_pdf_url_with_fallbacks_all_fail_with_logging(self):
        """When every strategy returns ``""`` the result is empty and a warning is logged."""
        self._mock_strategies("", "", "", "")
        logger = self._start(patch(self._LOGGER_TARGET))

        result = self._fetch()

        self.assertEqual(result, "")
        logger.warning.assert_called_once_with(
            "All PDF URL strategies failed for PMCID: %s", "PMC123456"
        )

    def test_fetch_pdf_url_with_fallbacks_oa_api_success_early_return(self):
        """A URL from the OA API is returned without trying later strategies."""
        oa_api, europe_pmc, scraping, direct = self._mock_strategies("http://oa.pdf")
        logger = self._start(patch(self._LOGGER_TARGET))

        result = self._fetch()

        self.assertEqual(result, "http://oa.pdf")
        oa_api.assert_called_once_with("PMC123456")
        europe_pmc.assert_not_called()
        scraping.assert_not_called()
        direct.assert_not_called()
        # assert_any_call, not assert_called_with: the latter only inspects the
        # most recent info() call, while the intent is that the initial log
        # line was emitted at some point.
        logger.info.assert_any_call("Fetching PDF URL for PMCID: %s", "PMC123456")

    def test_fetch_pdf_url_with_fallbacks_europe_pmc_success_after_oa_fail(self):
        """Europe PMC is used when the OA API returns nothing."""
        oa_api, europe_pmc, scraping, direct = self._mock_strategies(
            "", "http://eu.pdf"
        )

        result = self._fetch()

        self.assertEqual(result, "http://eu.pdf")
        oa_api.assert_called_once_with("PMC123456")
        europe_pmc.assert_called_once_with("PMC123456")
        scraping.assert_not_called()
        direct.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_pmc_scraping_success_after_previous_fail(
        self,
    ):
        """PMC page scraping is used when the OA API and Europe PMC return nothing."""
        oa_api, europe_pmc, scraping, direct = self._mock_strategies(
            "", "", "http://scr.pdf"
        )

        result = self._fetch()

        self.assertEqual(result, "http://scr.pdf")
        oa_api.assert_called_once_with("PMC123456")
        europe_pmc.assert_called_once_with("PMC123456")
        scraping.assert_called_once_with("PMC123456")
        direct.assert_not_called()

    def test_fetch_pdf_url_with_fallbacks_direct_pmc_success_last_resort(self):
        """The direct PMC URL is used when all three earlier strategies return nothing."""
        oa_api, europe_pmc, scraping, direct = self._mock_strategies(
            "", "", "", "http://dir.pdf"
        )

        result = self._fetch()

        self.assertEqual(result, "http://dir.pdf")
        oa_api.assert_called_once_with("PMC123456")
        europe_pmc.assert_called_once_with("PMC123456")
        scraping.assert_called_once_with("PMC123456")
        direct.assert_called_once_with("PMC123456")
978
+
979
+
980
class TestPubmedDownloaderProductionConstructPdfUrl(unittest.TestCase):
    """Run ``construct_pdf_url`` on the real ``PubmedDownloader``, not the shim."""

    def setUp(self):
        """Instantiate the production downloader with realistic endpoint URLs."""
        settings = {
            "id_converter_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0",
            "oa_api_url": "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi",
            "europe_pmc_base_url": "https://www.ebi.ac.uk/europepmc/webservices/rest",
            "pmc_page_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "direct_pmc_pdf_base_url": "https://www.ncbi.nlm.nih.gov/pmc/articles",
            "ftp_base_url": "ftp://ftp.ncbi.nlm.nih.gov/pub/pmc",
            "https_base_url": "https://www.ncbi.nlm.nih.gov/pmc",
            "user_agent": "Mozilla/5.0 (compatible; test-agent)",
            "request_timeout": 30,
            "chunk_size": 8192,
        }
        # The production class is used on purpose, not the test shim.
        self.downloader = PubmedDownloader(SimpleNamespace(**settings))

    def _assert_no_pmcid_logged(self, record):
        """Check that ``record`` gives an empty URL and one 'No PMCID' info log."""
        logger_path = (
            "aiagents4pharma.talk2scholars.tools.paper_download.utils."
            "pubmed_downloader.logger"
        )
        with patch(logger_path) as fake_logger:
            url = self.downloader.construct_pdf_url({"records": [record]}, "12345678")

        self.assertEqual(url, "")
        fake_logger.info.assert_called_once_with(
            "No PMCID available for PDF fetch: PMID %s", "12345678"
        )

    def test_production_construct_pdf_url_no_records_key(self):
        """Metadata without a ``records`` key yields an empty URL."""
        url = self.downloader.construct_pdf_url({"other_key": "value"}, "12345678")
        self.assertEqual(url, "")

    def test_production_construct_pdf_url_empty_records_list(self):
        """An empty ``records`` list yields an empty URL."""
        url = self.downloader.construct_pdf_url({"records": []}, "12345678")
        self.assertEqual(url, "")

    def test_production_construct_pdf_url_missing_pmcid_key(self):
        """A record without a ``pmcid`` key is logged as having no PMCID."""
        self._assert_no_pmcid_logged({"doi": "10.1/x"})

    def test_production_construct_pdf_url_empty_pmcid(self):
        """A record with an empty ``pmcid`` is logged as having no PMCID."""
        self._assert_no_pmcid_logged({"pmcid": "", "doi": "10.1/x"})

    def test_production_construct_pdf_url_na_pmcid(self):
        """A record whose ``pmcid`` is ``N/A`` is logged as having no PMCID."""
        self._assert_no_pmcid_logged({"pmcid": "N/A", "doi": "10.1/x"})

    def test_production_construct_pdf_url_valid_pmcid_calls_fallbacks(self):
        """A real PMCID is handed to ``_fetch_pdf_url_with_fallbacks``."""
        metadata = {"records": [{"pmcid": "PMC123456", "doi": "10.1/x"}]}
        fallback_patch = patch.object(
            self.downloader,
            "_fetch_pdf_url_with_fallbacks",
            return_value="http://test.pdf",
        )
        with fallback_patch as fallbacks:
            url = self.downloader.construct_pdf_url(metadata, "12345678")

        self.assertEqual(url, "http://test.pdf")
        fallbacks.assert_called_once_with("PMC123456")