aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py
@@ -0,0 +1,697 @@
+"""
+Unit tests for BiorxivDownloader.
+Tests CloudScraper integration, JSON API interaction, and PDF download with CloudFlare protection.
+"""
+
+import unittest
+from unittest.mock import Mock, patch
+
+import requests
+
+from aiagents4pharma.talk2scholars.tools.paper_download.utils.biorxiv_downloader import (
+    BiorxivDownloader,
+)
+
+
+class BiorxivDownloaderTestShim(BiorxivDownloader):
+    """biorxiv_downloader test shim to expose protected methods."""
+
+    __test__ = False
+
+    def set_scraper(self, scraper):
+        """set_scraper is a public method to set the scraper."""
+        self._scraper = scraper
+
+    def get_scraper_public(self):
+        """get_scraper_public is a public method to access the scraper."""
+        return self._get_scraper()
+
+    def visit_landing_page_public(self, scraper, pdf_url, identifier):
+        """call visit_landing_page with public access."""
+        return self._visit_landing_page(scraper, pdf_url, identifier)
+
+    def save_pdf_to_temp_public(self, response):
+        """save_pdf_to_temp_public is a public method to save PDF response."""
+        return self._save_pdf_to_temp(response)
+
+    def extract_filename_public(self, response, identifier):
+        """extract_filename_public is a public method to extract filename from response."""
+        return self._extract_filename(response, identifier)
+
+    def extract_basic_metadata_public(self, paper, identifier):
+        """extract_basic_metadata_public is a public method to extract basic metadata."""
+        return self._extract_basic_metadata(paper, identifier)
+
+    def extract_authors_public(self, authors_str):
+        """extract_authors_public is a public method to extract authors from a string."""
+        return self._extract_authors(authors_str)
+
+    def get_paper_identifier_info_public(self, paper):
+        """get_paper_identifier_info_public is a public method to get paper identifier info."""
+        return self._get_paper_identifier_info(paper)
+
+    def add_service_identifier_public(self, entry, identifier):
+        """add_service_identifier_public is a public method to add service identifier."""
+        self._add_service_identifier(entry, identifier)
+
+
+class TestBiorxivDownloader(unittest.TestCase):
+    """Tests for the BiorxivDownloader class."""
+
+    @patch("cloudscraper.create_scraper")
+    def setUp(self, mock_create_scraper):
+        """Set up test fixtures."""
+        self.mock_config = Mock()
+        self.mock_config.api_url = "https://api.biorxiv.org/details"
+        self.mock_config.pdf_url_template = (
+            "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+        )
+        self.mock_config.user_agent = "test-agent"
+        self.mock_config.cf_clearance_timeout = 10
+        self.mock_config.request_timeout = 30
+        self.mock_config.chunk_size = 8192
+        self.mock_config.session_reuse = True
+        self.mock_config.default_version = "1"
+        self.mock_config.browser_config = {"type": "custom"}
+
+        # Mock the scraper creation during initialization
+        mock_scraper = Mock()
+        mock_create_scraper.return_value = mock_scraper
+
+        self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+        self.initial_scraper = mock_scraper
+        self.downloader.set_scraper(mock_scraper)
+
+        # Sample bioRxiv API response
+        self.sample_json_response = {
+            "collection": [
+                {
+                    "title": "Test BioRxiv Paper",
+                    "authors": "John Doe; Jane Smith",
+                    "abstract": "This is a test abstract for bioRxiv paper.",
+                    "date": "2023-01-01",
+                    "category": "Biochemistry",
+                    "version": "2",
+                    "doi": "10.1101/2023.01.01.123456",
+                }
+            ]
+        }
+
+    def test_initialization(self):
+        """Test BiorxivDownloader initialization."""
+        self.assertEqual(self.downloader.api_url, "https://api.biorxiv.org/details")
+        self.assertEqual(
+            self.downloader.pdf_url_template,
+            "https://www.biorxiv.org/content/{doi}v{version}.full.pdf",
+        )
+        self.assertEqual(self.downloader.user_agent, "test-agent")
+        self.assertEqual(self.downloader.cf_clearance_timeout, 10)
+        self.assertIsNotNone(self.downloader.get_scraper_public())
+
+    def test_fetch_metadata_success(self):
+        """Test successful metadata fetching from bioRxiv API."""
+        mock_scraper = Mock()
+        mock_response = Mock()
+        mock_response.json.return_value = self.sample_json_response
+        mock_response.raise_for_status = Mock()
+        mock_scraper.get.return_value = mock_response
+
+        # Mock the existing scraper
+        self.downloader.set_scraper(mock_scraper)
+
+        result = self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+        # Verify API call
+        expected_url = (
+            "https://api.biorxiv.org/details/biorxiv/10.1101/2023.01.01.123456/na/json"
+        )
+        mock_scraper.get.assert_called_once_with(expected_url, timeout=30)
+        mock_response.raise_for_status.assert_called_once()
+
+        # Verify JSON parsing
+        self.assertEqual(result, self.sample_json_response)
+
+    def test_fetch_metadata_network_error(self):
+        """Test fetch_metadata with network error."""
+        mock_scraper = Mock()
+        mock_scraper.get.side_effect = requests.RequestException("Network error")
+        self.downloader.set_scraper(mock_scraper)
+
+        with self.assertRaises(requests.RequestException):
+            self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+    def test_fetch_metadata_no_collection_data(self):
+        """Test fetch_metadata when API response has no collection data."""
+        mock_scraper = Mock()
+        mock_response = Mock()
+        mock_response.json.return_value = {}  # Empty response
+        mock_response.raise_for_status = Mock()
+        mock_scraper.get.return_value = mock_response
+        self.downloader.set_scraper(mock_scraper)
+
+        with self.assertRaises(RuntimeError) as context:
+            self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+        self.assertIn("No collection data found", str(context.exception))
+
+    def test_construct_pdf_url_variants(self):
+        """PDF URL construction: normal, missing collection, default version."""
+        # Success
+        self.assertEqual(
+            self.downloader.construct_pdf_url(
+                self.sample_json_response, "10.1101/2023.01.01.123456"
+            ),
+            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v2.full.pdf",
+        )
+        # No collection
+        self.assertEqual(
+            self.downloader.construct_pdf_url({}, "10.1101/2023.01.01.123456"),
+            "",
+        )
+        # Default version
+        meta_default = {"collection": [{"title": "Test Paper"}]}
+        self.assertEqual(
+            self.downloader.construct_pdf_url(
+                meta_default, "10.1101/2023.01.01.123456"
+            ),
+            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
+        )
+
+    @patch("tempfile.NamedTemporaryFile")
+    def test_download_pdf_to_temp_success(self, mock_tempfile):
+        """Test successful PDF download with CloudScraper."""
+        # Setup mock scraper
+        mock_scraper = Mock()
+        self.downloader.set_scraper(mock_scraper)
+
+        # Mock landing page response
+        mock_landing_response = Mock()
+        mock_landing_response.raise_for_status = Mock()
+
+        # Mock PDF download response
+        mock_pdf_response = Mock()
+        mock_pdf_response.raise_for_status = Mock()
+        mock_pdf_response.iter_content.return_value = [
+            b"PDF content chunk 1",
+            b"PDF content chunk 2",
+        ]
+        mock_pdf_response.headers = {
+            "Content-Disposition": 'attachment; filename="paper.pdf"'
+        }
+
+        mock_scraper.get.side_effect = [mock_landing_response, mock_pdf_response]
+
+        # Mock temporary file
+        mock_temp_file = Mock()
+        mock_temp_file.name = "/tmp/test.pdf"
+        mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+        mock_temp_file.__exit__ = Mock(return_value=None)
+        mock_tempfile.return_value = mock_temp_file
+
+        pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+        result = self.downloader.download_pdf_to_temp(
+            pdf_url, "10.1101/2023.01.01.123456"
+        )
+
+        # Verify result
+        self.assertEqual(result, ("/tmp/test.pdf", "paper.pdf"))
+
+        # Verify landing page visit
+        landing_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+        mock_scraper.get.assert_any_call(landing_url, timeout=30)
+
+        # Verify PDF download
+        mock_scraper.get.assert_any_call(pdf_url, timeout=30, stream=True)
+
+        # Verify file writing
+        mock_temp_file.write.assert_any_call(b"PDF content chunk 1")
+        mock_temp_file.write.assert_any_call(b"PDF content chunk 2")
+
+    def test_download_pdf_to_temp_error_variants(self):
+        """Download errors: empty URL and network failure."""
+        # Empty URL
+        self.assertIsNone(self.downloader.download_pdf_to_temp("", "10.1101/x"))
+
+        # Network error
+        mock_scraper = Mock()
+        mock_scraper.get.side_effect = requests.RequestException("Network error")
+        self.downloader.set_scraper(mock_scraper)
+        url = "https://www.biorxiv.org/content/10.1101/xv1.full.pdf"
+        self.assertIsNone(self.downloader.download_pdf_to_temp(url, "10.1101/x"))
+
+    @patch("cloudscraper.create_scraper")
+    def test_get_scraper_new_and_existing(self, mock_create):
+        """_get_scraper creates when missing and reuses when present."""
+        # New scraper
+        self.downloader.set_scraper(None)
+        new_scraper = Mock()
+        mock_create.return_value = new_scraper
+        got = self.downloader.get_scraper_public()
+        self.assertIs(got, new_scraper)
+        mock_create.assert_called_once_with(browser={"custom": "test-agent"}, delay=10)
+
+        # Existing scraper
+        self.downloader.set_scraper(new_scraper)
+        got2 = self.downloader.get_scraper_public()
+        self.assertIs(got2, new_scraper)
+
+    def test_visit_landing_page_variants(self):
+        """Landing page visit happens only for .full.pdf URLs."""
+        mock_scraper = Mock()
+        ok = Mock()
+        ok.raise_for_status = Mock()
+        mock_scraper.get.return_value = ok
+
+        # Case 1: with .full.pdf -> should visit landing
+        pdf_url_full = (
+            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+        )
+        self.downloader.visit_landing_page_public(
+            mock_scraper, pdf_url_full, "10.1101/2023.01.01.123456"
+        )
+        expected = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+        mock_scraper.get.assert_called_with(expected, timeout=30)
+
+        # Case 2: no .full.pdf -> no call
+        mock_scraper.get.reset_mock()
+        pdf_url_plain = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+        self.downloader.visit_landing_page_public(
+            mock_scraper, pdf_url_plain, "10.1101/2023.01.01.123456"
+        )
+        mock_scraper.get.assert_not_called()
+
+    @patch("tempfile.NamedTemporaryFile")
+    def test_save_pdf_to_temp(self, mock_tempfile):
+        """Test saving PDF response to temporary file."""
+        mock_response = Mock()
+        mock_response.iter_content.return_value = [
+            b"chunk1",
+            b"chunk2",
+            None,
+            b"chunk3",
+        ]  # Include None chunk
+
+        mock_temp_file = Mock()
+        mock_temp_file.name = "/tmp/saved.pdf"
+        mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+        mock_temp_file.__exit__ = Mock(return_value=None)
+        mock_tempfile.return_value = mock_temp_file
+
+        result = self.downloader.save_pdf_to_temp_public(mock_response)
+
+        self.assertEqual(result, "/tmp/saved.pdf")
+
+        # Verify chunks were written (None chunk should be skipped)
+        mock_temp_file.write.assert_any_call(b"chunk1")
+        mock_temp_file.write.assert_any_call(b"chunk2")
+        mock_temp_file.write.assert_any_call(b"chunk3")
+        self.assertEqual(mock_temp_file.write.call_count, 3)
+
+    def test_extract_filename_variants(self):
+        """Filename extraction across header variants and regex-exception path."""
+        cases = [
+            (
+                {"Content-Disposition": 'attachment; filename="test-paper.pdf"'},
+                "test-paper.pdf",
+                False,
+            ),
+            ({}, "default.pdf", False),
+            ({"Content-Disposition": "invalid header format"}, "default.pdf", False),
+            (
+                {"Content-Disposition": 'attachment; filename="test.pdf"'},
+                "default.pdf",
+                True,
+            ),  # trigger exception path
+        ]
+        for headers, expected, raise_regex in cases:
+            with self.subTest(
+                headers=headers, expected=expected, raise_regex=raise_regex
+            ):
+                resp = Mock()
+                resp.headers = headers
+                if raise_regex:
+                    with patch(
+                        "re.search",
+                        side_effect=requests.RequestException("Regex error"),
+                    ):
+                        with patch.object(
+                            self.downloader,
+                            "get_default_filename",
+                            return_value="default.pdf",
+                        ):
+                            got = self.downloader.extract_filename_public(
+                                resp, "10.1101/test"
+                            )
+                else:
+                    with patch.object(
+                        self.downloader,
+                        "get_default_filename",
+                        return_value="default.pdf",
+                    ):
+                        got = self.downloader.extract_filename_public(
+                            resp, "10.1101/test"
+                        )
+                self.assertEqual(got, expected)
+
+    def test_extract_paper_metadata_success(self):
+        """Test successful paper metadata extraction."""
+        metadata = self.sample_json_response
+        pdf_result = ("/tmp/paper.pdf", "biorxiv_paper.pdf")
+
+        result = self.downloader.extract_paper_metadata(
+            metadata, "10.1101/2023.01.01.123456", pdf_result
+        )
+
+        expected = {
+            "Title": "Test BioRxiv Paper",
+            "Authors": ["John Doe", "Jane Smith"],
+            "Abstract": "This is a test abstract for bioRxiv paper.",
+            "Publication Date": "2023-01-01",
+            "DOI": "10.1101/2023.01.01.123456",
+            "Category": "Biochemistry",
+            "Version": "2",
+            "source": "biorxiv",
+            "server": "biorxiv",
+            "URL": "/tmp/paper.pdf",
+            "pdf_url": "/tmp/paper.pdf",
+            "filename": "biorxiv_paper.pdf",
+            "access_type": "open_access_downloaded",
+            "temp_file_path": "/tmp/paper.pdf",
+        }
+
+        self.assertEqual(result, expected)
+
+    def test_extract_paper_metadata_no_pdf_result(self):
+        """Test metadata extraction when PDF download failed."""
+        metadata = self.sample_json_response
+        pdf_result = None  # No PDF download result
+
+        result = self.downloader.extract_paper_metadata(
+            metadata, "10.1101/2023.01.01.123456", pdf_result
+        )
+
+        # Should still have basic metadata but with download_failed access type
+        self.assertEqual(result["Title"], "Test BioRxiv Paper")
+        self.assertEqual(result["access_type"], "download_failed")
+        self.assertEqual(result["URL"], "")
+        self.assertEqual(result["pdf_url"], "")
+        self.assertEqual(result["temp_file_path"], "")
+        self.assertEqual(
+            result["filename"], "10_1101_2023_01_01_123456.pdf"
+        )  # Default filename
+
+    def test_extract_paper_metadata_no_collection(self):
+        """Test metadata extraction with missing collection."""
+        metadata = {}
+
+        with self.assertRaises(RuntimeError) as context:
+            self.downloader.extract_paper_metadata(
+                metadata, "10.1101/2023.01.01.123456", None
+            )
+
+        self.assertIn("No collection data found", str(context.exception))
+
+    def test_extract_basic_metadata(self):
+        """Test basic metadata extraction helper method."""
+        paper = self.sample_json_response["collection"][0]
+
+        result = self.downloader.extract_basic_metadata_public(
+            paper, "10.1101/2023.01.01.123456"
+        )
+
+        expected = {
+            "Title": "Test BioRxiv Paper",
+            "Authors": ["John Doe", "Jane Smith"],
+            "Abstract": "This is a test abstract for bioRxiv paper.",
+            "Publication Date": "2023-01-01",
+            "DOI": "10.1101/2023.01.01.123456",
+            "Category": "Biochemistry",
+            "Version": "2",
+            "source": "biorxiv",
+            "server": "biorxiv",
+        }
+
+        self.assertEqual(result, expected)
+
+    def test_extract_authors_variants(self):
+        """Author parsing for semicolon list and empty string."""
+        self.assertEqual(
+            self.downloader.extract_authors_public("John Doe; Jane Smith; Bob Johnson"),
+            ["John Doe", "Jane Smith", "Bob Johnson"],
+        )
+        self.assertEqual(self.downloader.extract_authors_public(""), [])
+
+    def test_service_and_identifier_helpers(self):
+        """Service name, identifier name, and default filename."""
+        self.assertEqual(self.downloader.get_service_name(), "bioRxiv")
+        self.assertEqual(self.downloader.get_identifier_name(), "DOI")
+        self.assertEqual(
+            self.downloader.get_default_filename("10.1101/2023.01.01.123456"),
+            "10_1101_2023_01_01_123456.pdf",
+        )
+
+    def test_get_paper_identifier_info(self):
+        """Test _get_paper_identifier_info method."""
+        paper = {
+            "DOI": "10.1101/2023.01.01.123456",
+            "Publication Date": "2023-01-01",
+            "Category": "Biology",
+        }
+
+        result = self.downloader.get_paper_identifier_info_public(paper)
+
+        self.assertIn("10.1101/2023.01.01.123456", result)
+        self.assertIn("2023-01-01", result)
+        self.assertIn("Biology", result)
+
+    def test_add_service_identifier(self):
+        """Test _add_service_identifier method."""
+        entry = {}
+
+        self.downloader.add_service_identifier_public(
+            entry, "10.1101/2023.01.01.123456"
+        )
+
+        self.assertEqual(entry["DOI"], "10.1101/2023.01.01.123456")
+        self.assertEqual(entry["server"], "biorxiv")
+
+
+class TestBiorxivDownloaderIntegration(unittest.TestCase):
+    """Integration tests for BiorxivDownloader workflow."""
+
+    @patch("cloudscraper.create_scraper")
+    def setUp(self, mock_create_scraper):
+        """Set up integration test fixtures."""
+        self.mock_config = Mock()
+        self.mock_config.api_url = "https://api.biorxiv.org/details"
+        self.mock_config.pdf_url_template = (
+            "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+        )
+        self.mock_config.user_agent = "test-agent"
+        self.mock_config.cf_clearance_timeout = 10
+        self.mock_config.request_timeout = 30
+        self.mock_config.chunk_size = 8192
+        self.mock_config.session_reuse = True
+        self.mock_config.default_version = "1"
+        self.mock_config.browser_config = {"type": "custom"}
+
+        # Mock the scraper creation during initialization
+        mock_scraper = Mock()
+        mock_create_scraper.return_value = mock_scraper
+
+        self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+
+        self.sample_response = {
+            "collection": [
+                {
+                    "title": "Integration Test Paper",
+                    "authors": "Test Author",
+                    "abstract": "Integration test abstract.",
+                    "date": "2023-01-01",
+                    "category": "Biology",
+                    "version": "1",
+                    "doi": "10.1101/2023.01.01.123456",
+                }
+            ]
+        }
+
+    @patch("tempfile.NamedTemporaryFile")
+    def test_full_paper_processing_workflow(self, mock_tempfile):
+        """Test the complete workflow from DOI to processed paper data."""
+        # Mock scraper responses
+        mock_scraper = Mock()
+        mock_metadata_response = Mock()
+        mock_metadata_response.json.return_value = self.sample_response
+        mock_metadata_response.raise_for_status = Mock()
+
+        # Mock landing page and PDF responses for download
+        mock_landing_response = Mock()
+        mock_landing_response.raise_for_status = Mock()
+
+        mock_pdf_response = Mock()
+        mock_pdf_response.raise_for_status = Mock()
+        mock_pdf_response.iter_content.return_value = [b"PDF data"]
+        mock_pdf_response.headers = {}
+
+        # First call for metadata, then landing page, then PDF download
+        mock_scraper.get.side_effect = [
+            mock_metadata_response,
+            mock_landing_response,
+            mock_pdf_response,
+        ]
+        self.downloader.set_scraper(mock_scraper)
+
+        # Mock temporary file
+        mock_temp_file = Mock()
+        mock_temp_file.name = "/tmp/integration.pdf"
+        mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+        mock_temp_file.__exit__ = Mock(return_value=None)
+        mock_tempfile.return_value = mock_temp_file
+
+        # Simulate the workflow
+        identifier = "10.1101/2023.01.01.123456"
+
+        # Step 1: Fetch metadata
+        metadata = self.downloader.fetch_metadata(identifier)
+
+        # Step 2: Construct PDF URL
+        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
+
+        # Step 3: Download PDF
+        pdf_result = self.downloader.download_pdf_to_temp(pdf_url, identifier)
+
+        # Step 4: Extract metadata
+        paper_data = self.downloader.extract_paper_metadata(
+            metadata, identifier, pdf_result
+        )
+
+        # Verify the complete workflow
+        self.assertEqual(paper_data["Title"], "Integration Test Paper")
+        self.assertEqual(paper_data["Authors"], ["Test Author"])
+        self.assertEqual(paper_data["access_type"], "open_access_downloaded")
+        self.assertEqual(paper_data["temp_file_path"], "/tmp/integration.pdf")
+
+        expected_pdf_url = (
+            "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+        )
+        self.assertEqual(pdf_url, expected_pdf_url)
+
+        # Verify 3 calls: metadata, landing page, PDF
+        self.assertEqual(mock_scraper.get.call_count, 3)
+
+    def test_workflow_with_existing_scraper(self):
+        """Test workflow reusing existing scraper instance."""
+        # Set existing scraper
+        existing_scraper = Mock()
+
+        # Mock API response for metadata
+        mock_response = Mock()
+        mock_response.json.return_value = self.sample_response
+        mock_response.raise_for_status = Mock()
+        existing_scraper.get.return_value = mock_response
+
+        self.downloader.set_scraper(existing_scraper)
+
+        identifier = "10.1101/2023.01.01.123456"
+        metadata = self.downloader.fetch_metadata(identifier)
+        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
+
+        # Try to download (will use existing scraper)
+        with patch("tempfile.NamedTemporaryFile"):
+            # Reset the mock and set up responses for landing + PDF
+            existing_scraper.reset_mock()
+            mock_landing = Mock()
+            mock_landing.raise_for_status = Mock()
+            mock_pdf = Mock()
+            mock_pdf.raise_for_status = Mock()
+            mock_pdf.iter_content.return_value = [b"data"]
+            mock_pdf.headers = {}
+            existing_scraper.get.side_effect = [mock_landing, mock_pdf]
+
+            self.downloader.download_pdf_to_temp(pdf_url, identifier)
+
+            # Should have used existing scraper for landing + PDF (2 calls)
+            self.assertEqual(existing_scraper.get.call_count, 2)
+
+
+class TestBiorxivCloudFlareHandling(unittest.TestCase):
+    """Tests specific to CloudFlare protection handling."""
+
+    @patch("cloudscraper.create_scraper")
+    def setUp(self, mock_create_scraper):
+        """Set up CloudFlare handling test fixtures."""
+        self.mock_config = Mock()
+        self.mock_config.api_url = "https://api.biorxiv.org/details"
+        self.mock_config.pdf_url_template = (
+            "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+        )
+        self.mock_config.user_agent = "Mozilla/5.0 (compatible; test-agent)"
+        self.mock_config.cf_clearance_timeout = 15
+        self.mock_config.request_timeout = 30
+        self.mock_config.chunk_size = 8192
+        self.mock_config.session_reuse = True
+        self.mock_config.default_version = "1"
+        self.mock_config.browser_config = {"type": "custom"}
+
+        # Mock the scraper creation during initialization
+        mock_scraper = Mock()
+        mock_create_scraper.return_value = mock_scraper
+
+        self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+
+    @patch("cloudscraper.create_scraper")
+    def test_cloudscraper_configuration(self, mock_create_scraper):
+        """Test CloudScraper is configured with proper parameters."""
+        # Set scraper to None so we create a new one
+        self.downloader.set_scraper(None)
+        mock_scraper = Mock()
+        mock_create_scraper.return_value = mock_scraper
+
+        scraper = self.downloader.get_scraper_public()
+
+        mock_create_scraper.assert_called_once_with(
+            browser={"custom": "Mozilla/5.0 (compatible; test-agent)"}, delay=15
+        )
+        self.assertEqual(scraper, mock_scraper)
+
+    @patch("tempfile.NamedTemporaryFile")
+    def test_landing_page_visit_before_pdf_download(self, mock_tempfile):
+        """Test that landing page is visited before PDF download for CloudFlare bypass."""
+        mock_scraper = Mock()
+        self.downloader.set_scraper(mock_scraper)
+
+        # Mock responses
+        mock_landing_response = Mock()
+        mock_landing_response.raise_for_status = Mock()
+
+        mock_pdf_response = Mock()
+        mock_pdf_response.raise_for_status = Mock()
+        mock_pdf_response.iter_content.return_value = [b"PDF content"]
+        mock_pdf_response.headers = {}
+
+        mock_scraper.get.side_effect = [mock_landing_response, mock_pdf_response]
+
+        # Mock temp file
+        mock_temp_file = Mock()
+        mock_temp_file.name = "/tmp/test.pdf"
+        mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+        mock_temp_file.__exit__ = Mock(return_value=None)
+        mock_tempfile.return_value = mock_temp_file
+
+        pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+        self.downloader.download_pdf_to_temp(pdf_url, "10.1101/2023.01.01.123456")
+
+        # Verify landing page was visited first
+        landing_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+
+        calls = mock_scraper.get.call_args_list
+        self.assertEqual(len(calls), 2)
+
+        # First call should be to landing page
+        self.assertEqual(calls[0][0][0], landing_url)
+        self.assertEqual(calls[0][1]["timeout"], 30)
+
+        # Second call should be to PDF URL
+        self.assertEqual(calls[1][0][0], pdf_url)
+        self.assertEqual(calls[1][1]["timeout"], 30)
+        self.assertEqual(calls[1][1]["stream"], True)