aiagents4pharma-1.42.0-py3-none-any.whl → aiagents4pharma-1.44.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +17 -2
  2. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +618 -413
  3. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +362 -25
  4. aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +146 -109
  5. aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +240 -83
  6. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  7. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  8. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  12. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  13. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  14. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  15. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  16. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  17. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  18. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  19. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  20. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  21. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  22. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  24. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  29. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  30. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  31. {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/METADATA +3 -1
  32. {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/RECORD +36 -33
  33. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  34. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  35. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  36. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  39. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  41. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  42. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  44. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  45. {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/WHEEL +0 -0
  46. {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/licenses/LICENSE +0 -0
  47. {aiagents4pharma-1.42.0.dist-info → aiagents4pharma-1.44.0.dist-info}/top_level.txt +0 -0
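The dominant change in this release is a refactor of the paper download tooling: the separate download_arxiv_input.py, download_biorxiv_input.py, and download_medrxiv_input.py tools (with their per-tool configs and tests) are removed in favor of a unified paper_downloader.py backed by service-specific helpers under tools/paper_download/utils/ (arXiv, bioRxiv, medRxiv, and a new PubMed downloader), all sharing base_paper_downloader.py. The diff below shows the new bioRxiv test suite. As a rough orientation, the shared base class appears to follow a template-method pattern; the sketch below is inferred from the method names the tests exercise (fetch_metadata, construct_pdf_url, download_pdf_to_temp, extract_paper_metadata), and is illustrative rather than the package's actual source.

```python
# Hypothetical sketch of the shared downloader interface, inferred from the
# methods the new tests exercise. The real base_paper_downloader.py may
# differ in names and details (it also provides helpers such as
# get_service_name, get_identifier_name, and get_default_filename).
from abc import ABC, abstractmethod


class BasePaperDownloader(ABC):
    """Template for service-specific downloaders (arXiv, bioRxiv, medRxiv, PubMed)."""

    def __init__(self, config):
        self.api_url = config.api_url
        self.request_timeout = config.request_timeout

    @abstractmethod
    def fetch_metadata(self, identifier: str) -> dict:
        """Query the service API for a paper's metadata."""

    @abstractmethod
    def construct_pdf_url(self, metadata: dict, identifier: str) -> str:
        """Build the full-text PDF URL from the fetched metadata."""

    @abstractmethod
    def download_pdf_to_temp(self, pdf_url: str, identifier: str):
        """Download the PDF; return (temp_file_path, filename) or None on failure."""

    @abstractmethod
    def extract_paper_metadata(self, metadata: dict, identifier: str, pdf_result) -> dict:
        """Normalize service metadata plus the download outcome into one entry."""

    def process(self, identifier: str) -> dict:
        """Shared workflow: metadata -> PDF URL -> download -> normalized entry."""
        metadata = self.fetch_metadata(identifier)
        pdf_url = self.construct_pdf_url(metadata, identifier)
        pdf_result = self.download_pdf_to_temp(pdf_url, identifier)
        return self.extract_paper_metadata(metadata, identifier, pdf_result)
```

Consolidating the per-service tools behind one interface is presumably what lets the single paper_downloader.py tool dispatch to any of the four servers; the integration test in the diff below exercises exactly this four-step sequence against the bioRxiv helper.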
@@ -0,0 +1,697 @@
+ """
+ Unit tests for BiorxivDownloader.
+ Tests CloudScraper integration, JSON API interaction, and PDF download with CloudFlare protection.
+ """
+
+ import unittest
+ from unittest.mock import Mock, patch
+
+ import requests
+
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.biorxiv_downloader import (
+     BiorxivDownloader,
+ )
+
+
+ class BiorxivDownloaderTestShim(BiorxivDownloader):
+     """biorxiv_downloader test shim to expose protected methods."""
+
+     __test__ = False
+
+     def set_scraper(self, scraper):
+         """set_scraper is a public method to set the scraper."""
+         self._scraper = scraper
+
+     def get_scraper_public(self):
+         """get_scraper_public is a public method to access the scraper."""
+         return self._get_scraper()
+
+     def visit_landing_page_public(self, scraper, pdf_url, identifier):
+         """call visit_landing_page with public access."""
+         return self._visit_landing_page(scraper, pdf_url, identifier)
+
+     def save_pdf_to_temp_public(self, response):
+         """save_pdf_to_temp_public is a public method to save PDF response."""
+         return self._save_pdf_to_temp(response)
+
+     def extract_filename_public(self, response, identifier):
+         """extract_filename_public is a public method to extract filename from response."""
+         return self._extract_filename(response, identifier)
+
+     def extract_basic_metadata_public(self, paper, identifier):
+         """extract_basic_metadata_public is a public method to extract basic metadata."""
+         return self._extract_basic_metadata(paper, identifier)
+
+     def extract_authors_public(self, authors_str):
+         """extract_authors_public is a public method to extract authors from a string."""
+         return self._extract_authors(authors_str)
+
+     def get_paper_identifier_info_public(self, paper):
+         """get_paper_identifier_info_public is a public method to get paper identifier info."""
+         return self._get_paper_identifier_info(paper)
+
+     def add_service_identifier_public(self, entry, identifier):
+         """add_service_identifier_public is a public method to add service identifier."""
+         self._add_service_identifier(entry, identifier)
+
+
+ class TestBiorxivDownloader(unittest.TestCase):
+     """Tests for the BiorxivDownloader class."""
+
+     @patch("cloudscraper.create_scraper")
+     def setUp(self, mock_create_scraper):
+         """Set up test fixtures."""
+         self.mock_config = Mock()
+         self.mock_config.api_url = "https://api.biorxiv.org/details"
+         self.mock_config.pdf_url_template = (
+             "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+         )
+         self.mock_config.user_agent = "test-agent"
+         self.mock_config.cf_clearance_timeout = 10
+         self.mock_config.request_timeout = 30
+         self.mock_config.chunk_size = 8192
+         self.mock_config.session_reuse = True
+         self.mock_config.default_version = "1"
+         self.mock_config.browser_config = {"type": "custom"}
+
+         # Mock the scraper creation during initialization
+         mock_scraper = Mock()
+         mock_create_scraper.return_value = mock_scraper
+
+         self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+         self.initial_scraper = mock_scraper
+         self.downloader.set_scraper(mock_scraper)
+
+         # Sample bioRxiv API response
+         self.sample_json_response = {
+             "collection": [
+                 {
+                     "title": "Test BioRxiv Paper",
+                     "authors": "John Doe; Jane Smith",
+                     "abstract": "This is a test abstract for bioRxiv paper.",
+                     "date": "2023-01-01",
+                     "category": "Biochemistry",
+                     "version": "2",
+                     "doi": "10.1101/2023.01.01.123456",
+                 }
+             ]
+         }
+
+     def test_initialization(self):
+         """Test BiorxivDownloader initialization."""
+         self.assertEqual(self.downloader.api_url, "https://api.biorxiv.org/details")
+         self.assertEqual(
+             self.downloader.pdf_url_template,
+             "https://www.biorxiv.org/content/{doi}v{version}.full.pdf",
+         )
+         self.assertEqual(self.downloader.user_agent, "test-agent")
+         self.assertEqual(self.downloader.cf_clearance_timeout, 10)
+         self.assertIsNotNone(self.downloader.get_scraper_public())
+
+     def test_fetch_metadata_success(self):
+         """Test successful metadata fetching from bioRxiv API."""
+         mock_scraper = Mock()
+         mock_response = Mock()
+         mock_response.json.return_value = self.sample_json_response
+         mock_response.raise_for_status = Mock()
+         mock_scraper.get.return_value = mock_response
+
+         # Mock the existing scraper
+         self.downloader.set_scraper(mock_scraper)
+
+         result = self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+         # Verify API call
+         expected_url = (
+             "https://api.biorxiv.org/details/biorxiv/10.1101/2023.01.01.123456/na/json"
+         )
+         mock_scraper.get.assert_called_once_with(expected_url, timeout=30)
+         mock_response.raise_for_status.assert_called_once()
+
+         # Verify JSON parsing
+         self.assertEqual(result, self.sample_json_response)
+
+     def test_fetch_metadata_network_error(self):
+         """Test fetch_metadata with network error."""
+         mock_scraper = Mock()
+         mock_scraper.get.side_effect = requests.RequestException("Network error")
+         self.downloader.set_scraper(mock_scraper)
+
+         with self.assertRaises(requests.RequestException):
+             self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+     def test_fetch_metadata_no_collection_data(self):
+         """Test fetch_metadata when API response has no collection data."""
+         mock_scraper = Mock()
+         mock_response = Mock()
+         mock_response.json.return_value = {}  # Empty response
+         mock_response.raise_for_status = Mock()
+         mock_scraper.get.return_value = mock_response
+         self.downloader.set_scraper(mock_scraper)
+
+         with self.assertRaises(RuntimeError) as context:
+             self.downloader.fetch_metadata("10.1101/2023.01.01.123456")
+
+         self.assertIn("No collection data found", str(context.exception))
+
+     def test_construct_pdf_url_variants(self):
+         """PDF URL construction: normal, missing collection, default version."""
+         # Success
+         self.assertEqual(
+             self.downloader.construct_pdf_url(
+                 self.sample_json_response, "10.1101/2023.01.01.123456"
+             ),
+             "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v2.full.pdf",
+         )
+         # No collection
+         self.assertEqual(
+             self.downloader.construct_pdf_url({}, "10.1101/2023.01.01.123456"),
+             "",
+         )
+         # Default version
+         meta_default = {"collection": [{"title": "Test Paper"}]}
+         self.assertEqual(
+             self.downloader.construct_pdf_url(
+                 meta_default, "10.1101/2023.01.01.123456"
+             ),
+             "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
+         )
+
+     @patch("tempfile.NamedTemporaryFile")
+     def test_download_pdf_to_temp_success(self, mock_tempfile):
+         """Test successful PDF download with CloudScraper."""
+         # Setup mock scraper
+         mock_scraper = Mock()
+         self.downloader.set_scraper(mock_scraper)
+
+         # Mock landing page response
+         mock_landing_response = Mock()
+         mock_landing_response.raise_for_status = Mock()
+
+         # Mock PDF download response
+         mock_pdf_response = Mock()
+         mock_pdf_response.raise_for_status = Mock()
+         mock_pdf_response.iter_content.return_value = [
+             b"PDF content chunk 1",
+             b"PDF content chunk 2",
+         ]
+         mock_pdf_response.headers = {
+             "Content-Disposition": 'attachment; filename="paper.pdf"'
+         }
+
+         mock_scraper.get.side_effect = [mock_landing_response, mock_pdf_response]
+
+         # Mock temporary file
+         mock_temp_file = Mock()
+         mock_temp_file.name = "/tmp/test.pdf"
+         mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+         mock_temp_file.__exit__ = Mock(return_value=None)
+         mock_tempfile.return_value = mock_temp_file
+
+         pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+         result = self.downloader.download_pdf_to_temp(
+             pdf_url, "10.1101/2023.01.01.123456"
+         )
+
+         # Verify result
+         self.assertEqual(result, ("/tmp/test.pdf", "paper.pdf"))
+
+         # Verify landing page visit
+         landing_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+         mock_scraper.get.assert_any_call(landing_url, timeout=30)
+
+         # Verify PDF download
+         mock_scraper.get.assert_any_call(pdf_url, timeout=30, stream=True)
+
+         # Verify file writing
+         mock_temp_file.write.assert_any_call(b"PDF content chunk 1")
+         mock_temp_file.write.assert_any_call(b"PDF content chunk 2")
+
+     def test_download_pdf_to_temp_error_variants(self):
+         """Download errors: empty URL and network failure."""
+         # Empty URL
+         self.assertIsNone(self.downloader.download_pdf_to_temp("", "10.1101/x"))
+
+         # Network error
+         mock_scraper = Mock()
+         mock_scraper.get.side_effect = requests.RequestException("Network error")
+         self.downloader.set_scraper(mock_scraper)
+         url = "https://www.biorxiv.org/content/10.1101/xv1.full.pdf"
+         self.assertIsNone(self.downloader.download_pdf_to_temp(url, "10.1101/x"))
+
+     @patch("cloudscraper.create_scraper")
+     def test_get_scraper_new_and_existing(self, mock_create):
+         """_get_scraper creates when missing and reuses when present."""
+         # New scraper
+         self.downloader.set_scraper(None)
+         new_scraper = Mock()
+         mock_create.return_value = new_scraper
+         got = self.downloader.get_scraper_public()
+         self.assertIs(got, new_scraper)
+         mock_create.assert_called_once_with(browser={"custom": "test-agent"}, delay=10)
+
+         # Existing scraper
+         self.downloader.set_scraper(new_scraper)
+         got2 = self.downloader.get_scraper_public()
+         self.assertIs(got2, new_scraper)
+
+     def test_visit_landing_page_variants(self):
+         """Landing page visit happens only for .full.pdf URLs."""
+         mock_scraper = Mock()
+         ok = Mock()
+         ok.raise_for_status = Mock()
+         mock_scraper.get.return_value = ok
+
+         # Case 1: with .full.pdf -> should visit landing
+         pdf_url_full = (
+             "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+         )
+         self.downloader.visit_landing_page_public(
+             mock_scraper, pdf_url_full, "10.1101/2023.01.01.123456"
+         )
+         expected = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+         mock_scraper.get.assert_called_with(expected, timeout=30)
+
+         # Case 2: no .full.pdf -> no call
+         mock_scraper.get.reset_mock()
+         pdf_url_plain = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+         self.downloader.visit_landing_page_public(
+             mock_scraper, pdf_url_plain, "10.1101/2023.01.01.123456"
+         )
+         mock_scraper.get.assert_not_called()
+
+     @patch("tempfile.NamedTemporaryFile")
+     def test_save_pdf_to_temp(self, mock_tempfile):
+         """Test saving PDF response to temporary file."""
+         mock_response = Mock()
+         mock_response.iter_content.return_value = [
+             b"chunk1",
+             b"chunk2",
+             None,
+             b"chunk3",
+         ]  # Include None chunk
+
+         mock_temp_file = Mock()
+         mock_temp_file.name = "/tmp/saved.pdf"
+         mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+         mock_temp_file.__exit__ = Mock(return_value=None)
+         mock_tempfile.return_value = mock_temp_file
+
+         result = self.downloader.save_pdf_to_temp_public(mock_response)
+
+         self.assertEqual(result, "/tmp/saved.pdf")
+
+         # Verify chunks were written (None chunk should be skipped)
+         mock_temp_file.write.assert_any_call(b"chunk1")
+         mock_temp_file.write.assert_any_call(b"chunk2")
+         mock_temp_file.write.assert_any_call(b"chunk3")
+         self.assertEqual(mock_temp_file.write.call_count, 3)
+
+     def test_extract_filename_variants(self):
+         """Filename extraction across header variants and regex-exception path."""
+         cases = [
+             (
+                 {"Content-Disposition": 'attachment; filename="test-paper.pdf"'},
+                 "test-paper.pdf",
+                 False,
+             ),
+             ({}, "default.pdf", False),
+             ({"Content-Disposition": "invalid header format"}, "default.pdf", False),
+             (
+                 {"Content-Disposition": 'attachment; filename="test.pdf"'},
+                 "default.pdf",
+                 True,
+             ),  # trigger exception path
+         ]
+         for headers, expected, raise_regex in cases:
+             with self.subTest(
+                 headers=headers, expected=expected, raise_regex=raise_regex
+             ):
+                 resp = Mock()
+                 resp.headers = headers
+                 if raise_regex:
+                     with patch(
+                         "re.search",
+                         side_effect=requests.RequestException("Regex error"),
+                     ):
+                         with patch.object(
+                             self.downloader,
+                             "get_default_filename",
+                             return_value="default.pdf",
+                         ):
+                             got = self.downloader.extract_filename_public(
+                                 resp, "10.1101/test"
+                             )
+                 else:
+                     with patch.object(
+                         self.downloader,
+                         "get_default_filename",
+                         return_value="default.pdf",
+                     ):
+                         got = self.downloader.extract_filename_public(
+                             resp, "10.1101/test"
+                         )
+                 self.assertEqual(got, expected)
+
+     def test_extract_paper_metadata_success(self):
+         """Test successful paper metadata extraction."""
+         metadata = self.sample_json_response
+         pdf_result = ("/tmp/paper.pdf", "biorxiv_paper.pdf")
+
+         result = self.downloader.extract_paper_metadata(
+             metadata, "10.1101/2023.01.01.123456", pdf_result
+         )
+
+         expected = {
+             "Title": "Test BioRxiv Paper",
+             "Authors": ["John Doe", "Jane Smith"],
+             "Abstract": "This is a test abstract for bioRxiv paper.",
+             "Publication Date": "2023-01-01",
+             "DOI": "10.1101/2023.01.01.123456",
+             "Category": "Biochemistry",
+             "Version": "2",
+             "source": "biorxiv",
+             "server": "biorxiv",
+             "URL": "/tmp/paper.pdf",
+             "pdf_url": "/tmp/paper.pdf",
+             "filename": "biorxiv_paper.pdf",
+             "access_type": "open_access_downloaded",
+             "temp_file_path": "/tmp/paper.pdf",
+         }
+
+         self.assertEqual(result, expected)
+
+     def test_extract_paper_metadata_no_pdf_result(self):
+         """Test metadata extraction when PDF download failed."""
+         metadata = self.sample_json_response
+         pdf_result = None  # No PDF download result
+
+         result = self.downloader.extract_paper_metadata(
+             metadata, "10.1101/2023.01.01.123456", pdf_result
+         )
+
+         # Should still have basic metadata but with download_failed access type
+         self.assertEqual(result["Title"], "Test BioRxiv Paper")
+         self.assertEqual(result["access_type"], "download_failed")
+         self.assertEqual(result["URL"], "")
+         self.assertEqual(result["pdf_url"], "")
+         self.assertEqual(result["temp_file_path"], "")
+         self.assertEqual(
+             result["filename"], "10_1101_2023_01_01_123456.pdf"
+         )  # Default filename
+
+     def test_extract_paper_metadata_no_collection(self):
+         """Test metadata extraction with missing collection."""
+         metadata = {}
+
+         with self.assertRaises(RuntimeError) as context:
+             self.downloader.extract_paper_metadata(
+                 metadata, "10.1101/2023.01.01.123456", None
+             )
+
+         self.assertIn("No collection data found", str(context.exception))
+
+     def test_extract_basic_metadata(self):
+         """Test basic metadata extraction helper method."""
+         paper = self.sample_json_response["collection"][0]
+
+         result = self.downloader.extract_basic_metadata_public(
+             paper, "10.1101/2023.01.01.123456"
+         )
+
+         expected = {
+             "Title": "Test BioRxiv Paper",
+             "Authors": ["John Doe", "Jane Smith"],
+             "Abstract": "This is a test abstract for bioRxiv paper.",
+             "Publication Date": "2023-01-01",
+             "DOI": "10.1101/2023.01.01.123456",
+             "Category": "Biochemistry",
+             "Version": "2",
+             "source": "biorxiv",
+             "server": "biorxiv",
+         }
+
+         self.assertEqual(result, expected)
+
+     def test_extract_authors_variants(self):
+         """Author parsing for semicolon list and empty string."""
+         self.assertEqual(
+             self.downloader.extract_authors_public("John Doe; Jane Smith; Bob Johnson"),
+             ["John Doe", "Jane Smith", "Bob Johnson"],
+         )
+         self.assertEqual(self.downloader.extract_authors_public(""), [])
+
+     def test_service_and_identifier_helpers(self):
+         """Service name, identifier name, and default filename."""
+         self.assertEqual(self.downloader.get_service_name(), "bioRxiv")
+         self.assertEqual(self.downloader.get_identifier_name(), "DOI")
+         self.assertEqual(
+             self.downloader.get_default_filename("10.1101/2023.01.01.123456"),
+             "10_1101_2023_01_01_123456.pdf",
+         )
+
+     def test_get_paper_identifier_info(self):
+         """Test _get_paper_identifier_info method."""
+         paper = {
+             "DOI": "10.1101/2023.01.01.123456",
+             "Publication Date": "2023-01-01",
+             "Category": "Biology",
+         }
+
+         result = self.downloader.get_paper_identifier_info_public(paper)
+
+         self.assertIn("10.1101/2023.01.01.123456", result)
+         self.assertIn("2023-01-01", result)
+         self.assertIn("Biology", result)
+
+     def test_add_service_identifier(self):
+         """Test _add_service_identifier method."""
+         entry = {}
+
+         self.downloader.add_service_identifier_public(
+             entry, "10.1101/2023.01.01.123456"
+         )
+
+         self.assertEqual(entry["DOI"], "10.1101/2023.01.01.123456")
+         self.assertEqual(entry["server"], "biorxiv")
+
+
+ class TestBiorxivDownloaderIntegration(unittest.TestCase):
+     """Integration tests for BiorxivDownloader workflow."""
+
+     @patch("cloudscraper.create_scraper")
+     def setUp(self, mock_create_scraper):
+         """Set up integration test fixtures."""
+         self.mock_config = Mock()
+         self.mock_config.api_url = "https://api.biorxiv.org/details"
+         self.mock_config.pdf_url_template = (
+             "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+         )
+         self.mock_config.user_agent = "test-agent"
+         self.mock_config.cf_clearance_timeout = 10
+         self.mock_config.request_timeout = 30
+         self.mock_config.chunk_size = 8192
+         self.mock_config.session_reuse = True
+         self.mock_config.default_version = "1"
+         self.mock_config.browser_config = {"type": "custom"}
+
+         # Mock the scraper creation during initialization
+         mock_scraper = Mock()
+         mock_create_scraper.return_value = mock_scraper
+
+         self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+
+         self.sample_response = {
+             "collection": [
+                 {
+                     "title": "Integration Test Paper",
+                     "authors": "Test Author",
+                     "abstract": "Integration test abstract.",
+                     "date": "2023-01-01",
+                     "category": "Biology",
+                     "version": "1",
+                     "doi": "10.1101/2023.01.01.123456",
+                 }
+             ]
+         }
+
+     @patch("tempfile.NamedTemporaryFile")
+     def test_full_paper_processing_workflow(self, mock_tempfile):
+         """Test the complete workflow from DOI to processed paper data."""
+         # Mock scraper responses
+         mock_scraper = Mock()
+         mock_metadata_response = Mock()
+         mock_metadata_response.json.return_value = self.sample_response
+         mock_metadata_response.raise_for_status = Mock()
+
+         # Mock landing page and PDF responses for download
+         mock_landing_response = Mock()
+         mock_landing_response.raise_for_status = Mock()
+
+         mock_pdf_response = Mock()
+         mock_pdf_response.raise_for_status = Mock()
+         mock_pdf_response.iter_content.return_value = [b"PDF data"]
+         mock_pdf_response.headers = {}
+
+         # First call for metadata, then landing page, then PDF download
+         mock_scraper.get.side_effect = [
+             mock_metadata_response,
+             mock_landing_response,
+             mock_pdf_response,
+         ]
+         self.downloader.set_scraper(mock_scraper)
+
+         # Mock temporary file
+         mock_temp_file = Mock()
+         mock_temp_file.name = "/tmp/integration.pdf"
+         mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+         mock_temp_file.__exit__ = Mock(return_value=None)
+         mock_tempfile.return_value = mock_temp_file
+
+         # Simulate the workflow
+         identifier = "10.1101/2023.01.01.123456"
+
+         # Step 1: Fetch metadata
+         metadata = self.downloader.fetch_metadata(identifier)
+
+         # Step 2: Construct PDF URL
+         pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
+
+         # Step 3: Download PDF
+         pdf_result = self.downloader.download_pdf_to_temp(pdf_url, identifier)
+
+         # Step 4: Extract metadata
+         paper_data = self.downloader.extract_paper_metadata(
+             metadata, identifier, pdf_result
+         )
+
+         # Verify the complete workflow
+         self.assertEqual(paper_data["Title"], "Integration Test Paper")
+         self.assertEqual(paper_data["Authors"], ["Test Author"])
+         self.assertEqual(paper_data["access_type"], "open_access_downloaded")
+         self.assertEqual(paper_data["temp_file_path"], "/tmp/integration.pdf")
+
+         expected_pdf_url = (
+             "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+         )
+         self.assertEqual(pdf_url, expected_pdf_url)
+
+         # Verify 3 calls: metadata, landing page, PDF
+         self.assertEqual(mock_scraper.get.call_count, 3)
+
+     def test_workflow_with_existing_scraper(self):
+         """Test workflow reusing existing scraper instance."""
+         # Set existing scraper
+         existing_scraper = Mock()
+
+         # Mock API response for metadata
+         mock_response = Mock()
+         mock_response.json.return_value = self.sample_response
+         mock_response.raise_for_status = Mock()
+         existing_scraper.get.return_value = mock_response
+
+         self.downloader.set_scraper(existing_scraper)
+
+         identifier = "10.1101/2023.01.01.123456"
+         metadata = self.downloader.fetch_metadata(identifier)
+         pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
+
+         # Try to download (will use existing scraper)
+         with patch("tempfile.NamedTemporaryFile"):
+             # Reset the mock and set up responses for landing + PDF
+             existing_scraper.reset_mock()
+             mock_landing = Mock()
+             mock_landing.raise_for_status = Mock()
+             mock_pdf = Mock()
+             mock_pdf.raise_for_status = Mock()
+             mock_pdf.iter_content.return_value = [b"data"]
+             mock_pdf.headers = {}
+             existing_scraper.get.side_effect = [mock_landing, mock_pdf]
+
+             self.downloader.download_pdf_to_temp(pdf_url, identifier)
+
+             # Should have used existing scraper for landing + PDF (2 calls)
+             self.assertEqual(existing_scraper.get.call_count, 2)
+
+
+ class TestBiorxivCloudFlareHandling(unittest.TestCase):
+     """Tests specific to CloudFlare protection handling."""
+
+     @patch("cloudscraper.create_scraper")
+     def setUp(self, mock_create_scraper):
+         """Set up CloudFlare handling test fixtures."""
+         self.mock_config = Mock()
+         self.mock_config.api_url = "https://api.biorxiv.org/details"
+         self.mock_config.pdf_url_template = (
+             "https://www.biorxiv.org/content/{doi}v{version}.full.pdf"
+         )
+         self.mock_config.user_agent = "Mozilla/5.0 (compatible; test-agent)"
+         self.mock_config.cf_clearance_timeout = 15
+         self.mock_config.request_timeout = 30
+         self.mock_config.chunk_size = 8192
+         self.mock_config.session_reuse = True
+         self.mock_config.default_version = "1"
+         self.mock_config.browser_config = {"type": "custom"}
+
+         # Mock the scraper creation during initialization
+         mock_scraper = Mock()
+         mock_create_scraper.return_value = mock_scraper
+
+         self.downloader = BiorxivDownloaderTestShim(self.mock_config)
+
+     @patch("cloudscraper.create_scraper")
+     def test_cloudscraper_configuration(self, mock_create_scraper):
+         """Test CloudScraper is configured with proper parameters."""
+         # Set scraper to None so we create a new one
+         self.downloader.set_scraper(None)
+         mock_scraper = Mock()
+         mock_create_scraper.return_value = mock_scraper
+
+         scraper = self.downloader.get_scraper_public()
+
+         mock_create_scraper.assert_called_once_with(
+             browser={"custom": "Mozilla/5.0 (compatible; test-agent)"}, delay=15
+         )
+         self.assertEqual(scraper, mock_scraper)
+
+     @patch("tempfile.NamedTemporaryFile")
+     def test_landing_page_visit_before_pdf_download(self, mock_tempfile):
+         """Test that landing page is visited before PDF download for CloudFlare bypass."""
+         mock_scraper = Mock()
+         self.downloader.set_scraper(mock_scraper)
+
+         # Mock responses
+         mock_landing_response = Mock()
+         mock_landing_response.raise_for_status = Mock()
+
+         mock_pdf_response = Mock()
+         mock_pdf_response.raise_for_status = Mock()
+         mock_pdf_response.iter_content.return_value = [b"PDF content"]
+         mock_pdf_response.headers = {}
+
+         mock_scraper.get.side_effect = [mock_landing_response, mock_pdf_response]
+
+         # Mock temp file
+         mock_temp_file = Mock()
+         mock_temp_file.name = "/tmp/test.pdf"
+         mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
+         mock_temp_file.__exit__ = Mock(return_value=None)
+         mock_tempfile.return_value = mock_temp_file
+
+         pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"
+         self.downloader.download_pdf_to_temp(pdf_url, "10.1101/2023.01.01.123456")
+
+         # Verify landing page was visited first
+         landing_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1"
+
+         calls = mock_scraper.get.call_args_list
+         self.assertEqual(len(calls), 2)
+
+         # First call should be to landing page
+         self.assertEqual(calls[0][0][0], landing_url)
+         self.assertEqual(calls[0][1]["timeout"], 30)
+
+         # Second call should be to PDF URL
+         self.assertEqual(calls[1][0][0], pdf_url)
+         self.assertEqual(calls[1][1]["timeout"], 30)
+         self.assertEqual(calls[1][1]["stream"], True)
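Taken together, these tests pin down a three-step Cloudflare-aware flow: create a cloudscraper session with a custom user agent and a challenge delay, visit the article landing page (the .full.pdf URL minus its suffix) so clearance cookies are established, then stream the PDF over the same session. A minimal standalone sketch of that flow, assuming the cloudscraper call signature the tests assert on and reusing the fixture URLs above:

```python
# Minimal sketch of the Cloudflare-aware download flow the tests assert on.
# Assumes the cloudscraper package is installed; URLs and config values
# mirror the test fixtures, not any real paper.
import tempfile

import cloudscraper

# browser={"custom": ...} sets the User-Agent; delay is the challenge wait.
scraper = cloudscraper.create_scraper(browser={"custom": "test-agent"}, delay=10)

pdf_url = "https://www.biorxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf"

# 1. Visit the landing page first so Cloudflare clearance cookies are set.
landing_url = pdf_url.replace(".full.pdf", "")
scraper.get(landing_url, timeout=30).raise_for_status()

# 2. Stream the PDF with the same (now-cleared) session.
response = scraper.get(pdf_url, timeout=30, stream=True)
response.raise_for_status()

# 3. Write the PDF to a temp file, skipping keep-alive (empty) chunks.
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
    for chunk in response.iter_content(chunk_size=8192):
        if chunk:
            temp_file.write(chunk)
    print(f"Saved PDF to {temp_file.name}")
```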