aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,534 @@
1
+ """
2
+ Unit tests for MedrxivDownloader.
3
+ Tests JSON API interaction, PDF URL construction, and metadata extraction.
4
+ """
5
+
6
+ import json
7
+ import unittest
8
+ from unittest.mock import Mock, patch
9
+
10
+ import requests
11
+
12
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.medrxiv_downloader import (
13
+ MedrxivDownloader,
14
+ )
15
+
16
+
17
+ # ---- Test-only shim to access protected helpers without pylint W0212 ----
class MedrxivDownloaderTestShim(MedrxivDownloader):
    """Subclass of MedrxivDownloader that re-exports protected helpers publicly.

    Each ``*_public`` method forwards its arguments unchanged to the matching
    underscore-prefixed helper, so tests can call the helper without touching
    a protected name directly.
    """

    # This is a helper, not a test case; keep pytest from collecting it.
    __test__ = False

    def extract_basic_metadata_public(self, paper, identifier):
        """Forward to ``_extract_basic_metadata`` and return its result."""
        basic = self._extract_basic_metadata(paper, identifier)
        return basic

    def extract_authors_public(self, authors_str):
        """Forward to ``_extract_authors`` and return its result."""
        authors = self._extract_authors(authors_str)
        return authors

    def extract_pdf_metadata_public(self, pdf_result, identifier):
        """Forward to ``_extract_pdf_metadata`` and return its result."""
        pdf_fields = self._extract_pdf_metadata(pdf_result, identifier)
        return pdf_fields

    def get_paper_identifier_info_public(self, paper):
        """Forward to ``_get_paper_identifier_info`` and return its result."""
        info = self._get_paper_identifier_info(paper)
        return info

    def add_service_identifier_public(self, entry, identifier):
        """Forward to ``_add_service_identifier``; nothing is returned."""
        self._add_service_identifier(entry, identifier)
42
+
43
+
class TestMedrxivDownloader(unittest.TestCase):
    """Unit tests for MedrxivDownloader driven by a mocked config and HTTP layer."""

    def setUp(self):
        """Create the mocked config, the shimmed downloader, and a sample payload."""
        config = Mock()
        config.api_url = "https://api.medrxiv.org/details"
        config.request_timeout = 30
        config.chunk_size = 8192
        config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        config.default_version = "1"
        self.mock_config = config

        self.downloader = MedrxivDownloaderTestShim(config)

        # Sample medRxiv API response: a single record inside "collection".
        record = {
            "title": "Test MedRxiv Paper",
            "authors": "John Doe; Jane Smith",
            "abstract": "This is a test abstract for medRxiv paper.",
            "date": "2023-01-01",
            "category": "Infectious Diseases",
            "version": "1",
            "doi": "10.1101/2023.01.01.123456",
        }
        self.sample_json_response = {"collection": [record]}

    def test_initialization(self):
        """The downloader exposes the values it was configured with."""
        configured = (
            ("api_url", "https://api.medrxiv.org/details"),
            ("request_timeout", 30),
            ("chunk_size", 8192),
        )
        for attribute, value in configured:
            self.assertEqual(getattr(self.downloader, attribute), value)

    @patch("requests.get")
    def test_fetch_metadata_success(self, mock_get):
        """fetch_metadata requests the details URL and returns the parsed JSON."""
        api_reply = Mock()
        api_reply.json.return_value = self.sample_json_response
        api_reply.raise_for_status = Mock()
        mock_get.return_value = api_reply

        fetched = self.downloader.fetch_metadata("10.1101/2023.01.01.123456")

        # The request URL must carry the /medrxiv/ server segment and /na/json tail.
        mock_get.assert_called_once_with(
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.123456/na/json",
            timeout=30,
        )
        api_reply.raise_for_status.assert_called_once()

        # The parsed payload is handed back untouched.
        self.assertEqual(fetched, self.sample_json_response)

    @patch("requests.get")
    def test_fetch_metadata_network_error(self, mock_get):
        """A RequestException raised by requests.get propagates to the caller."""
        mock_get.side_effect = requests.RequestException("Network error")

        self.assertRaises(
            requests.RequestException,
            self.downloader.fetch_metadata,
            "10.1101/2023.01.01.123456",
        )

    @patch("requests.get")
    def test_fetch_metadata_json_decode_error(self, mock_get):
        """A JSONDecodeError raised while parsing the body propagates to the caller."""
        broken_reply = Mock()
        broken_reply.json.side_effect = json.JSONDecodeError("Invalid JSON", "", 0)
        broken_reply.raise_for_status = Mock()
        mock_get.return_value = broken_reply

        self.assertRaises(
            json.JSONDecodeError,
            self.downloader.fetch_metadata,
            "10.1101/2023.01.01.123456",
        )

    def test_construct_pdf_url_variants(self):
        """PDF URL construction: normal, missing/empty collection, custom version."""
        doi = "10.1101/2023.01.01.123456"
        scenarios = [
            # Success
            (
                self.sample_json_response,
                "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v1.full.pdf",
            ),
            # No collection
            ({}, ""),
            # Empty collection
            ({"collection": []}, ""),
            # Custom version
            (
                {"collection": [{"title": "Test Paper", "version": "3"}]},
                "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v3.full.pdf",
            ),
        ]
        for api_payload, wanted_url in scenarios:
            self.assertEqual(
                self.downloader.construct_pdf_url(api_payload, doi), wanted_url
            )

    def test_extract_paper_metadata_success(self):
        """Metadata extraction merges the API record with the download result."""
        wanted = {
            "Title": "Test MedRxiv Paper",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for medRxiv paper.",
            "Publication Date": "2023-01-01",
            "DOI": "10.1101/2023.01.01.123456",
            "Category": "Infectious Diseases",
            "Version": "1",
            "source": "medrxiv",
            "server": "medrxiv",
            "URL": "/tmp/paper.pdf",
            "pdf_url": "/tmp/paper.pdf",
            "filename": "medrxiv_paper.pdf",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/paper.pdf",
        }
        download = ("/tmp/paper.pdf", "medrxiv_paper.pdf")

        extracted = self.downloader.extract_paper_metadata(
            self.sample_json_response, "10.1101/2023.01.01.123456", download
        )

        self.assertEqual(extracted, wanted)

    def test_extract_paper_metadata_no_pdf(self):
        """Without a download result the entry is marked as a failed download."""
        stub_filename = patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        )
        with stub_filename:
            extracted = self.downloader.extract_paper_metadata(
                self.sample_json_response, "10.1101/2023.01.01.123456", None
            )

        wanted_fields = (
            ("Title", "Test MedRxiv Paper"),
            ("URL", ""),
            ("access_type", "download_failed"),
            ("filename", "default.pdf"),
        )
        for key, value in wanted_fields:
            self.assertEqual(extracted[key], value)

    def test_extract_paper_metadata_no_collection(self):
        """A payload without "collection" raises RuntimeError."""
        with self.assertRaises(RuntimeError) as raised:
            self.downloader.extract_paper_metadata(
                {}, "10.1101/2023.01.01.123456", None
            )

        message = str(raised.exception)
        self.assertIn("No collection data found", message)

    def test_extract_basic_metadata_variants(self):
        """Basic metadata extraction: complete and missing fields."""
        # Complete record
        complete_record = self.sample_json_response["collection"][0]
        wanted_complete = {
            "Title": "Test MedRxiv Paper",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for medRxiv paper.",
            "Publication Date": "2023-01-01",
            "DOI": "10.1101/2023.01.01.123456",
            "Category": "Infectious Diseases",
            "Version": "1",
            "source": "medrxiv",
            "server": "medrxiv",
        }
        self.assertEqual(
            self.downloader.extract_basic_metadata_public(
                complete_record, "10.1101/2023.01.01.123456"
            ),
            wanted_complete,
        )

        # Record with only a title: the other fields fall back to defaults.
        sparse = self.downloader.extract_basic_metadata_public(
            {"title": "Test Paper"}, "10.1101/test"
        )
        fallback_fields = (
            ("Title", "Test Paper"),
            ("Authors", []),
            ("Abstract", "N/A"),
            ("Category", "N/A"),
        )
        for key, value in fallback_fields:
            self.assertEqual(sparse[key], value)

    def test_extract_authors_variants(self):
        """Author parsing from semicolon string, empty, and whitespace-heavy inputs."""
        scenarios = (
            (
                "John Doe; Jane Smith; Bob Johnson",
                ["John Doe", "Jane Smith", "Bob Johnson"],
            ),
            ("", []),
            (" John Doe ; Jane Smith ; ", ["John Doe", "Jane Smith"]),
        )
        for raw_authors, wanted in scenarios:
            self.assertEqual(self.downloader.extract_authors_public(raw_authors), wanted)

    def test_extract_pdf_metadata_variants(self):
        """PDF metadata: with and without download result."""
        # With a download result
        wanted_downloaded = {
            "URL": "/tmp/test.pdf",
            "pdf_url": "/tmp/test.pdf",
            "filename": "paper.pdf",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/test.pdf",
        }
        downloaded = self.downloader.extract_pdf_metadata_public(
            ("/tmp/test.pdf", "paper.pdf"), "10.1101/test"
        )
        self.assertEqual(downloaded, wanted_downloaded)

        # Without a download result
        stub_filename = patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        )
        with stub_filename:
            wanted_failed = {
                "URL": "",
                "pdf_url": "",
                "filename": "default.pdf",
                "access_type": "download_failed",
                "temp_file_path": "",
            }
            failed = self.downloader.extract_pdf_metadata_public(None, "10.1101/test")
            self.assertEqual(failed, wanted_failed)

    def test_service_and_identifier_helpers(self):
        """Service, identifier, and default filename helpers."""
        downloader = self.downloader
        self.assertEqual(downloader.get_service_name(), "medRxiv")
        self.assertEqual(downloader.get_identifier_name(), "DOI")
        default_name = downloader.get_default_filename("10.1101/2023.01.01.123456")
        self.assertEqual(default_name, "10_1101_2023_01_01_123456.pdf")

    def test_get_paper_identifier_info(self):
        """The identifier info text mentions the DOI, date, and category."""
        entry = {
            "DOI": "10.1101/2023.01.01.123456",
            "Publication Date": "2023-01-01",
            "Category": "Medicine",
        }

        info = self.downloader.get_paper_identifier_info_public(entry)

        for fragment in ("10.1101/2023.01.01.123456", "2023-01-01", "Medicine"):
            self.assertIn(fragment, info)

    def test_add_service_identifier(self):
        """_add_service_identifier writes the DOI and server keys into the entry."""
        target = {}
        self.downloader.add_service_identifier_public(
            target, "10.1101/2023.01.01.123456"
        )
        self.assertEqual(target["DOI"], "10.1101/2023.01.01.123456")
        self.assertEqual(target["server"], "medrxiv")
311
+
class TestMedrxivDownloaderIntegration(unittest.TestCase):
    """Workflow-level tests chaining the public MedrxivDownloader steps.

    HTTP access (``requests.get``) and, where relevant, the PDF download are
    mocked, so no network traffic occurs.
    """

    def setUp(self):
        """Create the mocked config, the shimmed downloader, and a sample payload."""
        self.mock_config = Mock()
        self.mock_config.api_url = "https://api.medrxiv.org/details"
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192
        self.mock_config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        self.mock_config.default_version = "1"

        self.downloader = MedrxivDownloaderTestShim(self.mock_config)

        self.sample_response = {
            "collection": [
                {
                    "title": "Integration Test Paper",
                    "authors": "Test Author",
                    "abstract": "Integration test abstract.",
                    "date": "2023-01-01",
                    "category": "Medicine",
                    "version": "2",
                    "doi": "10.1101/2023.01.01.123456",
                }
            ]
        }

    @patch(
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "medrxiv_downloader.MedrxivDownloader.download_pdf_to_temp"
    )
    @patch("requests.get")
    def test_full_paper_processing_workflow(self, mock_get, mock_download):
        """Test the complete workflow from DOI to processed paper data."""
        # Mock API response
        mock_response = Mock()
        mock_response.json.return_value = self.sample_response
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        # Mock PDF download
        mock_download.return_value = ("/tmp/paper.pdf", "medrxiv_paper.pdf")

        identifier = "10.1101/2023.01.01.123456"

        # Step 1: Fetch metadata
        metadata = self.downloader.fetch_metadata(identifier)

        # Step 2: Construct PDF URL
        pdf_url = self.downloader.construct_pdf_url(metadata, identifier)

        # Step 3: Download PDF (patched above, returns the canned tuple)
        pdf_result = self.downloader.download_pdf_to_temp(pdf_url, identifier)

        # Step 4: Extract metadata
        paper_data = self.downloader.extract_paper_metadata(
            metadata, identifier, pdf_result
        )

        # Verify the complete workflow
        self.assertEqual(paper_data["Title"], "Integration Test Paper")
        self.assertEqual(paper_data["Authors"], ["Test Author"])
        self.assertEqual(paper_data["access_type"], "open_access_downloaded")
        self.assertEqual(paper_data["filename"], "medrxiv_paper.pdf")
        self.assertEqual(paper_data["temp_file_path"], "/tmp/paper.pdf")

        # Verify method calls
        mock_get.assert_called_once_with(
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.123456/na/json",
            timeout=30,
        )
        # The sample record carries version "2", so the URL must end in v2.
        expected_pdf_url = (
            "https://www.medrxiv.org/content/10.1101/2023.01.01.123456v2.full.pdf"
        )
        mock_download.assert_called_once_with(expected_pdf_url, identifier)

    @patch("requests.get")
    def test_error_handling_workflow(self, mock_get):
        """A RequestException from the API call propagates out of fetch_metadata."""
        mock_get.side_effect = requests.RequestException("API error")

        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("10.1101/2023.01.01.123456")

    @patch("requests.get")
    def test_workflow_with_empty_collection(self, mock_get):
        """fetch_metadata raises RuntimeError when the API returns an empty collection."""
        mock_response = Mock()
        mock_response.json.return_value = {"collection": []}
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        identifier = "10.1101/2023.01.01.123456"

        # The failure surfaces at fetch time, before any later workflow step.
        with self.assertRaises(RuntimeError) as context:
            self.downloader.fetch_metadata(identifier)

        self.assertIn(
            "No collection data found in medRxiv API response", str(context.exception)
        )

    @patch("requests.get")
    def test_multiple_identifiers_workflow(self, mock_get):
        """Process two DOIs in sequence and check the per-DOI results and requests."""
        # One canned record per DOI; dict order is the processing order.
        records = {
            "10.1101/2023.01.01.111111": {
                "title": "Paper 1",
                "version": "1",
                "authors": "Author 1",
            },
            "10.1101/2023.01.01.222222": {
                "title": "Paper 2",
                "version": "2",
                "authors": "Author 2",
            },
        }

        mock_responses = []
        for record in records.values():
            mock_resp = Mock()
            mock_resp.json.return_value = {"collection": [record]}
            mock_resp.raise_for_status = Mock()
            mock_responses.append(mock_resp)

        # side_effect hands out one response per requests.get call, in order.
        mock_get.side_effect = mock_responses

        results = {}
        for identifier, record in records.items():
            metadata = self.downloader.fetch_metadata(identifier)

            # Assert the URL instead of discarding it, so the version taken
            # from each record is actually verified.
            pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
            self.assertEqual(
                pdf_url,
                self.mock_config.pdf_url_template.format(
                    identifier=identifier, version=record["version"]
                ),
            )

            results[identifier] = self.downloader.extract_paper_metadata(
                metadata, identifier, None
            )

        # Verify both papers were processed
        self.assertEqual(len(results), 2)
        self.assertEqual(results["10.1101/2023.01.01.111111"]["Title"], "Paper 1")
        self.assertEqual(results["10.1101/2023.01.01.222222"]["Title"], "Paper 2")

        # Verify the API was hit exactly once per DOI, in processing order.
        # Comparing whole lists also catches swapped or duplicated requests,
        # which a per-URL membership check would let through.
        self.assertEqual(mock_get.call_count, 2)
        expected_urls = [
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.111111/na/json",
            "https://api.medrxiv.org/details/medrxiv/10.1101/2023.01.01.222222/na/json",
        ]
        actual_urls = [recorded.args[0] for recorded in mock_get.call_args_list]
        self.assertEqual(actual_urls, expected_urls)
473
+
474
+
class TestMedrxivSpecialCases(unittest.TestCase):
    """Edge-case tests: unusual DOIs, odd version values, and non-ASCII text."""

    def setUp(self):
        """Create the mocked config and the shimmed downloader."""
        self.mock_config = Mock()
        self.mock_config.api_url = "https://api.medrxiv.org/details"
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192
        self.mock_config.pdf_url_template = (
            "https://www.medrxiv.org/content/{identifier}v{version}.full.pdf"
        )
        self.mock_config.default_version = "1"

        self.downloader = MedrxivDownloaderTestShim(self.mock_config)

    def test_filename_generation_special_characters(self):
        """Default filename for a DOI containing extra '/', '-' and '_' characters."""
        doi_with_special_chars = "10.1101/2023.01.01.123456/special-chars_test"

        result = self.downloader.get_default_filename(doi_with_special_chars)

        # Expected: '.' and '/' become '_', while '-' and '_' are kept as-is.
        self.assertEqual(result, "10_1101_2023_01_01_123456_special-chars_test.pdf")

    def test_version_handling_edge_cases(self):
        """PDF URL construction with empty, None, and missing version values."""
        # NOTE(review): the None case pins the current "vNone" output; confirm
        # whether falling back to default_version would be the intended result.
        test_cases = [
            ("empty version", {"collection": [{"version": ""}]}, "v.full.pdf"),
            ("None version", {"collection": [{"version": None}]}, "vNone.full.pdf"),
            # A missing version key falls back to default_version ("1").
            ("missing version key", {"collection": [{}]}, "v1.full.pdf"),
        ]

        for label, metadata, expected_suffix in test_cases:
            # subTest reports each case separately and keeps running the
            # remaining cases after a failure.
            with self.subTest(case=label):
                result = self.downloader.construct_pdf_url(metadata, "10.1101/test")
                self.assertTrue(
                    result.endswith(expected_suffix),
                    msg=f"{result!r} does not end with {expected_suffix!r}",
                )

    def test_metadata_extraction_unicode_handling(self):
        """Non-ASCII titles, authors, and abstracts pass through unchanged."""
        metadata = {
            "collection": [
                {
                    "title": "Título com acentos é símbolos especiais",
                    "authors": "José María; François Müller",
                    "abstract": "Resumo com çaracteres especiais ñ símbolos",
                    "date": "2023-01-01",
                    "category": "Médecine",
                    "version": "1",
                }
            ]
        }

        result = self.downloader.extract_paper_metadata(metadata, "10.1101/test", None)

        # Text must come back exactly as supplied, accents included.
        self.assertEqual(result["Title"], "Título com acentos é símbolos especiais")
        self.assertEqual(result["Authors"], ["José María", "François Müller"])
        self.assertEqual(
            result["Abstract"], "Resumo com çaracteres especiais ñ símbolos"
        )
@@ -13,8 +13,9 @@ def mock_hydra_fixture():
13
13
  """Mocks Hydra configuration for tests."""
14
14
  with mock.patch("hydra.initialize"), mock.patch("hydra.compose") as mock_compose:
15
15
  cfg_mock = mock.MagicMock()
16
- cfg_mock.agents.talk2scholars.s2_agent.temperature = 0
17
- cfg_mock.agents.talk2scholars.paper_download_agent.prompt = "Test prompt"
16
+ cfg_mock.agents.talk2scholars.paper_download_agent.paper_download_agent = (
17
+ "Test prompt"
18
+ )
18
19
  mock_compose.return_value = cfg_mock
19
20
  yield mock_compose
20
21
 
@@ -23,13 +24,12 @@ def mock_hydra_fixture():
23
24
  def mock_tools_fixture():
24
25
  """Mocks paper download tools to prevent real HTTP calls."""
25
26
  with mock.patch(
26
- "aiagents4pharma.talk2scholars.tools.paper_download."
27
- "download_arxiv_input.download_arxiv_paper"
28
- ) as mock_download_arxiv_paper:
29
- mock_download_arxiv_paper.return_value = {
27
+ "aiagents4pharma.talk2scholars.tools.paper_download.paper_downloader.download_papers"
28
+ ) as mock_download_papers:
29
+ mock_download_papers.return_value = {
30
30
  "article_data": {"dummy_key": "dummy_value"}
31
31
  }
32
- yield [mock_download_arxiv_paper]
32
+ yield [mock_download_papers]
33
33
 
34
34
 
35
35
  @pytest.mark.usefixtures("mock_hydra_fixture")
@@ -83,10 +83,12 @@ def test_paper_download_agent_invocation():
83
83
  assert "article_data" in result
84
84
 
85
85
 
86
- def test_paper_download_agent_tools_assignment(request): # Keep fixture name
87
- """Checks correct tool assignment (download_arxiv_paper, query_dataframe)."""
86
+ def test_paper_download_agent_tools_assignment(
87
+ request,
88
+ ):
89
+ """Checks correct tool assignment (download_papers tool)."""
88
90
  thread_id = "test_thread_paper_dl"
89
- mock_tools = request.getfixturevalue("mock_tools_fixture")
91
+ request.getfixturevalue("mock_tools_fixture")
90
92
  llm_mock = mock.Mock(spec=BaseChatModel)
91
93
 
92
94
  with (
@@ -100,12 +102,20 @@ def test_paper_download_agent_tools_assignment(request): # Keep fixture name
100
102
  mock_agent = mock.Mock()
101
103
  mock_create_agent.return_value = mock_agent
102
104
  mock_tool_instance = mock.Mock()
103
- mock_tool_instance.tools = mock_tools if mock_tools else []
104
105
  mock_toolnode.return_value = mock_tool_instance
105
106
 
106
107
  get_app(thread_id, llm_mock)
108
+ # Verify ToolNode was called with download_papers function
107
109
  assert mock_toolnode.called
108
- assert len(mock_tool_instance.tools) == 1
110
+ # Check that ToolNode was called with a list containing the download_papers tool
111
+ call_args = mock_toolnode.call_args[0][
112
+ 0
113
+ ] # Get first positional argument (the tools list)
114
+ assert len(call_args) == 1
115
+ # The tool should be a StructuredTool with name 'download_papers'
116
+ tool = call_args[0]
117
+ assert hasattr(tool, "name")
118
+ assert tool.name == "download_papers"
109
119
 
110
120
 
111
121
  def test_paper_download_agent_hydra_failure():