aiagents4pharma-1.41.0-py3-none-any.whl → aiagents4pharma-1.43.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,620 @@
1
+ """
2
+ Unit tests for BasePaperDownloader.
3
+ Tests the abstract base class functionality and common methods.
4
+ """
5
+
6
+ import unittest
7
+ from typing import Any, Dict, Optional, Tuple
8
+ from unittest.mock import Mock, patch
9
+
10
+ import inspect
11
+ import requests
12
+
13
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.base_paper_downloader import (
14
+ BasePaperDownloader,
15
+ )
16
+
17
+
18
+ class ConcretePaperDownloader(BasePaperDownloader):
19
+ """Concrete implementation of BasePaperDownloader for testing."""
20
+
21
+ def __init__(self, config: Any):
22
+ super().__init__(config)
23
+ self.test_metadata = {"test": "data"}
24
+
25
+ def fetch_metadata(self, identifier: str) -> Any:
26
+ """Concrete implementation for testing."""
27
+ return self.test_metadata
28
+
29
+ def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
30
+ """Concrete implementation for testing."""
31
+ return f"https://test.com/{identifier}.pdf"
32
+
33
+ def extract_paper_metadata(
34
+ self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
35
+ ) -> Dict[str, Any]:
36
+ """Concrete implementation for testing."""
37
+ return {
38
+ "Title": f"Test Paper {identifier}",
39
+ "Authors": ["Test Author"],
40
+ "identifier": identifier,
41
+ "metadata_source": metadata,
42
+ }
43
+
44
+ def get_service_name(self) -> str:
45
+ """Concrete implementation for testing."""
46
+ return "TestService"
47
+
48
+ def get_identifier_name(self) -> str:
49
+ """Concrete implementation for testing."""
50
+ return "Test ID"
51
+
52
+ def get_default_filename(self, identifier: str) -> str:
53
+ """Concrete implementation for testing."""
54
+ return f"test_{identifier}.pdf"
55
+
56
+ def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
57
+ """Concrete implementation for testing."""
58
+ return f" ({paper.get('identifier', 'unknown')})"
59
+
60
+ def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
61
+ """Concrete implementation for testing."""
62
+ entry["test_id"] = identifier
63
+
64
+ def get_paper_identifier_info_public(self, paper: Dict[str, Any]) -> str:
65
+ """Public wrapper to access protected identifier info for tests."""
66
+ return self._get_paper_identifier_info(paper)
67
+
68
+ def add_service_identifier_public(
69
+ self, entry: Dict[str, Any], identifier: str
70
+ ) -> None:
71
+ """Public wrapper to access protected service identifier for tests."""
72
+ self._add_service_identifier(entry, identifier)
73
+
74
+
75
+ class TestBasePaperDownloader(unittest.TestCase):
76
+ """Tests for the BasePaperDownloader class."""
77
+
78
+ def setUp(self):
79
+ """Set up test fixtures."""
80
+ self.mock_config = Mock()
81
+ self.mock_config.request_timeout = 30
82
+ self.mock_config.chunk_size = 8192
83
+
84
+ self.downloader = ConcretePaperDownloader(self.mock_config)
85
+
86
+ def test_initialization(self):
87
+ """Test BasePaperDownloader initialization."""
88
+ self.assertEqual(self.downloader.request_timeout, 30)
89
+ self.assertEqual(self.downloader.chunk_size, 8192)
90
+
91
+ def test_abstract_methods_raise_not_implemented(self):
92
+ """Test that abstract methods are unimplemented in an incomplete subclass."""
93
+
94
+ # Create an intentionally incomplete subclass **without** instantiating it
95
+ # (avoid E0110) and without a pointless 'pass' (avoid W0107).
96
+ class IncompleteDownloader(BasePaperDownloader):
97
+ """Intentionally incomplete concrete subclass for introspection only."""
98
+
99
+ __test__ = False # not a test class
100
+
101
+ # Assert it's abstract instead of trying to instantiate
102
+ self.assertTrue(inspect.isabstract(IncompleteDownloader))
103
+
104
+ @patch("tempfile.NamedTemporaryFile")
105
+ @patch("requests.get")
106
+ def test_download_pdf_to_temp_success(self, mock_get, mock_tempfile):
107
+ """Test successful PDF download to temporary file."""
108
+ # Mock response
109
+ mock_response = Mock()
110
+ mock_response.raise_for_status = Mock()
111
+ mock_response.iter_content.return_value = [b"PDF chunk 1", b"PDF chunk 2"]
112
+ mock_response.headers = {
113
+ "Content-Disposition": 'attachment; filename="paper.pdf"'
114
+ }
115
+ mock_get.return_value = mock_response
116
+
117
+ # Mock temporary file
118
+ mock_temp_file = Mock()
119
+ mock_temp_file.name = "/tmp/test.pdf"
120
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
121
+ mock_temp_file.__exit__ = Mock(return_value=None)
122
+ mock_tempfile.return_value = mock_temp_file
123
+
124
+ result = self.downloader.download_pdf_to_temp(
125
+ "https://test.com/paper.pdf", "12345"
126
+ )
127
+
128
+ # Verify result
129
+ self.assertEqual(result, ("/tmp/test.pdf", "paper.pdf"))
130
+
131
+ # Verify HTTP request - includes headers with User-Agent
132
+ expected_headers = {"User-Agent": self.downloader.user_agent}
133
+ mock_get.assert_called_once_with(
134
+ "https://test.com/paper.pdf",
135
+ headers=expected_headers,
136
+ timeout=30,
137
+ stream=True,
138
+ )
139
+ mock_response.raise_for_status.assert_called_once()
140
+
141
+ # Verify file writing
142
+ mock_temp_file.write.assert_any_call(b"PDF chunk 1")
143
+ mock_temp_file.write.assert_any_call(b"PDF chunk 2")
144
+
145
+ def test_download_pdf_to_temp_empty_url(self):
146
+ """Test PDF download with empty URL."""
147
+ result = self.downloader.download_pdf_to_temp("", "12345")
148
+
149
+ self.assertIsNone(result)
150
+
151
+ @patch("requests.get")
152
+ def test_download_pdf_to_temp_network_error(self, mock_get):
153
+ """Test PDF download with network error."""
154
+ mock_get.side_effect = requests.RequestException("Network error")
155
+
156
+ result = self.downloader.download_pdf_to_temp(
157
+ "https://test.com/paper.pdf", "12345"
158
+ )
159
+
160
+ self.assertIsNone(result)
161
+
162
+ @patch("tempfile.NamedTemporaryFile")
163
+ @patch("requests.get")
164
+ def test_download_pdf_to_temp_filename_extraction(self, mock_get, mock_tempfile):
165
+ """Test filename extraction from Content-Disposition header."""
166
+ # Mock response with various header formats
167
+ test_cases = [
168
+ ('attachment; filename="test-paper.pdf"', "test-paper.pdf"),
169
+ ("attachment; filename=simple.pdf", "simple.pdf"),
170
+ (
171
+ "attachment; filename*=UTF-8''encoded%20file.pdf",
172
+ "12345.pdf",
173
+ ), # Complex header format falls back to default
174
+ ('inline; filename="quoted file.pdf"', "quoted file.pdf"),
175
+ ("", "12345.pdf"), # No header, should use default
176
+ ]
177
+
178
+ for header_value, expected_filename in test_cases:
179
+ with self.subTest(header=header_value):
180
+ mock_response = Mock()
181
+ mock_response.raise_for_status = Mock()
182
+ mock_response.iter_content.return_value = [b"PDF data"]
183
+ mock_response.headers = (
184
+ {"Content-Disposition": header_value} if header_value else {}
185
+ )
186
+ mock_get.return_value = mock_response
187
+
188
+ # Mock get_default_filename for fallback case
189
+ with patch.object(
190
+ self.downloader, "get_default_filename", return_value="12345.pdf"
191
+ ):
192
+ # Mock temporary file
193
+ mock_temp_file = Mock()
194
+ mock_temp_file.name = "/tmp/test.pdf"
195
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
196
+ mock_temp_file.__exit__ = Mock(return_value=None)
197
+ mock_tempfile.return_value = mock_temp_file
198
+
199
+ result = self.downloader.download_pdf_to_temp(
200
+ "https://test.com/paper.pdf", "12345"
201
+ )
202
+
203
+ self.assertEqual(result[1], expected_filename)
204
+
205
+ def test_process_identifiers_success(self):
206
+ """Test successful processing of multiple identifiers."""
207
+ identifiers = ["12345", "67890"]
208
+
209
+ # Mock download_pdf_to_temp to return different results
210
+ with patch.object(self.downloader, "download_pdf_to_temp") as mock_download:
211
+ mock_download.side_effect = [
212
+ ("/tmp/paper1.pdf", "paper1.pdf"), # First paper succeeds
213
+ None, # Second paper fails
214
+ ]
215
+
216
+ result = self.downloader.process_identifiers(identifiers)
217
+
218
+ # Verify results
219
+ self.assertIn("12345", result)
220
+ self.assertIn("67890", result)
221
+
222
+ # First paper should have PDF data
223
+ self.assertEqual(result["12345"]["Title"], "Test Paper 12345")
224
+ self.assertEqual(result["12345"]["Authors"], ["Test Author"])
225
+
226
+ # Second paper should also be processed (but without PDF)
227
+ self.assertEqual(result["67890"]["Title"], "Test Paper 67890")
228
+
229
+ def test_process_identifiers_with_errors(self):
230
+ """Test processing identifiers with various errors."""
231
+ identifiers = ["valid", "fetch_error"]
232
+
233
+ def mock_fetch_metadata(identifier):
234
+ if identifier == "fetch_error":
235
+ raise requests.RequestException("Fetch failed")
236
+ return {"test": identifier}
237
+
238
+ with patch.object(
239
+ self.downloader, "fetch_metadata", side_effect=mock_fetch_metadata
240
+ ):
241
+ with patch.object(
242
+ self.downloader, "download_pdf_to_temp", return_value=None
243
+ ):
244
+ result = self.downloader.process_identifiers(identifiers)
245
+
246
+ # Valid identifier should succeed
247
+ self.assertIn("valid", result)
248
+ self.assertEqual(result["valid"]["Title"], "Test Paper valid")
249
+
250
+ # Error cases should create error entries (not be excluded)
251
+ self.assertIn("fetch_error", result)
252
+ self.assertEqual(result["fetch_error"]["Title"], "Error fetching paper")
253
+ self.assertIn("Fetch failed", result["fetch_error"]["Abstract"])
254
+ self.assertEqual(result["fetch_error"]["access_type"], "error")
255
+
256
+ def test_build_summary_success(self):
257
+ """Test building summary for successful downloads."""
258
+ article_data = {
259
+ "paper1": {"Title": "Paper 1", "access_type": "open_access_downloaded"},
260
+ "paper2": {"Title": "Paper 2", "access_type": "download_failed"},
261
+ "paper3": {"Title": "Paper 3", "access_type": "open_access_downloaded"},
262
+ }
263
+
264
+ result = self.downloader.build_summary(article_data)
265
+
266
+ # Should include count of papers and successful downloads
267
+ self.assertIn("3", result) # Total papers
268
+ self.assertIn("2", result) # Successful downloads
269
+ self.assertIn("TestService", result) # Service name
270
+
271
+ def test_build_summary_no_papers(self):
272
+ """Test building summary with no papers."""
273
+ result = self.downloader.build_summary({})
274
+
275
+ self.assertIn("0", result)
276
+ self.assertIn("TestService", result)
277
+
278
+ def test_build_summary_all_failed(self):
279
+ """Test building summary with all failed downloads."""
280
+ article_data = {
281
+ "paper1": {"Title": "Paper 1", "access_type": "download_failed"},
282
+ "paper2": {"Title": "Paper 2", "access_type": "download_failed"},
283
+ }
284
+
285
+ result = self.downloader.build_summary(article_data)
286
+
287
+ self.assertIn("2", result) # Total papers
288
+ self.assertIn("0", result) # Successful downloads (should be 0)
289
+
290
+ def test_build_summary_with_papers(self):
291
+ """Test building summary with paper list."""
292
+ article_data = {
293
+ "123": {
294
+ "Title": "Paper 1",
295
+ "identifier": "123",
296
+ "access_type": "open_access_downloaded",
297
+ "Abstract": "Test abstract.",
298
+ },
299
+ "456": {
300
+ "Title": "Paper 2",
301
+ "identifier": "456",
302
+ "access_type": "download_failed",
303
+ "Abstract": "Another abstract.",
304
+ },
305
+ }
306
+
307
+ result = self.downloader.build_summary(article_data)
308
+
309
+ self.assertIn("Paper 1", result)
310
+ self.assertIn("Paper 2", result)
311
+ self.assertIn("TestService", result)
312
+ self.assertIn("2", result) # Total papers
313
+ self.assertIn("1", result) # Successfully downloaded
314
+
315
+ def test_build_summary_truncated_list(self):
316
+ """Test building summary with long list (should show only top 3)."""
317
+ article_data = {}
318
+ for i in range(5): # More than 3
319
+ article_data[f"{i+1}"] = {
320
+ "Title": f"Paper {i+1}",
321
+ "identifier": f"{i+1}",
322
+ "access_type": "open_access_downloaded",
323
+ "Abstract": f"Abstract {i+1}",
324
+ }
325
+
326
+ result = self.downloader.build_summary(article_data)
327
+
328
+ # Should include first 3 papers only
329
+ self.assertIn("Paper 1", result)
330
+ self.assertIn("Paper 2", result)
331
+ self.assertIn("Paper 3", result)
332
+
333
+ # Should not include papers 4 and 5
334
+ self.assertNotIn("Paper 4", result)
335
+ self.assertNotIn("Paper 5", result)
336
+
337
+ # Should show total count
338
+ self.assertIn("5", result) # Total papers
339
+
340
+ def test_concrete_implementation_methods(self):
341
+ """Test that concrete implementations work correctly."""
342
+ # Test fetch_metadata
343
+ metadata = self.downloader.fetch_metadata("test123")
344
+ self.assertEqual(metadata, {"test": "data"})
345
+
346
+ # Test construct_pdf_url
347
+ pdf_url = self.downloader.construct_pdf_url(metadata, "test123")
348
+ self.assertEqual(pdf_url, "https://test.com/test123.pdf")
349
+
350
+ # Test extract_paper_metadata
351
+ paper_data = self.downloader.extract_paper_metadata(metadata, "test123", None)
352
+ self.assertEqual(paper_data["Title"], "Test Paper test123")
353
+ self.assertEqual(paper_data["Authors"], ["Test Author"])
354
+
355
+ # Test get_service_name
356
+ service_name = self.downloader.get_service_name()
357
+ self.assertEqual(service_name, "TestService")
358
+
359
+ # Test get_identifier_name
360
+ identifier_name = self.downloader.get_identifier_name()
361
+ self.assertEqual(identifier_name, "Test ID")
362
+
363
+ # Test get_default_filename
364
+ filename = self.downloader.get_default_filename("test123")
365
+ self.assertEqual(filename, "test_test123.pdf")
366
+
367
+ def test_helper_methods(self):
368
+ """Test helper methods."""
369
+ # Test _get_paper_identifier_info via public wrapper
370
+ paper = {"identifier": "test123"}
371
+ info = self.downloader.get_paper_identifier_info_public(paper)
372
+ self.assertEqual(info, " (test123)")
373
+
374
+ # Test _add_service_identifier via public wrapper
375
+ entry = {}
376
+ self.downloader.add_service_identifier_public(entry, "test123")
377
+ self.assertEqual(entry["test_id"], "test123")
378
+
379
+ def test_abstract_methods_raise_not_implemented_direct_call(self):
380
+ """Test that base-class abstract methods raise NotImplementedError when called."""
381
+ # Use the already-imported BasePaperDownloader (no reimport/redefinition).
382
+
383
+ # Public abstract methods: call directly on the base to hit the NotImplementedError paths.
384
+ with self.assertRaises(NotImplementedError):
385
+ BasePaperDownloader.fetch_metadata(self.downloader, "test")
386
+
387
+ with self.assertRaises(NotImplementedError):
388
+ BasePaperDownloader.construct_pdf_url(self.downloader, {}, "test")
389
+
390
+ with self.assertRaises(NotImplementedError):
391
+ BasePaperDownloader.extract_paper_metadata(
392
+ self.downloader, {}, "test", None
393
+ )
394
+
395
+ with self.assertRaises(NotImplementedError):
396
+ BasePaperDownloader.get_service_name(self.downloader)
397
+
398
+ with self.assertRaises(NotImplementedError):
399
+ BasePaperDownloader.get_identifier_name(self.downloader)
400
+
401
+ with self.assertRaises(NotImplementedError):
402
+ BasePaperDownloader.get_default_filename(self.downloader, "test")
403
+
404
+ # Protected abstract methods: call via getattr to avoid W0212 while still executing code.
405
+ with self.assertRaises(NotImplementedError):
406
+ getattr(BasePaperDownloader, "_get_paper_identifier_info")(
407
+ self.downloader, {}
408
+ )
409
+
410
+ with self.assertRaises(NotImplementedError):
411
+ getattr(BasePaperDownloader, "_add_service_identifier")(
412
+ self.downloader, {}, "test"
413
+ )
414
+
415
+ @patch("tempfile.NamedTemporaryFile")
416
+ @patch("requests.get")
417
+ def test_filename_extraction_exception_handling(self, mock_get, mock_tempfile):
418
+ """Test exception handling during filename extraction."""
419
+ # Mock response that will cause an exception in filename extraction
420
+ mock_response = Mock()
421
+ mock_response.raise_for_status = Mock()
422
+ mock_response.iter_content.return_value = [b"PDF data"]
423
+ mock_response.headers = {
424
+ "Content-Disposition": 'attachment; filename="paper.pdf"'
425
+ }
426
+ mock_get.return_value = mock_response
427
+
428
+ # Mock temporary file
429
+ mock_temp_file = Mock()
430
+ mock_temp_file.name = "/tmp/test.pdf"
431
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
432
+ mock_temp_file.__exit__ = Mock(return_value=None)
433
+ mock_tempfile.return_value = mock_temp_file
434
+
435
+ # Patch re.search to raise an exception during filename extraction
436
+ with patch("re.search", side_effect=requests.RequestException("Regex error")):
437
+ result = self.downloader.download_pdf_to_temp(
438
+ "https://test.com/paper.pdf", "12345"
439
+ )
440
+
441
+ # Should still succeed but use default filename due to exception
442
+ self.assertEqual(result, ("/tmp/test.pdf", "test_12345.pdf"))
443
+
444
+ def test_build_summary_with_temp_file_path(self):
445
+ """Test build_summary with papers that have temp_file_path."""
446
+ article_data = {
447
+ "paper1": {
448
+ "Title": "Paper 1",
449
+ "access_type": "open_access_downloaded",
450
+ "Abstract": "This is a test abstract with multiple sentences."
451
+ "It should be truncated.",
452
+ "temp_file_path": "/tmp/paper1.pdf",
453
+ },
454
+ "paper2": {
455
+ "Title": "Paper 2",
456
+ "access_type": "download_failed",
457
+ "Abstract": "Short abstract.",
458
+ "temp_file_path": "", # Empty temp_file_path
459
+ },
460
+ }
461
+
462
+ result = self.downloader.build_summary(article_data)
463
+
464
+ # Should include temp file path for paper1
465
+ self.assertIn("/tmp/paper1.pdf", result)
466
+ self.assertIn("Downloaded to:", result)
467
+ self.assertIn("Abstract snippet:", result)
468
+
469
+ # Should include count information
470
+ self.assertIn("2", result) # Total papers
471
+ self.assertIn("1", result) # Successfully downloaded
472
+
473
+
474
+ class TestBasePaperDownloaderEdgeCases(unittest.TestCase):
475
+ """Tests for edge cases and error conditions."""
476
+
477
+ def setUp(self):
478
+ """Set up edge case test fixtures."""
479
+ self.mock_config = Mock()
480
+ self.mock_config.request_timeout = 30
481
+ self.mock_config.chunk_size = 8192
482
+
483
+ self.downloader = ConcretePaperDownloader(self.mock_config)
484
+
485
+ @patch("tempfile.NamedTemporaryFile")
486
+ @patch("requests.get")
487
+ def test_download_pdf_chunk_filtering(self, mock_get, mock_tempfile):
488
+ """Test that empty chunks are filtered out during download."""
489
+ # Mock response with mixed chunks including None/empty ones
490
+ mock_response = Mock()
491
+ mock_response.raise_for_status = Mock()
492
+ mock_response.iter_content.return_value = [
493
+ b"chunk1",
494
+ None, # Should be filtered out
495
+ b"", # Empty chunk, should be filtered out
496
+ b"chunk2",
497
+ None,
498
+ b"chunk3",
499
+ ]
500
+ mock_response.headers = {}
501
+ mock_get.return_value = mock_response
502
+
503
+ # Mock temporary file
504
+ mock_temp_file = Mock()
505
+ mock_temp_file.name = "/tmp/test.pdf"
506
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
507
+ mock_temp_file.__exit__ = Mock(return_value=None)
508
+ mock_tempfile.return_value = mock_temp_file
509
+
510
+ with patch.object(
511
+ self.downloader, "get_default_filename", return_value="default.pdf"
512
+ ):
513
+ # Call without assigning to avoid 'unused-variable'
514
+ self.downloader.download_pdf_to_temp("https://test.com/paper.pdf", "12345")
515
+
516
+ # Should only write non-empty chunks
517
+ self.assertEqual(mock_temp_file.write.call_count, 3)
518
+ mock_temp_file.write.assert_any_call(b"chunk1")
519
+ mock_temp_file.write.assert_any_call(b"chunk2")
520
+ mock_temp_file.write.assert_any_call(b"chunk3")
521
+
522
+ def test_filename_extraction_regex_edge_cases(self):
523
+ """Test filename extraction with various regex edge cases."""
524
+ test_headers = [
525
+ # Various quote combinations
526
+ ('filename="file with spaces.pdf"', "file with spaces.pdf"),
527
+ (
528
+ "filename='single_quotes.pdf'",
529
+ "default.pdf",
530
+ ), # Single quotes don't match regex
531
+ ("filename=no_quotes.pdf", "no_quotes.pdf"),
532
+ # Unicode and special characters
533
+ ('filename="файл.pdf"', "файл.pdf"),
534
+ (
535
+ 'filename="file-with-dashes_and_underscores.pdf"',
536
+ "file-with-dashes_and_underscores.pdf",
537
+ ),
538
+ # Edge cases
539
+ ('filename=""', "default.pdf"), # Empty filename falls back to default
540
+ ("filename=", "default.pdf"), # No value falls back to default
541
+ (
542
+ 'other_param=value; filename="actual.pdf"',
543
+ "actual.pdf",
544
+ ), # Mixed parameters
545
+ # Invalid cases (should fall back to default)
546
+ ("invalid_header_format", None),
547
+ ("filename=not_a_pdf.txt", "default.pdf"), # Non-PDF falls back to default
548
+ ]
549
+
550
+ for header_value, expected in test_headers:
551
+ with self.subTest(header=header_value):
552
+ with patch("requests.get") as mock_get:
553
+ mock_response = Mock()
554
+ mock_response.raise_for_status = Mock()
555
+ mock_response.iter_content.return_value = [b"data"]
556
+ mock_response.headers = {"Content-Disposition": header_value}
557
+ mock_get.return_value = mock_response
558
+
559
+ with patch("tempfile.NamedTemporaryFile") as mock_tempfile:
560
+ mock_temp_file = Mock()
561
+ mock_temp_file.name = "/tmp/test.pdf"
562
+ mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
563
+ mock_temp_file.__exit__ = Mock(return_value=None)
564
+ mock_tempfile.return_value = mock_temp_file
565
+
566
+ with patch.object(
567
+ self.downloader,
568
+ "get_default_filename",
569
+ return_value="default.pdf",
570
+ ):
571
+ result = self.downloader.download_pdf_to_temp(
572
+ "https://test.com/paper.pdf", "12345"
573
+ )
574
+
575
+ if expected is None:
576
+ # Should fall back to default
577
+ self.assertEqual(result[1], "default.pdf")
578
+ else:
579
+ self.assertEqual(result[1], expected)
580
+
581
+ def test_process_identifiers_empty_list(self):
582
+ """Test processing empty identifier list."""
583
+ result = self.downloader.process_identifiers([])
584
+
585
+ self.assertEqual(result, {})
586
+
587
+ def test_process_identifiers_duplicate_handling(self):
588
+ """Test processing list with duplicate identifiers."""
589
+ identifiers = ["12345", "67890", "12345"] # Duplicate 12345
590
+
591
+ with patch.object(self.downloader, "download_pdf_to_temp", return_value=None):
592
+ result = self.downloader.process_identifiers(identifiers)
593
+
594
+ # Should only have unique entries
595
+ self.assertEqual(len(result), 2)
596
+ self.assertIn("12345", result)
597
+ self.assertIn("67890", result)
598
+
599
+
600
+ class TestBasePaperDownloaderAbstractMethods(unittest.TestCase):
601
+ """Test abstract method behavior."""
602
+
603
+ def test_abstract_class_cannot_be_instantiated(self):
604
+ """BasePaperDownloader should be abstract (non-instantiable)."""
605
+
606
+ self.assertTrue(inspect.isabstract(BasePaperDownloader))
607
+
608
+ def test_complete_implementation_succeeds(self):
609
+ """Test that complete implementations work."""
610
+ # ConcretePaperDownloader from setUp should work
611
+ config = Mock()
612
+ config.request_timeout = 30
613
+ config.chunk_size = 8192
614
+
615
+ downloader = ConcretePaperDownloader(config)
616
+
617
+ # Should be able to call all methods
618
+ self.assertEqual(downloader.get_service_name(), "TestService")
619
+ self.assertEqual(downloader.get_identifier_name(), "Test ID")
620
+ self.assertEqual(downloader.get_default_filename("test"), "test_test.pdf")