aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,620 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for BasePaperDownloader.
|
3
|
+
Tests the abstract base class functionality and common methods.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import unittest
|
7
|
+
from typing import Any, Dict, Optional, Tuple
|
8
|
+
from unittest.mock import Mock, patch
|
9
|
+
|
10
|
+
import inspect
|
11
|
+
import requests
|
12
|
+
|
13
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.utils.base_paper_downloader import (
|
14
|
+
BasePaperDownloader,
|
15
|
+
)
|
16
|
+
|
17
|
+
|
18
|
+
class ConcretePaperDownloader(BasePaperDownloader):
    """Fully implemented BasePaperDownloader stub used as a test double.

    Each hook returns a fixed, easily recognisable value so the tests in this
    module can tell stub output apart from anything the base class produces.
    """

    def __init__(self, config: Any):
        super().__init__(config)
        # Canned payload handed back by fetch_metadata().
        self.test_metadata = {"test": "data"}

    def fetch_metadata(self, identifier: str) -> Any:
        """Return the canned metadata; ``identifier`` is ignored."""
        canned = self.test_metadata
        return canned

    def construct_pdf_url(self, metadata: Any, identifier: str) -> str:
        """Return a fake PDF URL on test.com derived from ``identifier``."""
        url = f"https://test.com/{identifier}.pdf"
        return url

    def extract_paper_metadata(
        self, metadata: Any, identifier: str, pdf_result: Optional[Tuple[str, str]]
    ) -> Dict[str, Any]:
        """Return a small paper record echoing ``identifier`` and ``metadata``.

        ``pdf_result`` is accepted to satisfy the base-class signature but is
        not used by this stub.
        """
        record: Dict[str, Any] = {
            "Title": f"Test Paper {identifier}",
            "Authors": ["Test Author"],
            "identifier": identifier,
            "metadata_source": metadata,
        }
        return record

    def get_service_name(self) -> str:
        """Return the fixed service label used by this stub."""
        return "TestService"

    def get_identifier_name(self) -> str:
        """Return the fixed identifier label used by this stub."""
        return "Test ID"

    def get_default_filename(self, identifier: str) -> str:
        """Return the fallback PDF filename for ``identifier``."""
        filename = f"test_{identifier}.pdf"
        return filename

    def _get_paper_identifier_info(self, paper: Dict[str, Any]) -> str:
        """Return the paper's identifier in parentheses, or 'unknown' if absent."""
        return f" ({paper.get('identifier', 'unknown')})"

    def _add_service_identifier(self, entry: Dict[str, Any], identifier: str) -> None:
        """Store ``identifier`` in ``entry`` under the ``test_id`` key (in place)."""
        entry["test_id"] = identifier

    def get_paper_identifier_info_public(self, paper: Dict[str, Any]) -> str:
        """Expose ``_get_paper_identifier_info`` to tests without protected access."""
        info = self._get_paper_identifier_info(paper)
        return info

    def add_service_identifier_public(
        self, entry: Dict[str, Any], identifier: str
    ) -> None:
        """Expose ``_add_service_identifier`` to tests without protected access."""
        self._add_service_identifier(entry, identifier)
|
73
|
+
|
74
|
+
|
75
|
+
class TestBasePaperDownloader(unittest.TestCase):
    """Tests for the BasePaperDownloader class.

    The base class is exercised through ``ConcretePaperDownloader``. In the
    download tests ``requests.get`` and ``tempfile.NamedTemporaryFile`` are
    patched, so the HTTP response and the temporary file are both mocks.
    """

    @staticmethod
    def _make_response_mock(chunks, headers):
        """Return a Mock HTTP response streaming ``chunks`` with ``headers``."""
        response = Mock()
        response.raise_for_status = Mock()
        response.iter_content.return_value = chunks
        response.headers = headers
        return response

    @staticmethod
    def _make_temp_file_mock(path="/tmp/test.pdf"):
        """Return a Mock temp file named ``path`` that works as a context manager."""
        temp_file = Mock()
        temp_file.name = path
        # Entering the context yields the same mock, so write() calls made
        # inside a ``with`` block are recorded on the object the test inspects.
        temp_file.__enter__ = Mock(return_value=temp_file)
        temp_file.__exit__ = Mock(return_value=None)
        return temp_file

    def setUp(self):
        """Set up test fixtures."""
        self.mock_config = Mock()
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192

        self.downloader = ConcretePaperDownloader(self.mock_config)

    def test_initialization(self):
        """Test BasePaperDownloader initialization."""
        self.assertEqual(self.downloader.request_timeout, 30)
        self.assertEqual(self.downloader.chunk_size, 8192)

    def test_abstract_methods_raise_not_implemented(self):
        """Test that abstract methods are unimplemented in an incomplete subclass."""

        # Create an intentionally incomplete subclass **without** instantiating it
        # (avoid E0110) and without a pointless 'pass' (avoid W0107).
        class IncompleteDownloader(BasePaperDownloader):
            """Intentionally incomplete concrete subclass for introspection only."""

            __test__ = False  # not a test class

        # Assert it's abstract instead of trying to instantiate
        self.assertTrue(inspect.isabstract(IncompleteDownloader))

    @patch("tempfile.NamedTemporaryFile")
    @patch("requests.get")
    def test_download_pdf_to_temp_success(self, mock_get, mock_tempfile):
        """Test successful PDF download to temporary file."""
        mock_response = self._make_response_mock(
            chunks=[b"PDF chunk 1", b"PDF chunk 2"],
            headers={"Content-Disposition": 'attachment; filename="paper.pdf"'},
        )
        mock_get.return_value = mock_response

        mock_temp_file = self._make_temp_file_mock()
        mock_tempfile.return_value = mock_temp_file

        result = self.downloader.download_pdf_to_temp(
            "https://test.com/paper.pdf", "12345"
        )

        # Verify result: (temp file path, filename taken from the header)
        self.assertEqual(result, ("/tmp/test.pdf", "paper.pdf"))

        # Verify HTTP request - includes headers with User-Agent
        expected_headers = {"User-Agent": self.downloader.user_agent}
        mock_get.assert_called_once_with(
            "https://test.com/paper.pdf",
            headers=expected_headers,
            timeout=30,
            stream=True,
        )
        mock_response.raise_for_status.assert_called_once()

        # Verify file writing
        mock_temp_file.write.assert_any_call(b"PDF chunk 1")
        mock_temp_file.write.assert_any_call(b"PDF chunk 2")

    def test_download_pdf_to_temp_empty_url(self):
        """Test PDF download with empty URL."""
        result = self.downloader.download_pdf_to_temp("", "12345")

        self.assertIsNone(result)

    @patch("requests.get")
    def test_download_pdf_to_temp_network_error(self, mock_get):
        """Test PDF download with network error."""
        mock_get.side_effect = requests.RequestException("Network error")

        result = self.downloader.download_pdf_to_temp(
            "https://test.com/paper.pdf", "12345"
        )

        self.assertIsNone(result)

    @patch("tempfile.NamedTemporaryFile")
    @patch("requests.get")
    def test_download_pdf_to_temp_filename_extraction(self, mock_get, mock_tempfile):
        """Test filename extraction from Content-Disposition header."""
        # (Content-Disposition value, expected filename)
        test_cases = [
            ('attachment; filename="test-paper.pdf"', "test-paper.pdf"),
            ("attachment; filename=simple.pdf", "simple.pdf"),
            (
                "attachment; filename*=UTF-8''encoded%20file.pdf",
                "12345.pdf",
            ),  # Complex header format falls back to default
            ('inline; filename="quoted file.pdf"', "quoted file.pdf"),
            ("", "12345.pdf"),  # No header, should use default
        ]

        for header_value, expected_filename in test_cases:
            with self.subTest(header=header_value):
                # An empty header value means "no Content-Disposition at all".
                headers = {"Content-Disposition": header_value} if header_value else {}
                mock_get.return_value = self._make_response_mock(
                    chunks=[b"PDF data"], headers=headers
                )
                mock_tempfile.return_value = self._make_temp_file_mock()

                # Mock get_default_filename for fallback case
                with patch.object(
                    self.downloader, "get_default_filename", return_value="12345.pdf"
                ):
                    result = self.downloader.download_pdf_to_temp(
                        "https://test.com/paper.pdf", "12345"
                    )

                # Fail with a clear assertion instead of a TypeError on result[1].
                self.assertIsNotNone(result)
                self.assertEqual(result[1], expected_filename)

    def test_process_identifiers_success(self):
        """Test successful processing of multiple identifiers."""
        identifiers = ["12345", "67890"]

        # Mock download_pdf_to_temp to return different results
        with patch.object(self.downloader, "download_pdf_to_temp") as mock_download:
            mock_download.side_effect = [
                ("/tmp/paper1.pdf", "paper1.pdf"),  # First paper succeeds
                None,  # Second paper fails
            ]

            result = self.downloader.process_identifiers(identifiers)

        # Verify results
        self.assertIn("12345", result)
        self.assertIn("67890", result)

        # First paper should have PDF data
        self.assertEqual(result["12345"]["Title"], "Test Paper 12345")
        self.assertEqual(result["12345"]["Authors"], ["Test Author"])

        # Second paper should also be processed (but without PDF)
        self.assertEqual(result["67890"]["Title"], "Test Paper 67890")

    def test_process_identifiers_with_errors(self):
        """Test processing identifiers with various errors."""
        identifiers = ["valid", "fetch_error"]

        def mock_fetch_metadata(identifier):
            if identifier == "fetch_error":
                raise requests.RequestException("Fetch failed")
            return {"test": identifier}

        fetch_patch = patch.object(
            self.downloader, "fetch_metadata", side_effect=mock_fetch_metadata
        )
        download_patch = patch.object(
            self.downloader, "download_pdf_to_temp", return_value=None
        )
        with fetch_patch, download_patch:
            result = self.downloader.process_identifiers(identifiers)

        # Valid identifier should succeed
        self.assertIn("valid", result)
        self.assertEqual(result["valid"]["Title"], "Test Paper valid")

        # Error cases should create error entries (not be excluded)
        self.assertIn("fetch_error", result)
        self.assertEqual(result["fetch_error"]["Title"], "Error fetching paper")
        self.assertIn("Fetch failed", result["fetch_error"]["Abstract"])
        self.assertEqual(result["fetch_error"]["access_type"], "error")

    def test_build_summary_success(self):
        """Test building summary for successful downloads."""
        article_data = {
            "paper1": {"Title": "Paper 1", "access_type": "open_access_downloaded"},
            "paper2": {"Title": "Paper 2", "access_type": "download_failed"},
            "paper3": {"Title": "Paper 3", "access_type": "open_access_downloaded"},
        }

        result = self.downloader.build_summary(article_data)

        # Should include count of papers and successful downloads
        self.assertIn("3", result)  # Total papers
        self.assertIn("2", result)  # Successful downloads
        self.assertIn("TestService", result)  # Service name

    def test_build_summary_no_papers(self):
        """Test building summary with no papers."""
        result = self.downloader.build_summary({})

        self.assertIn("0", result)
        self.assertIn("TestService", result)

    def test_build_summary_all_failed(self):
        """Test building summary with all failed downloads."""
        article_data = {
            "paper1": {"Title": "Paper 1", "access_type": "download_failed"},
            "paper2": {"Title": "Paper 2", "access_type": "download_failed"},
        }

        result = self.downloader.build_summary(article_data)

        self.assertIn("2", result)  # Total papers
        self.assertIn("0", result)  # Successful downloads (should be 0)

    def test_build_summary_with_papers(self):
        """Test building summary with paper list."""
        article_data = {
            "123": {
                "Title": "Paper 1",
                "identifier": "123",
                "access_type": "open_access_downloaded",
                "Abstract": "Test abstract.",
            },
            "456": {
                "Title": "Paper 2",
                "identifier": "456",
                "access_type": "download_failed",
                "Abstract": "Another abstract.",
            },
        }

        result = self.downloader.build_summary(article_data)

        self.assertIn("Paper 1", result)
        self.assertIn("Paper 2", result)
        self.assertIn("TestService", result)
        self.assertIn("2", result)  # Total papers
        self.assertIn("1", result)  # Successfully downloaded

    def test_build_summary_truncated_list(self):
        """Test building summary with long list (should show only top 3)."""
        # Five papers keyed "1".."5" - more than the three the summary lists.
        article_data = {
            f"{number}": {
                "Title": f"Paper {number}",
                "identifier": f"{number}",
                "access_type": "open_access_downloaded",
                "Abstract": f"Abstract {number}",
            }
            for number in range(1, 6)
        }

        result = self.downloader.build_summary(article_data)

        # Should include first 3 papers only
        for shown in ("Paper 1", "Paper 2", "Paper 3"):
            self.assertIn(shown, result)

        # Should not include papers 4 and 5
        for hidden in ("Paper 4", "Paper 5"):
            self.assertNotIn(hidden, result)

        # Should show total count
        self.assertIn("5", result)  # Total papers

    def test_concrete_implementation_methods(self):
        """Test that concrete implementations work correctly."""
        # Test fetch_metadata
        metadata = self.downloader.fetch_metadata("test123")
        self.assertEqual(metadata, {"test": "data"})

        # Test construct_pdf_url
        pdf_url = self.downloader.construct_pdf_url(metadata, "test123")
        self.assertEqual(pdf_url, "https://test.com/test123.pdf")

        # Test extract_paper_metadata
        paper_data = self.downloader.extract_paper_metadata(metadata, "test123", None)
        self.assertEqual(paper_data["Title"], "Test Paper test123")
        self.assertEqual(paper_data["Authors"], ["Test Author"])

        # Test get_service_name
        self.assertEqual(self.downloader.get_service_name(), "TestService")

        # Test get_identifier_name
        self.assertEqual(self.downloader.get_identifier_name(), "Test ID")

        # Test get_default_filename
        self.assertEqual(
            self.downloader.get_default_filename("test123"), "test_test123.pdf"
        )

    def test_helper_methods(self):
        """Test helper methods."""
        # Test _get_paper_identifier_info via public wrapper
        paper = {"identifier": "test123"}
        info = self.downloader.get_paper_identifier_info_public(paper)
        self.assertEqual(info, " (test123)")

        # Test _add_service_identifier via public wrapper
        entry = {}
        self.downloader.add_service_identifier_public(entry, "test123")
        self.assertEqual(entry["test_id"], "test123")

    def test_abstract_methods_raise_not_implemented_direct_call(self):
        """Test that base-class abstract methods raise NotImplementedError when called."""
        # Each method is looked up on BasePaperDownloader itself and called with
        # the stub as ``self``, so the base implementation runs rather than the
        # override defined on ConcretePaperDownloader.
        public_calls = [
            (BasePaperDownloader.fetch_metadata, ("test",)),
            (BasePaperDownloader.construct_pdf_url, ({}, "test")),
            (BasePaperDownloader.extract_paper_metadata, ({}, "test", None)),
            (BasePaperDownloader.get_service_name, ()),
            (BasePaperDownloader.get_identifier_name, ()),
            (BasePaperDownloader.get_default_filename, ("test",)),
        ]
        # Protected abstract methods: fetched via getattr to avoid W0212 while
        # still executing code.
        protected_calls = [
            (getattr(BasePaperDownloader, "_get_paper_identifier_info"), ({},)),
            (getattr(BasePaperDownloader, "_add_service_identifier"), ({}, "test")),
        ]

        for method, args in public_calls + protected_calls:
            # subTest reports every offending method, not just the first one.
            with self.subTest(method=method.__name__):
                with self.assertRaises(NotImplementedError):
                    method(self.downloader, *args)

    @patch("tempfile.NamedTemporaryFile")
    @patch("requests.get")
    def test_filename_extraction_exception_handling(self, mock_get, mock_tempfile):
        """Test exception handling during filename extraction."""
        mock_get.return_value = self._make_response_mock(
            chunks=[b"PDF data"],
            headers={"Content-Disposition": 'attachment; filename="paper.pdf"'},
        )
        mock_tempfile.return_value = self._make_temp_file_mock()

        # Patch re.search to raise an exception during filename extraction
        with patch("re.search", side_effect=requests.RequestException("Regex error")):
            result = self.downloader.download_pdf_to_temp(
                "https://test.com/paper.pdf", "12345"
            )

        # Should still succeed but use default filename due to exception
        self.assertEqual(result, ("/tmp/test.pdf", "test_12345.pdf"))

    def test_build_summary_with_temp_file_path(self):
        """Test build_summary with papers that have temp_file_path."""
        article_data = {
            "paper1": {
                "Title": "Paper 1",
                "access_type": "open_access_downloaded",
                "Abstract": "This is a test abstract with multiple sentences."
                "It should be truncated.",
                "temp_file_path": "/tmp/paper1.pdf",
            },
            "paper2": {
                "Title": "Paper 2",
                "access_type": "download_failed",
                "Abstract": "Short abstract.",
                "temp_file_path": "",  # Empty temp_file_path
            },
        }

        result = self.downloader.build_summary(article_data)

        # Should include temp file path for paper1
        self.assertIn("/tmp/paper1.pdf", result)
        self.assertIn("Downloaded to:", result)
        self.assertIn("Abstract snippet:", result)

        # Should include count information
        self.assertIn("2", result)  # Total papers
        self.assertIn("1", result)  # Successfully downloaded
|
472
|
+
|
473
|
+
|
474
|
+
class TestBasePaperDownloaderEdgeCases(unittest.TestCase):
    """Tests for edge cases and error conditions.

    Covers chunk filtering while streaming, unusual Content-Disposition
    values, and empty or duplicated identifier lists.
    """

    def setUp(self):
        """Set up edge case test fixtures."""
        self.mock_config = Mock()
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192

        self.downloader = ConcretePaperDownloader(self.mock_config)

    @patch("tempfile.NamedTemporaryFile")
    @patch("requests.get")
    def test_download_pdf_chunk_filtering(self, mock_get, mock_tempfile):
        """Test that empty chunks are filtered out during download."""
        # Mock response with mixed chunks including None/empty ones
        mock_response = Mock()
        mock_response.raise_for_status = Mock()
        mock_response.iter_content.return_value = [
            b"chunk1",
            None,  # Should be filtered out
            b"",  # Empty chunk, should be filtered out
            b"chunk2",
            None,
            b"chunk3",
        ]
        mock_response.headers = {}
        mock_get.return_value = mock_response

        # Mock temporary file usable as a context manager that yields itself
        mock_temp_file = Mock()
        mock_temp_file.name = "/tmp/test.pdf"
        mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
        mock_temp_file.__exit__ = Mock(return_value=None)
        mock_tempfile.return_value = mock_temp_file

        with patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        ):
            # Call without assigning to avoid 'unused-variable'
            self.downloader.download_pdf_to_temp("https://test.com/paper.pdf", "12345")

        # Should only write non-empty chunks
        self.assertEqual(mock_temp_file.write.call_count, 3)
        for kept_chunk in (b"chunk1", b"chunk2", b"chunk3"):
            mock_temp_file.write.assert_any_call(kept_chunk)

    def test_filename_extraction_regex_edge_cases(self):
        """Test filename extraction with various regex edge cases."""
        # Value returned by the patched get_default_filename; every header that
        # yields no usable PDF filename is expected to resolve to it.
        fallback = "default.pdf"

        # (Content-Disposition value, expected filename)
        test_headers = [
            # Various quote combinations
            ('filename="file with spaces.pdf"', "file with spaces.pdf"),
            ("filename='single_quotes.pdf'", fallback),  # Single quotes don't match regex
            ("filename=no_quotes.pdf", "no_quotes.pdf"),
            # Unicode and special characters
            ('filename="файл.pdf"', "файл.pdf"),
            (
                'filename="file-with-dashes_and_underscores.pdf"',
                "file-with-dashes_and_underscores.pdf",
            ),
            # Edge cases
            ('filename=""', fallback),  # Empty filename falls back to default
            ("filename=", fallback),  # No value falls back to default
            ('other_param=value; filename="actual.pdf"', "actual.pdf"),  # Mixed parameters
            # Invalid cases (should fall back to default)
            ("invalid_header_format", fallback),
            ("filename=not_a_pdf.txt", fallback),  # Non-PDF falls back to default
        ]

        for header_value, expected in test_headers:
            with self.subTest(header=header_value):
                mock_response = Mock()
                mock_response.raise_for_status = Mock()
                mock_response.iter_content.return_value = [b"data"]
                mock_response.headers = {"Content-Disposition": header_value}

                mock_temp_file = Mock()
                mock_temp_file.name = "/tmp/test.pdf"
                mock_temp_file.__enter__ = Mock(return_value=mock_temp_file)
                mock_temp_file.__exit__ = Mock(return_value=None)

                # Build the three patchers first so they can be entered in a
                # single flat ``with`` statement instead of three nested ones.
                http_patch = patch("requests.get", return_value=mock_response)
                tempfile_patch = patch(
                    "tempfile.NamedTemporaryFile", return_value=mock_temp_file
                )
                default_name_patch = patch.object(
                    self.downloader, "get_default_filename", return_value=fallback
                )
                with http_patch, tempfile_patch, default_name_patch:
                    result = self.downloader.download_pdf_to_temp(
                        "https://test.com/paper.pdf", "12345"
                    )

                # Fail with a clear assertion instead of a TypeError on result[1].
                self.assertIsNotNone(result)
                self.assertEqual(result[1], expected)

    def test_process_identifiers_empty_list(self):
        """Test processing empty identifier list."""
        result = self.downloader.process_identifiers([])

        self.assertEqual(result, {})

    def test_process_identifiers_duplicate_handling(self):
        """Test processing list with duplicate identifiers."""
        identifiers = ["12345", "67890", "12345"]  # Duplicate 12345

        with patch.object(self.downloader, "download_pdf_to_temp", return_value=None):
            result = self.downloader.process_identifiers(identifiers)

        # Should only have unique entries
        self.assertEqual(len(result), 2)
        self.assertIn("12345", result)
        self.assertIn("67890", result)
|
598
|
+
|
599
|
+
|
600
|
+
class TestBasePaperDownloaderAbstractMethods(unittest.TestCase):
    """Checks on the abstract/concrete split of the downloader classes."""

    def test_abstract_class_cannot_be_instantiated(self):
        """BasePaperDownloader must be reported as abstract by ``inspect``."""
        base_is_abstract = inspect.isabstract(BasePaperDownloader)

        self.assertTrue(base_is_abstract)

    def test_complete_implementation_succeeds(self):
        """A subclass implementing every hook can be built and queried."""
        # Same settings the other fixtures in this module use.
        stub_config = Mock(request_timeout=30, chunk_size=8192)

        concrete = ConcretePaperDownloader(stub_config)

        # Each hook should return the stub's fixed value.
        service_name = concrete.get_service_name()
        self.assertEqual(service_name, "TestService")

        identifier_name = concrete.get_identifier_name()
        self.assertEqual(identifier_name, "Test ID")

        default_filename = concrete.get_default_filename("test")
        self.assertEqual(default_filename, "test_test.pdf")
|