aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
- aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
- aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
- aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
- aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
- aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
- aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
- aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
- aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
- aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
- aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
- aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
- aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
- aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
- /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,478 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for ArxivDownloader.
|
3
|
+
Tests XML parsing, PDF URL construction, and metadata extraction.
|
4
|
+
"""
|
5
|
+
|
6
|
+
import unittest
|
7
|
+
import xml.etree.ElementTree as ET
|
8
|
+
from unittest.mock import Mock, patch
|
9
|
+
|
10
|
+
import requests
|
11
|
+
|
12
|
+
from aiagents4pharma.talk2scholars.tools.paper_download.utils.arxiv_downloader import (
|
13
|
+
ArxivDownloader,
|
14
|
+
)
|
15
|
+
|
16
|
+
|
17
|
+
class ArxivDownloaderTestShim(ArxivDownloader):
    """Public wrappers to exercise protected helpers without W0212.

    Each wrapper forwards directly to the corresponding protected method on
    :class:`ArxivDownloader`, so tests can invoke the helpers without
    triggering pylint's protected-access warning (W0212).
    """

    def extract_basic_metadata_public(self, entry, ns):
        """Forward to ``_extract_basic_metadata(entry, ns)``."""
        return self._extract_basic_metadata(entry, ns)

    def extract_title_public(self, entry, ns):
        """Forward to ``_extract_title(entry, ns)``."""
        return self._extract_title(entry, ns)

    def extract_authors_public(self, entry, ns):
        """Forward to ``_extract_authors(entry, ns)``."""
        return self._extract_authors(entry, ns)

    def extract_abstract_public(self, entry, ns):
        """Forward to ``_extract_abstract(entry, ns)``."""
        # Fix: docstring previously said "extract_authors_public" (copy-paste
        # error) although this wrapper targets the abstract-extraction helper.
        return self._extract_abstract(entry, ns)

    def extract_publication_date_public(self, entry, ns):
        """Forward to ``_extract_publication_date(entry, ns)``."""
        return self._extract_publication_date(entry, ns)

    def extract_pdf_metadata_public(self, pdf_result, identifier):
        """Forward to ``_extract_pdf_metadata(pdf_result, identifier)``."""
        return self._extract_pdf_metadata(pdf_result, identifier)

    def get_paper_identifier_info_public(self, paper):
        """Forward to ``_get_paper_identifier_info(paper)``."""
        return self._get_paper_identifier_info(paper)

    def add_service_identifier_public(self, entry, identifier):
        """Forward to ``_add_service_identifier``; mutates ``entry`` in place."""
        self._add_service_identifier(entry, identifier)
|
51
|
+
|
52
|
+
|
53
|
+
class TestArxivDownloader(unittest.TestCase):
    """Tests for the ArxivDownloader class.

    All HTTP traffic is mocked; the XML fixtures mimic the Atom feed
    returned by the arXiv query API.
    """

    def setUp(self):
        """Set up test fixtures."""
        # Stub config exposing only the attributes the downloader reads;
        # a Mock avoids depending on real (Hydra) configuration files.
        self.mock_config = Mock()
        self.mock_config.api_url = "http://export.arxiv.org/api/query"
        self.mock_config.pdf_base_url = "https://arxiv.org/pdf"
        self.mock_config.request_timeout = 30
        self.mock_config.chunk_size = 8192
        self.mock_config.xml_namespace = {"atom": "http://www.w3.org/2005/Atom"}

        # Use the testable subclass to avoid W0212 while still covering helpers
        self.downloader = ArxivDownloaderTestShim(self.mock_config)

        # Sample arXiv XML response: one entry with two authors, an HTML
        # "alternate" link, and a PDF link carrying title="pdf".
        self.sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <entry>
        <id>http://arxiv.org/abs/1234.5678v1</id>
        <updated>2023-01-01T12:00:00Z</updated>
        <published>2023-01-01T12:00:00Z</published>
        <title>Test Paper Title</title>
        <summary>This is a test abstract for the paper.</summary>
        <author>
            <name>John Doe</name>
        </author>
        <author>
            <name>Jane Smith</name>
        </author>
        <link href="http://arxiv.org/abs/1234.5678v1" rel="alternate" type="text/html"/>
        <link href="http://arxiv.org/pdf/1234.5678v1.pdf" rel="related" type="application/pdf" title="pdf"/>
    </entry>
</feed>"""

    def test_initialization(self):
        """Test ArxivDownloader initialization."""
        # Config values must land on the instance verbatim.
        self.assertEqual(self.downloader.api_url, "http://export.arxiv.org/api/query")
        self.assertEqual(self.downloader.pdf_base_url, "https://arxiv.org/pdf")
        self.assertEqual(self.downloader.request_timeout, 30)
        self.assertEqual(self.downloader.chunk_size, 8192)

    @patch("requests.get")
    def test_fetch_metadata_success(self, mock_get):
        """Test successful metadata fetching from arXiv API."""
        mock_response = Mock()
        mock_response.text = self.sample_xml
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        result = self.downloader.fetch_metadata("1234.5678")

        # Verify API call - it uses query string format, not params
        expected_url = (
            "http://export.arxiv.org/api/query?search_query="
            "id:1234.5678&start=0&max_results=1"
        )
        mock_get.assert_called_once_with(expected_url, timeout=30)
        mock_response.raise_for_status.assert_called_once()

        # Verify XML parsing: the return is the parsed Atom <feed> root.
        self.assertIsInstance(result, ET.Element)
        self.assertEqual(result.tag, "{http://www.w3.org/2005/Atom}feed")

    @patch("requests.get")
    def test_fetch_metadata_request_error(self, mock_get):
        """Test fetch_metadata with request error."""
        # Network failures must propagate unchanged to the caller.
        mock_get.side_effect = requests.RequestException("Network error")

        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("1234.5678")

    @patch("requests.get")
    def test_fetch_metadata_invalid_xml(self, mock_get):
        """Test fetch_metadata with invalid XML response."""
        mock_response = Mock()
        mock_response.text = "Invalid XML content"
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        # Unparseable payload surfaces as ET.ParseError.
        with self.assertRaises(ET.ParseError):
            self.downloader.fetch_metadata("1234.5678")

    @patch("requests.get")
    def test_fetch_metadata_no_entry_found(self, mock_get):
        """Test fetch_metadata when no entry is found in arXiv API response."""
        # XML response without any entry - note the namespace declarations
        empty_xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
    <title>ArXiv Query: search_query=all:1234.5678</title>
    <id>http://arxiv.org/api/query?search_query=all:1234.5678</id>
    <opensearch:totalResults>0</opensearch:totalResults>
    <opensearch:startIndex>0</opensearch:startIndex>
</feed>"""

        mock_response = Mock()
        mock_response.text = empty_xml
        mock_response.raise_for_status = Mock()
        mock_get.return_value = mock_response

        with self.assertRaises(RuntimeError) as context:
            self.downloader.fetch_metadata("1234.5678")

        self.assertIn("No entry found in arXiv API response", str(context.exception))

    def test_construct_pdf_url_from_metadata(self):
        """Test PDF URL construction from metadata."""
        metadata = ET.fromstring(self.sample_xml)

        result = self.downloader.construct_pdf_url(metadata, "1234.5678")

        # Should extract PDF URL from the link with title="pdf"
        self.assertEqual(result, "http://arxiv.org/pdf/1234.5678v1.pdf")

    def test_construct_pdf_url_fallback(self):
        """Test PDF URL construction fallback when not found in metadata."""
        # XML without PDF link
        xml_no_pdf = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <entry>
        <id>http://arxiv.org/abs/1234.5678v1</id>
        <title>Test Paper Title</title>
        <link href="http://arxiv.org/abs/1234.5678v1" rel="alternate" type="text/html"/>
    </entry>
</feed>"""

        metadata = ET.fromstring(xml_no_pdf)

        result = self.downloader.construct_pdf_url(metadata, "1234.5678")

        # Should fallback to constructed URL (pdf_base_url + identifier)
        self.assertEqual(result, "https://arxiv.org/pdf/1234.5678.pdf")

    def test_construct_pdf_url_no_entry(self):
        """Test PDF URL construction with no entry in metadata."""
        xml_no_entry = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>"""

        metadata = ET.fromstring(xml_no_entry)

        result = self.downloader.construct_pdf_url(metadata, "1234.5678")

        # Should return empty string when no entry found
        self.assertEqual(result, "")

    def test_extract_paper_metadata_success(self):
        """Test successful paper metadata extraction."""
        metadata = ET.fromstring(self.sample_xml)
        # pdf_result mimics download_pdf_to_temp: (temp path, filename).
        pdf_result = ("/tmp/test.pdf", "test_paper.pdf")

        result = self.downloader.extract_paper_metadata(
            metadata, "1234.5678", pdf_result
        )

        # Verify extracted metadata (full record, compared exactly)
        expected_metadata = {
            "Title": "Test Paper Title",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for the paper.",
            "Publication Date": "2023-01-01T12:00:00Z",
            "URL": "/tmp/test.pdf",
            "pdf_url": "/tmp/test.pdf",
            "filename": "test_paper.pdf",
            "source": "arxiv",
            "arxiv_id": "1234.5678",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/test.pdf",
        }

        self.assertEqual(result, expected_metadata)

    def test_extract_paper_metadata_no_pdf(self):
        """Test metadata extraction without PDF download."""
        metadata = ET.fromstring(self.sample_xml)

        with patch.object(
            self.downloader, "get_default_filename", return_value="1234.5678.pdf"
        ):
            result = self.downloader.extract_paper_metadata(metadata, "1234.5678", None)

        # Verify metadata without PDF: empty URL fields, failed access type.
        self.assertEqual(result["Title"], "Test Paper Title")
        self.assertEqual(result["URL"], "")
        self.assertEqual(result["pdf_url"], "")
        self.assertEqual(result["filename"], "1234.5678.pdf")
        self.assertEqual(result["access_type"], "download_failed")
        self.assertEqual(result["temp_file_path"], "")

    def test_extract_paper_metadata_no_entry(self):
        """Test metadata extraction with no entry in XML."""
        xml_no_entry = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>"""

        metadata = ET.fromstring(xml_no_entry)

        with self.assertRaises(RuntimeError) as context:
            self.downloader.extract_paper_metadata(metadata, "1234.5678", None)

        self.assertIn("No entry found in metadata", str(context.exception))

    def test_extract_basic_metadata(self):
        """Test basic metadata extraction helper method."""
        metadata = ET.fromstring(self.sample_xml)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        entry = metadata.find("atom:entry", ns)

        result = self.downloader.extract_basic_metadata_public(entry, ns)

        expected = {
            "Title": "Test Paper Title",
            "Authors": ["John Doe", "Jane Smith"],
            "Abstract": "This is a test abstract for the paper.",
            "Publication Date": "2023-01-01T12:00:00Z",
        }
        self.assertEqual(result, expected)

    def test_extract_title_variants(self):
        """Title extraction for present and missing cases."""
        ns = {"atom": "http://www.w3.org/2005/Atom"}

        # Case 1: Title present
        metadata1 = ET.fromstring(self.sample_xml)
        entry1 = metadata1.find("atom:entry", ns)
        self.assertEqual(
            self.downloader.extract_title_public(entry1, ns), "Test Paper Title"
        )

        # Case 2: Title missing -> "N/A" sentinel
        xml_no_title = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <entry>
        <id>http://arxiv.org/abs/1234.5678v1</id>
    </entry>
</feed>"""
        metadata2 = ET.fromstring(xml_no_title)
        entry2 = metadata2.find("atom:entry", ns)
        self.assertEqual(self.downloader.extract_title_public(entry2, ns), "N/A")

    def test_extract_authors_variants(self):
        """Authors extraction for present and empty cases."""
        ns = {"atom": "http://www.w3.org/2005/Atom"}

        # Case 1: Authors present
        metadata1 = ET.fromstring(self.sample_xml)
        entry1 = metadata1.find("atom:entry", ns)
        self.assertEqual(
            self.downloader.extract_authors_public(entry1, ns),
            ["John Doe", "Jane Smith"],
        )

        # Case 2: Authors missing -> empty list, not None
        xml_no_authors = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <entry>
        <id>http://arxiv.org/abs/1234.5678v1</id>
        <title>Test Paper Title</title>
    </entry>
</feed>"""
        metadata2 = ET.fromstring(xml_no_authors)
        entry2 = metadata2.find("atom:entry", ns)
        self.assertEqual(self.downloader.extract_authors_public(entry2, ns), [])

    def test_extract_abstract_and_publication_date(self):
        """Abstract and publication date extraction."""
        metadata = ET.fromstring(self.sample_xml)
        ns = {"atom": "http://www.w3.org/2005/Atom"}
        entry = metadata.find("atom:entry", ns)

        self.assertEqual(
            self.downloader.extract_abstract_public(entry, ns),
            "This is a test abstract for the paper.",
        )
        self.assertEqual(
            self.downloader.extract_publication_date_public(entry, ns),
            "2023-01-01T12:00:00Z",
        )

    def test_extract_pdf_metadata_variants(self):
        """PDF metadata extraction with and without a download result."""
        # With result: tuple is (temp path, filename)
        pdf_result = ("/tmp/test.pdf", "paper.pdf")
        expected_with = {
            "URL": "/tmp/test.pdf",
            "pdf_url": "/tmp/test.pdf",
            "filename": "paper.pdf",
            "access_type": "open_access_downloaded",
            "temp_file_path": "/tmp/test.pdf",
        }
        self.assertEqual(
            self.downloader.extract_pdf_metadata_public(pdf_result, "1234.5678"),
            expected_with,
        )

        # Without result: falls back to get_default_filename (mocked here)
        with patch.object(
            self.downloader, "get_default_filename", return_value="default.pdf"
        ):
            expected_without = {
                "URL": "",
                "pdf_url": "",
                "filename": "default.pdf",
                "access_type": "download_failed",
                "temp_file_path": "",
            }
            self.assertEqual(
                self.downloader.extract_pdf_metadata_public(None, "1234.5678"),
                expected_without,
            )

    def test_service_and_identifier_helpers(self):
        """Service name, identifier name, and default filename helpers."""
        self.assertEqual(self.downloader.get_service_name(), "arXiv")
        self.assertEqual(self.downloader.get_identifier_name(), "arXiv ID")
        self.assertEqual(
            self.downloader.get_default_filename("1234.5678"), "1234.5678.pdf"
        )

    def test_get_paper_identifier_info(self):
        """Test _get_paper_identifier_info method."""
        paper = {"arxiv_id": "1234.5678", "Publication Date": "2023-01-01T12:00:00Z"}

        result = self.downloader.get_paper_identifier_info_public(paper)

        # The summary string must mention both the ID and the (date part
        # of the) publication date.
        self.assertIn("1234.5678", result)
        self.assertIn("2023-01-01", result)

    def test_add_service_identifier(self):
        """Test _add_service_identifier method."""
        entry = {}

        # Mutates the dict in place rather than returning a new one.
        self.downloader.add_service_identifier_public(entry, "1234.5678")

        self.assertEqual(entry["arxiv_id"], "1234.5678")
|
388
|
+
|
389
|
+
|
390
|
+
class TestArxivDownloaderIntegration(unittest.TestCase):
    """End-to-end tests of ArxivDownloader with all external calls mocked."""

    def setUp(self):
        """Build a stub config, a shim downloader, and a canned API feed."""
        cfg = Mock()
        cfg.api_url = "http://export.arxiv.org/api/query"
        cfg.pdf_base_url = "https://arxiv.org/pdf"
        cfg.request_timeout = 30
        cfg.chunk_size = 8192
        cfg.xml_namespace = {"atom": "http://www.w3.org/2005/Atom"}
        self.mock_config = cfg

        self.downloader = ArxivDownloaderTestShim(cfg)

        self.sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
    <entry>
        <id>http://arxiv.org/abs/1234.5678v1</id>
        <published>2023-01-01T12:00:00Z</published>
        <title>Integration Test Paper</title>
        <summary>This is a test abstract.</summary>
        <author>
            <name>Test Author</name>
        </author>
        <link href="http://arxiv.org/pdf/1234.5678v1.pdf" rel="related" type="application/pdf" title="pdf"/>
    </entry>
</feed>"""

    def _run_pipeline(self, identifier):
        """Drive one identifier through fetch -> URL -> download -> extract."""
        feed = self.downloader.fetch_metadata(identifier)
        url = self.downloader.construct_pdf_url(feed, identifier)
        downloaded = self.downloader.download_pdf_to_temp(url, identifier)
        return self.downloader.extract_paper_metadata(feed, identifier, downloaded)

    @patch(
        "aiagents4pharma.talk2scholars.tools.paper_download.utils."
        "arxiv_downloader.ArxivDownloader.download_pdf_to_temp"
    )
    @patch("requests.get")
    def test_full_paper_processing_workflow(self, mock_get, mock_download):
        """Complete workflow from identifier to processed paper record."""
        # Canned HTTP reply feeding fetch_metadata.
        api_reply = Mock()
        api_reply.text = self.sample_xml
        api_reply.raise_for_status = Mock()
        mock_get.return_value = api_reply

        # Canned download result: (temp file path, filename).
        mock_download.return_value = ("/tmp/paper.pdf", "1234.5678.pdf")

        results = {ident: self._run_pipeline(ident) for ident in ["1234.5678"]}

        # The pipeline must produce a complete record for the identifier.
        self.assertIn("1234.5678", results)
        record = results["1234.5678"]

        self.assertEqual(record["Title"], "Integration Test Paper")
        self.assertEqual(record["Authors"], ["Test Author"])
        self.assertEqual(record["access_type"], "open_access_downloaded")
        self.assertEqual(record["filename"], "1234.5678.pdf")
        self.assertEqual(record["temp_file_path"], "/tmp/paper.pdf")

        # Exactly one API hit and one download, with the URL taken from
        # the feed's title="pdf" link.
        mock_get.assert_called_once()
        mock_download.assert_called_once_with(
            "http://arxiv.org/pdf/1234.5678v1.pdf", "1234.5678"
        )

    @patch("requests.get")
    def test_error_handling_workflow(self, mock_get):
        """A network failure must propagate out of fetch_metadata."""
        mock_get.side_effect = requests.RequestException("Network error")

        with self.assertRaises(requests.RequestException):
            self.downloader.fetch_metadata("1234.5678")
|