aiagents4pharma 1.41.0__py3-none-any.whl → 1.43.0__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (49)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +7 -4
  9. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +49 -95
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +15 -1
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +16 -2
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +40 -5
  13. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +15 -5
  14. aiagents4pharma/talk2scholars/configs/config.yaml +1 -3
  15. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  16. aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py +478 -0
  17. aiagents4pharma/talk2scholars/tests/test_base_paper_downloader.py +620 -0
  18. aiagents4pharma/talk2scholars/tests/test_biorxiv_downloader.py +697 -0
  19. aiagents4pharma/talk2scholars/tests/test_medrxiv_downloader.py +534 -0
  20. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +22 -12
  21. aiagents4pharma/talk2scholars/tests/test_paper_downloader.py +545 -0
  22. aiagents4pharma/talk2scholars/tests/test_pubmed_downloader.py +1067 -0
  23. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +2 -4
  24. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +457 -0
  25. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +20 -0
  26. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +209 -0
  27. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +343 -0
  28. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +321 -0
  29. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +198 -0
  30. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +337 -0
  31. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +97 -45
  32. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +47 -29
  33. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/METADATA +30 -14
  34. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/RECORD +38 -30
  35. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +0 -4
  36. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/__init__.py +0 -3
  37. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +0 -2
  38. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/__init__.py +0 -3
  39. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +0 -2
  40. aiagents4pharma/talk2scholars/tests/test_paper_download_biorxiv.py +0 -151
  41. aiagents4pharma/talk2scholars/tests/test_paper_download_medrxiv.py +0 -151
  42. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +0 -249
  43. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +0 -177
  44. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +0 -114
  45. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +0 -114
  46. /aiagents4pharma/talk2scholars/configs/tools/{download_arxiv_paper → paper_download}/__init__.py +0 -0
  47. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/WHEEL +0 -0
  48. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/licenses/LICENSE +0 -0
  49. {aiagents4pharma-1.41.0.dist-info → aiagents4pharma-1.43.0.dist-info}/top_level.txt +0 -0
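The moved and deleted entries above show the per-source download tools (download_arxiv_paper, download_biorxiv_paper, download_medrxiv_paper) being consolidated into a single paper_download tool, with per-source logic under tools/paper_download/utils/ (arXiv, bioRxiv, medRxiv, plus a new PubMed downloader built on base_paper_downloader.py). As rough orientation for the new arXiv tests shown below, here is a minimal sketch of driving ArxivDownloader directly; the constructor, config fields, and method names are taken from those tests, while the SimpleNamespace config and literal values are illustrative stand-ins for whatever Hydra loads from configs/tools/paper_download/default.yaml, and this is not the tool's documented entry point.

```python
# Illustrative sketch only: mirrors the workflow exercised by the tests below.
# Running it as-is would hit the live arXiv API; the tests mock requests.get
# and download_pdf_to_temp instead.
from types import SimpleNamespace

from aiagents4pharma.talk2scholars.tools.paper_download.utils.arxiv_downloader import (
    ArxivDownloader,
)

# Stand-in for the Hydra config; field names and values copied from the test fixtures.
config = SimpleNamespace(
    api_url="http://export.arxiv.org/api/query",
    pdf_base_url="https://arxiv.org/pdf",
    request_timeout=30,
    chunk_size=8192,
    xml_namespace={"atom": "http://www.w3.org/2005/Atom"},
)

downloader = ArxivDownloader(config)
arxiv_id = "1234.5678"

metadata = downloader.fetch_metadata(arxiv_id)              # parsed Atom feed (ET.Element)
pdf_url = downloader.construct_pdf_url(metadata, arxiv_id)  # PDF link from the entry, or a fallback URL
pdf_result = downloader.download_pdf_to_temp(pdf_url, arxiv_id)  # (temp_file_path, filename) on success
paper = downloader.extract_paper_metadata(metadata, arxiv_id, pdf_result)
```

The integration test at the end of the diff exercises exactly this fetch → construct → download → extract sequence, with requests.get and download_pdf_to_temp mocked.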
aiagents4pharma/talk2scholars/tests/test_arxiv_downloader.py
@@ -0,0 +1,478 @@
+ """
+ Unit tests for ArxivDownloader.
+ Tests XML parsing, PDF URL construction, and metadata extraction.
+ """
+
+ import unittest
+ import xml.etree.ElementTree as ET
+ from unittest.mock import Mock, patch
+
+ import requests
+
+ from aiagents4pharma.talk2scholars.tools.paper_download.utils.arxiv_downloader import (
+     ArxivDownloader,
+ )
+
+
+ class ArxivDownloaderTestShim(ArxivDownloader):
+     """Public wrappers to exercise protected helpers without W0212."""
+
+     def extract_basic_metadata_public(self, entry, ns):
+         """extract_basic_metadata_public"""
+         return self._extract_basic_metadata(entry, ns)
+
+     def extract_title_public(self, entry, ns):
+         """extract_title_public"""
+         return self._extract_title(entry, ns)
+
+     def extract_authors_public(self, entry, ns):
+         """extract_authors_public"""
+         return self._extract_authors(entry, ns)
+
+     def extract_abstract_public(self, entry, ns):
+         """extract_abstract_public"""
+         return self._extract_abstract(entry, ns)
+
+     def extract_publication_date_public(self, entry, ns):
+         """extract_publication_date_public"""
+         return self._extract_publication_date(entry, ns)
+
+     def extract_pdf_metadata_public(self, pdf_result, identifier):
+         """extract_pdf_metadata_public"""
+         return self._extract_pdf_metadata(pdf_result, identifier)
+
+     def get_paper_identifier_info_public(self, paper):
+         """get_paper_identifier_info_public"""
+         return self._get_paper_identifier_info(paper)
+
+     def add_service_identifier_public(self, entry, identifier):
+         """add_service_identifier_public"""
+         self._add_service_identifier(entry, identifier)
+
+
+ class TestArxivDownloader(unittest.TestCase):
+     """Tests for the ArxivDownloader class."""
+
+     def setUp(self):
+         """Set up test fixtures."""
+         self.mock_config = Mock()
+         self.mock_config.api_url = "http://export.arxiv.org/api/query"
+         self.mock_config.pdf_base_url = "https://arxiv.org/pdf"
+         self.mock_config.request_timeout = 30
+         self.mock_config.chunk_size = 8192
+         self.mock_config.xml_namespace = {"atom": "http://www.w3.org/2005/Atom"}
+
+         # Use the testable subclass to avoid W0212 while still covering helpers
+         self.downloader = ArxivDownloaderTestShim(self.mock_config)
+
+         # Sample arXiv XML response
+         self.sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <id>http://arxiv.org/abs/1234.5678v1</id>
+ <updated>2023-01-01T12:00:00Z</updated>
+ <published>2023-01-01T12:00:00Z</published>
+ <title>Test Paper Title</title>
+ <summary>This is a test abstract for the paper.</summary>
+ <author>
+ <name>John Doe</name>
+ </author>
+ <author>
+ <name>Jane Smith</name>
+ </author>
+ <link href="http://arxiv.org/abs/1234.5678v1" rel="alternate" type="text/html"/>
+ <link href="http://arxiv.org/pdf/1234.5678v1.pdf" rel="related" type="application/pdf" title="pdf"/>
+ </entry>
+ </feed>"""
+
+     def test_initialization(self):
+         """Test ArxivDownloader initialization."""
+         self.assertEqual(self.downloader.api_url, "http://export.arxiv.org/api/query")
+         self.assertEqual(self.downloader.pdf_base_url, "https://arxiv.org/pdf")
+         self.assertEqual(self.downloader.request_timeout, 30)
+         self.assertEqual(self.downloader.chunk_size, 8192)
+
+     @patch("requests.get")
+     def test_fetch_metadata_success(self, mock_get):
+         """Test successful metadata fetching from arXiv API."""
+         mock_response = Mock()
+         mock_response.text = self.sample_xml
+         mock_response.raise_for_status = Mock()
+         mock_get.return_value = mock_response
+
+         result = self.downloader.fetch_metadata("1234.5678")
+
+         # Verify API call - it uses query string format, not params
+         expected_url = (
+             "http://export.arxiv.org/api/query?search_query="
+             "id:1234.5678&start=0&max_results=1"
+         )
+         mock_get.assert_called_once_with(expected_url, timeout=30)
+         mock_response.raise_for_status.assert_called_once()
+
+         # Verify XML parsing
+         self.assertIsInstance(result, ET.Element)
+         self.assertEqual(result.tag, "{http://www.w3.org/2005/Atom}feed")
+
+     @patch("requests.get")
+     def test_fetch_metadata_request_error(self, mock_get):
+         """Test fetch_metadata with request error."""
+         mock_get.side_effect = requests.RequestException("Network error")
+
+         with self.assertRaises(requests.RequestException):
+             self.downloader.fetch_metadata("1234.5678")
+
+     @patch("requests.get")
+     def test_fetch_metadata_invalid_xml(self, mock_get):
+         """Test fetch_metadata with invalid XML response."""
+         mock_response = Mock()
+         mock_response.text = "Invalid XML content"
+         mock_response.raise_for_status = Mock()
+         mock_get.return_value = mock_response
+
+         with self.assertRaises(ET.ParseError):
+             self.downloader.fetch_metadata("1234.5678")
+
+     @patch("requests.get")
+     def test_fetch_metadata_no_entry_found(self, mock_get):
+         """Test fetch_metadata when no entry is found in arXiv API response."""
+         # XML response without any entry - note the namespace declarations
+         empty_xml = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom" xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">
+ <title>ArXiv Query: search_query=all:1234.5678</title>
+ <id>http://arxiv.org/api/query?search_query=all:1234.5678</id>
+ <opensearch:totalResults>0</opensearch:totalResults>
+ <opensearch:startIndex>0</opensearch:startIndex>
+ </feed>"""
+
+         mock_response = Mock()
+         mock_response.text = empty_xml
+         mock_response.raise_for_status = Mock()
+         mock_get.return_value = mock_response
+
+         with self.assertRaises(RuntimeError) as context:
+             self.downloader.fetch_metadata("1234.5678")
+
+         self.assertIn("No entry found in arXiv API response", str(context.exception))
+
+     def test_construct_pdf_url_from_metadata(self):
+         """Test PDF URL construction from metadata."""
+         metadata = ET.fromstring(self.sample_xml)
+
+         result = self.downloader.construct_pdf_url(metadata, "1234.5678")
+
+         # Should extract PDF URL from the link with title="pdf"
+         self.assertEqual(result, "http://arxiv.org/pdf/1234.5678v1.pdf")
+
+     def test_construct_pdf_url_fallback(self):
+         """Test PDF URL construction fallback when not found in metadata."""
+         # XML without PDF link
+         xml_no_pdf = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <id>http://arxiv.org/abs/1234.5678v1</id>
+ <title>Test Paper Title</title>
+ <link href="http://arxiv.org/abs/1234.5678v1" rel="alternate" type="text/html"/>
+ </entry>
+ </feed>"""
+
+         metadata = ET.fromstring(xml_no_pdf)
+
+         result = self.downloader.construct_pdf_url(metadata, "1234.5678")
+
+         # Should fallback to constructed URL
+         self.assertEqual(result, "https://arxiv.org/pdf/1234.5678.pdf")
+
+     def test_construct_pdf_url_no_entry(self):
+         """Test PDF URL construction with no entry in metadata."""
+         xml_no_entry = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ </feed>"""
+
+         metadata = ET.fromstring(xml_no_entry)
+
+         result = self.downloader.construct_pdf_url(metadata, "1234.5678")
+
+         # Should return empty string when no entry found
+         self.assertEqual(result, "")
+
+     def test_extract_paper_metadata_success(self):
+         """Test successful paper metadata extraction."""
+         metadata = ET.fromstring(self.sample_xml)
+         pdf_result = ("/tmp/test.pdf", "test_paper.pdf")
+
+         result = self.downloader.extract_paper_metadata(
+             metadata, "1234.5678", pdf_result
+         )
+
+         # Verify extracted metadata
+         expected_metadata = {
+             "Title": "Test Paper Title",
+             "Authors": ["John Doe", "Jane Smith"],
+             "Abstract": "This is a test abstract for the paper.",
+             "Publication Date": "2023-01-01T12:00:00Z",
+             "URL": "/tmp/test.pdf",
+             "pdf_url": "/tmp/test.pdf",
+             "filename": "test_paper.pdf",
+             "source": "arxiv",
+             "arxiv_id": "1234.5678",
+             "access_type": "open_access_downloaded",
+             "temp_file_path": "/tmp/test.pdf",
+         }
+
+         self.assertEqual(result, expected_metadata)
+
+     def test_extract_paper_metadata_no_pdf(self):
+         """Test metadata extraction without PDF download."""
+         metadata = ET.fromstring(self.sample_xml)
+
+         with patch.object(
+             self.downloader, "get_default_filename", return_value="1234.5678.pdf"
+         ):
+             result = self.downloader.extract_paper_metadata(metadata, "1234.5678", None)
+
+         # Verify metadata without PDF
+         self.assertEqual(result["Title"], "Test Paper Title")
+         self.assertEqual(result["URL"], "")
+         self.assertEqual(result["pdf_url"], "")
+         self.assertEqual(result["filename"], "1234.5678.pdf")
+         self.assertEqual(result["access_type"], "download_failed")
+         self.assertEqual(result["temp_file_path"], "")
+
+     def test_extract_paper_metadata_no_entry(self):
+         """Test metadata extraction with no entry in XML."""
+         xml_no_entry = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ </feed>"""
+
+         metadata = ET.fromstring(xml_no_entry)
+
+         with self.assertRaises(RuntimeError) as context:
+             self.downloader.extract_paper_metadata(metadata, "1234.5678", None)
+
+         self.assertIn("No entry found in metadata", str(context.exception))
+
+     def test_extract_basic_metadata(self):
+         """Test basic metadata extraction helper method."""
+         metadata = ET.fromstring(self.sample_xml)
+         ns = {"atom": "http://www.w3.org/2005/Atom"}
+         entry = metadata.find("atom:entry", ns)
+
+         result = self.downloader.extract_basic_metadata_public(entry, ns)
+
+         expected = {
+             "Title": "Test Paper Title",
+             "Authors": ["John Doe", "Jane Smith"],
+             "Abstract": "This is a test abstract for the paper.",
+             "Publication Date": "2023-01-01T12:00:00Z",
+         }
+         self.assertEqual(result, expected)
+
+     def test_extract_title_variants(self):
+         """Title extraction for present and missing cases."""
+         ns = {"atom": "http://www.w3.org/2005/Atom"}
+
+         # Case 1: Title present
+         metadata1 = ET.fromstring(self.sample_xml)
+         entry1 = metadata1.find("atom:entry", ns)
+         self.assertEqual(
+             self.downloader.extract_title_public(entry1, ns), "Test Paper Title"
+         )
+
+         # Case 2: Title missing
+         xml_no_title = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <id>http://arxiv.org/abs/1234.5678v1</id>
+ </entry>
+ </feed>"""
+         metadata2 = ET.fromstring(xml_no_title)
+         entry2 = metadata2.find("atom:entry", ns)
+         self.assertEqual(self.downloader.extract_title_public(entry2, ns), "N/A")
+
+     def test_extract_authors_variants(self):
+         """Authors extraction for present and empty cases."""
+         ns = {"atom": "http://www.w3.org/2005/Atom"}
+
+         # Case 1: Authors present
+         metadata1 = ET.fromstring(self.sample_xml)
+         entry1 = metadata1.find("atom:entry", ns)
+         self.assertEqual(
+             self.downloader.extract_authors_public(entry1, ns),
+             ["John Doe", "Jane Smith"],
+         )
+
+         # Case 2: Authors missing
+         xml_no_authors = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <id>http://arxiv.org/abs/1234.5678v1</id>
+ <title>Test Paper Title</title>
+ </entry>
+ </feed>"""
+         metadata2 = ET.fromstring(xml_no_authors)
+         entry2 = metadata2.find("atom:entry", ns)
+         self.assertEqual(self.downloader.extract_authors_public(entry2, ns), [])
+
+     def test_extract_abstract_and_publication_date(self):
+         """Abstract and publication date extraction."""
+         metadata = ET.fromstring(self.sample_xml)
+         ns = {"atom": "http://www.w3.org/2005/Atom"}
+         entry = metadata.find("atom:entry", ns)
+
+         self.assertEqual(
+             self.downloader.extract_abstract_public(entry, ns),
+             "This is a test abstract for the paper.",
+         )
+         self.assertEqual(
+             self.downloader.extract_publication_date_public(entry, ns),
+             "2023-01-01T12:00:00Z",
+         )
+
+     def test_extract_pdf_metadata_variants(self):
+         """PDF metadata extraction with and without a download result."""
+         # With result
+         pdf_result = ("/tmp/test.pdf", "paper.pdf")
+         expected_with = {
+             "URL": "/tmp/test.pdf",
+             "pdf_url": "/tmp/test.pdf",
+             "filename": "paper.pdf",
+             "access_type": "open_access_downloaded",
+             "temp_file_path": "/tmp/test.pdf",
+         }
+         self.assertEqual(
+             self.downloader.extract_pdf_metadata_public(pdf_result, "1234.5678"),
+             expected_with,
+         )
+
+         # Without result
+         with patch.object(
+             self.downloader, "get_default_filename", return_value="default.pdf"
+         ):
+             expected_without = {
+                 "URL": "",
+                 "pdf_url": "",
+                 "filename": "default.pdf",
+                 "access_type": "download_failed",
+                 "temp_file_path": "",
+             }
+             self.assertEqual(
+                 self.downloader.extract_pdf_metadata_public(None, "1234.5678"),
+                 expected_without,
+             )
+
+     def test_service_and_identifier_helpers(self):
+         """Service name, identifier name, and default filename helpers."""
+         self.assertEqual(self.downloader.get_service_name(), "arXiv")
+         self.assertEqual(self.downloader.get_identifier_name(), "arXiv ID")
+         self.assertEqual(
+             self.downloader.get_default_filename("1234.5678"), "1234.5678.pdf"
+         )
+
+     def test_get_paper_identifier_info(self):
+         """Test _get_paper_identifier_info method."""
+         paper = {"arxiv_id": "1234.5678", "Publication Date": "2023-01-01T12:00:00Z"}
+
+         result = self.downloader.get_paper_identifier_info_public(paper)
+
+         self.assertIn("1234.5678", result)
+         self.assertIn("2023-01-01", result)
+
+     def test_add_service_identifier(self):
+         """Test _add_service_identifier method."""
+         entry = {}
+
+         self.downloader.add_service_identifier_public(entry, "1234.5678")
+
+         self.assertEqual(entry["arxiv_id"], "1234.5678")
+
+
+ class TestArxivDownloaderIntegration(unittest.TestCase):
+     """Integration tests for ArxivDownloader with mocked external dependencies."""
+
+     def setUp(self):
+         """Set up integration test fixtures."""
+         self.mock_config = Mock()
+         self.mock_config.api_url = "http://export.arxiv.org/api/query"
+         self.mock_config.pdf_base_url = "https://arxiv.org/pdf"
+         self.mock_config.request_timeout = 30
+         self.mock_config.chunk_size = 8192
+         self.mock_config.xml_namespace = {"atom": "http://www.w3.org/2005/Atom"}
+
+         self.downloader = ArxivDownloaderTestShim(self.mock_config)
+
+         self.sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <entry>
+ <id>http://arxiv.org/abs/1234.5678v1</id>
+ <published>2023-01-01T12:00:00Z</published>
+ <title>Integration Test Paper</title>
+ <summary>This is a test abstract.</summary>
+ <author>
+ <name>Test Author</name>
+ </author>
+ <link href="http://arxiv.org/pdf/1234.5678v1.pdf" rel="related" type="application/pdf" title="pdf"/>
+ </entry>
+ </feed>"""
+
+     @patch(
+         "aiagents4pharma.talk2scholars.tools.paper_download.utils."
+         "arxiv_downloader.ArxivDownloader.download_pdf_to_temp"
+     )
+     @patch("requests.get")
+     def test_full_paper_processing_workflow(self, mock_get, mock_download):
+         """Test the complete workflow from identifier to processed paper data."""
+         # Mock API response
+         mock_response = Mock()
+         mock_response.text = self.sample_xml
+         mock_response.raise_for_status = Mock()
+         mock_get.return_value = mock_response
+
+         # Mock PDF download
+         mock_download.return_value = ("/tmp/paper.pdf", "1234.5678.pdf")
+
+         # Simulate the workflow
+         identifiers = ["1234.5678"]
+         results = {}
+
+         for identifier in identifiers:
+             # Step 1: Fetch metadata
+             metadata = self.downloader.fetch_metadata(identifier)
+
+             # Step 2: Construct PDF URL
+             pdf_url = self.downloader.construct_pdf_url(metadata, identifier)
+
+             # Step 3: Download PDF
+             pdf_result = self.downloader.download_pdf_to_temp(pdf_url, identifier)
+
+             # Step 4: Extract metadata
+             paper_data = self.downloader.extract_paper_metadata(
+                 metadata, identifier, pdf_result
+             )
+
+             results[identifier] = paper_data
+
+         # Verify the complete workflow
+         self.assertIn("1234.5678", results)
+         paper = results["1234.5678"]
+
+         self.assertEqual(paper["Title"], "Integration Test Paper")
+         self.assertEqual(paper["Authors"], ["Test Author"])
+         self.assertEqual(paper["access_type"], "open_access_downloaded")
+         self.assertEqual(paper["filename"], "1234.5678.pdf")
+         self.assertEqual(paper["temp_file_path"], "/tmp/paper.pdf")
+
+         # Verify method calls
+         mock_get.assert_called_once()
+         mock_download.assert_called_once_with(
+             "http://arxiv.org/pdf/1234.5678v1.pdf", "1234.5678"
+         )
+
+     @patch("requests.get")
+     def test_error_handling_workflow(self, mock_get):
+         """Test error handling in the workflow."""
+         # Mock network error
+         mock_get.side_effect = requests.RequestException("Network error")
+
+         with self.assertRaises(requests.RequestException):
+             self.downloader.fetch_metadata("1234.5678")