aiagents4pharma 1.31.0__py3-none-any.whl → 1.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -0
- aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +44 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +1 -0
- aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +90 -0
- aiagents4pharma/talk2scholars/agents/main_agent.py +4 -3
- aiagents4pharma/talk2scholars/agents/paper_download_agent.py +3 -4
- aiagents4pharma/talk2scholars/agents/pdf_agent.py +6 -7
- aiagents4pharma/talk2scholars/agents/s2_agent.py +23 -20
- aiagents4pharma/talk2scholars/agents/zotero_agent.py +11 -11
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +19 -19
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +20 -15
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +27 -6
- aiagents4pharma/talk2scholars/state/state_talk2scholars.py +7 -7
- aiagents4pharma/talk2scholars/tests/test_main_agent.py +16 -16
- aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +17 -24
- aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +152 -135
- aiagents4pharma/talk2scholars/tests/test_pdf_agent.py +9 -16
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +790 -218
- aiagents4pharma/talk2scholars/tests/test_s2_agent.py +9 -9
- aiagents4pharma/talk2scholars/tests/test_s2_display.py +8 -8
- aiagents4pharma/talk2scholars/tests/test_s2_query.py +8 -8
- aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +12 -12
- aiagents4pharma/talk2scholars/tests/test_zotero_path.py +11 -12
- aiagents4pharma/talk2scholars/tests/test_zotero_read.py +400 -22
- aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +0 -6
- aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +89 -31
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +540 -156
- aiagents4pharma/talk2scholars/tools/s2/__init__.py +4 -4
- aiagents4pharma/talk2scholars/tools/s2/{display_results.py → display_dataframe.py} +19 -21
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +71 -0
- aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +213 -35
- aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +3 -3
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/METADATA +3 -1
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/RECORD +37 -37
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/WHEEL +1 -1
- aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py +0 -45
- aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py +0 -115
- aiagents4pharma/talk2scholars/tools/s2/query_results.py +0 -61
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,12 @@ import unittest
|
|
7
7
|
from unittest.mock import patch, MagicMock
|
8
8
|
from langgraph.types import Command
|
9
9
|
from aiagents4pharma.talk2scholars.tools.zotero.zotero_read import zotero_read
|
10
|
+
from aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper import (
|
11
|
+
ZoteroSearchData,
|
12
|
+
)
|
10
13
|
|
14
|
+
# pylint: disable=protected-access
|
15
|
+
# pylint: disable=protected-access, too-many-arguments, too-many-positional-arguments
|
11
16
|
|
12
17
|
# Dummy Hydra configuration to be used in tests
|
13
18
|
dummy_zotero_read_config = SimpleNamespace(
|
@@ -92,11 +97,11 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
92
97
|
# Verify the Command update structure and contents
|
93
98
|
self.assertIsInstance(result, Command)
|
94
99
|
update = result.update
|
95
|
-
self.assertIn("
|
100
|
+
self.assertIn("article_data", update)
|
96
101
|
self.assertIn("last_displayed_papers", update)
|
97
102
|
self.assertIn("messages", update)
|
98
103
|
|
99
|
-
filtered_papers = update["
|
104
|
+
filtered_papers = update["article_data"]
|
100
105
|
self.assertIn("paper1", filtered_papers)
|
101
106
|
self.assertIn("paper2", filtered_papers)
|
102
107
|
message_content = update["messages"][0].content
|
@@ -149,7 +154,7 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
149
154
|
result = zotero_read.run(tool_input)
|
150
155
|
|
151
156
|
update = result.update
|
152
|
-
filtered_papers = update["
|
157
|
+
filtered_papers = update["article_data"]
|
153
158
|
self.assertIn("paper1", filtered_papers)
|
154
159
|
fake_zot.items.assert_called_with(
|
155
160
|
limit=dummy_cfg.tools.zotero_read.zotero.max_limit
|
@@ -198,17 +203,19 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
198
203
|
@patch(
|
199
204
|
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
200
205
|
)
|
206
|
+
@patch(
|
207
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper."
|
208
|
+
"ZoteroSearchData._download_pdfs_in_parallel"
|
209
|
+
)
|
201
210
|
def test_filtering_no_matching_papers(
|
202
211
|
self,
|
212
|
+
mock_batch_download,
|
203
213
|
mock_hydra_init,
|
204
214
|
mock_hydra_compose,
|
205
215
|
mock_zotero_class,
|
206
216
|
mock_get_item_collections,
|
207
217
|
):
|
208
|
-
"""
|
209
|
-
Test that when non-research items (e.g. attachments, notes) are returned,
|
210
|
-
they are still included since filtering is disabled.
|
211
|
-
"""
|
218
|
+
"""Testing filtering when no paper matching"""
|
212
219
|
mock_hydra_compose.return_value = dummy_cfg
|
213
220
|
mock_hydra_init.return_value.__enter__.return_value = None
|
214
221
|
|
@@ -222,6 +229,8 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
222
229
|
"date": "2021",
|
223
230
|
"url": "http://example.com",
|
224
231
|
"itemType": "attachment",
|
232
|
+
"contentType": "application/pdf", # orphaned
|
233
|
+
"filename": "paper1.pdf",
|
225
234
|
}
|
226
235
|
},
|
227
236
|
{
|
@@ -242,20 +251,25 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
242
251
|
"paper2": ["/Test Collection"],
|
243
252
|
}
|
244
253
|
|
245
|
-
|
254
|
+
mock_batch_download.return_value = {
|
255
|
+
"paper1": ("/tmp/fake_path.pdf", "paper1.pdf", "paper1")
|
256
|
+
}
|
257
|
+
|
246
258
|
tool_input = {
|
247
259
|
"query": "test",
|
248
|
-
"only_articles":
|
249
|
-
"tool_call_id":
|
260
|
+
"only_articles": False,
|
261
|
+
"tool_call_id": "test_id_4",
|
250
262
|
"limit": 2,
|
251
263
|
}
|
252
|
-
|
264
|
+
|
253
265
|
result = zotero_read.run(tool_input)
|
254
|
-
|
255
|
-
|
266
|
+
filtered_papers = result.update["article_data"]
|
267
|
+
|
256
268
|
self.assertIn("paper1", filtered_papers)
|
257
269
|
self.assertIn("paper2", filtered_papers)
|
258
|
-
self.assertEqual(
|
270
|
+
self.assertEqual(filtered_papers["paper1"]["filename"], "paper1.pdf")
|
271
|
+
self.assertEqual(filtered_papers["paper1"]["pdf_url"], "/tmp/fake_path.pdf")
|
272
|
+
self.assertEqual(filtered_papers["paper1"]["source"], "zotero")
|
259
273
|
|
260
274
|
@patch(
|
261
275
|
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
@@ -349,7 +363,7 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
349
363
|
result = zotero_read.run(tool_input)
|
350
364
|
|
351
365
|
update = result.update
|
352
|
-
filtered_papers = update["
|
366
|
+
filtered_papers = update["article_data"]
|
353
367
|
self.assertIn("paper_valid", filtered_papers)
|
354
368
|
self.assertEqual(len(filtered_papers), 1)
|
355
369
|
|
@@ -408,26 +422,390 @@ class TestZoteroSearchTool(unittest.TestCase):
|
|
408
422
|
mock_zotero_class,
|
409
423
|
mock_get_item_collections,
|
410
424
|
):
|
411
|
-
"""
|
412
|
-
Test that if an item has a 'data' field that is not a dict, it is skipped.
|
413
|
-
"""
|
425
|
+
"""Test for no dict"""
|
414
426
|
mock_hydra_compose.return_value = dummy_cfg
|
415
427
|
mock_hydra_init.return_value.__enter__.return_value = None
|
416
428
|
|
417
429
|
fake_zot = MagicMock()
|
418
|
-
#
|
419
|
-
fake_items = [
|
430
|
+
# Make the item itself non-dict (not just `data`)
|
431
|
+
fake_items = ["this is not a dict"]
|
420
432
|
fake_zot.items.return_value = fake_items
|
421
433
|
mock_zotero_class.return_value = fake_zot
|
422
434
|
mock_get_item_collections.return_value = {}
|
423
435
|
|
424
|
-
tool_call_id = "test_id_8"
|
425
436
|
tool_input = {
|
426
437
|
"query": "dummy",
|
427
438
|
"only_articles": True,
|
428
|
-
"tool_call_id":
|
439
|
+
"tool_call_id": "test_id_8",
|
429
440
|
"limit": 2,
|
430
441
|
}
|
442
|
+
|
431
443
|
with self.assertRaises(RuntimeError) as context:
|
432
444
|
zotero_read.run(tool_input)
|
433
445
|
self.assertIn("No matching papers returned from Zotero", str(context.exception))
|
446
|
+
|
447
|
+
@patch(
|
448
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
449
|
+
)
|
450
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
451
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
452
|
+
@patch(
|
453
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
454
|
+
)
|
455
|
+
@patch(
|
456
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.requests.Session.get"
|
457
|
+
)
|
458
|
+
def test_pdf_attachment_success(
|
459
|
+
self,
|
460
|
+
mock_session_get,
|
461
|
+
mock_hydra_init,
|
462
|
+
mock_hydra_compose,
|
463
|
+
mock_zotero_class,
|
464
|
+
mock_get_item_collections,
|
465
|
+
):
|
466
|
+
"""Test for pdf attachment success"""
|
467
|
+
mock_hydra_compose.return_value = dummy_cfg
|
468
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
469
|
+
|
470
|
+
fake_zot = MagicMock()
|
471
|
+
fake_zot.items.return_value = [
|
472
|
+
{
|
473
|
+
"data": {
|
474
|
+
"key": "paper1",
|
475
|
+
"title": "Paper 1",
|
476
|
+
"abstractNote": "Abstract 1",
|
477
|
+
"date": "2021",
|
478
|
+
"url": "http://example.com",
|
479
|
+
"itemType": "journalArticle",
|
480
|
+
"creators": [
|
481
|
+
{
|
482
|
+
"firstName": "John",
|
483
|
+
"lastName": "Doe",
|
484
|
+
"creatorType": "author",
|
485
|
+
}
|
486
|
+
],
|
487
|
+
}
|
488
|
+
}
|
489
|
+
]
|
490
|
+
|
491
|
+
fake_pdf_child = {
|
492
|
+
"data": {
|
493
|
+
"key": "attachment1",
|
494
|
+
"filename": "file1.pdf",
|
495
|
+
"contentType": "application/pdf",
|
496
|
+
}
|
497
|
+
}
|
498
|
+
fake_zot.children.return_value = [fake_pdf_child]
|
499
|
+
mock_zotero_class.return_value = fake_zot
|
500
|
+
mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}
|
501
|
+
|
502
|
+
# Mock successful PDF download via session
|
503
|
+
mock_response = MagicMock()
|
504
|
+
mock_response.status_code = 200
|
505
|
+
mock_response.iter_content = lambda chunk_size: [b"fake pdf content"]
|
506
|
+
mock_response.headers = {
|
507
|
+
"Content-Disposition": 'attachment; filename="file1.pdf"'
|
508
|
+
}
|
509
|
+
mock_response.raise_for_status = lambda: None
|
510
|
+
mock_session_get.return_value = mock_response
|
511
|
+
|
512
|
+
tool_input = {
|
513
|
+
"query": "pdf test",
|
514
|
+
"only_articles": True,
|
515
|
+
"tool_call_id": "test_pdf_success",
|
516
|
+
"limit": 1,
|
517
|
+
}
|
518
|
+
|
519
|
+
result = zotero_read.run(tool_input)
|
520
|
+
paper = result.update["article_data"]["paper1"]
|
521
|
+
|
522
|
+
self.assertIn("pdf_url", paper)
|
523
|
+
self.assertTrue(paper["pdf_url"].endswith(".pdf"))
|
524
|
+
self.assertEqual(paper["filename"], "file1.pdf")
|
525
|
+
self.assertEqual(paper["attachment_key"], "attachment1")
|
526
|
+
|
527
|
+
@patch(
|
528
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
529
|
+
)
|
530
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
531
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
532
|
+
@patch(
|
533
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
534
|
+
)
|
535
|
+
def test_pdf_attachment_children_exception(
|
536
|
+
self,
|
537
|
+
mock_hydra_init,
|
538
|
+
mock_hydra_compose,
|
539
|
+
mock_zotero_class,
|
540
|
+
mock_get_item_collections,
|
541
|
+
):
|
542
|
+
"""Test that when children() raises an exception, PDF info is not added."""
|
543
|
+
mock_hydra_compose.return_value = dummy_cfg
|
544
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
545
|
+
|
546
|
+
fake_zot = MagicMock()
|
547
|
+
fake_items = [
|
548
|
+
{
|
549
|
+
"data": {
|
550
|
+
"key": "paper1",
|
551
|
+
"title": "Paper 1",
|
552
|
+
"abstractNote": "Abstract 1",
|
553
|
+
"date": "2021",
|
554
|
+
"url": "http://example.com",
|
555
|
+
"itemType": "journalArticle",
|
556
|
+
"creators": [
|
557
|
+
{
|
558
|
+
"firstName": "John",
|
559
|
+
"lastName": "Doe",
|
560
|
+
"creatorType": "author",
|
561
|
+
}
|
562
|
+
],
|
563
|
+
}
|
564
|
+
},
|
565
|
+
]
|
566
|
+
fake_zot.items.return_value = fake_items
|
567
|
+
|
568
|
+
# Simulate children() raising an exception
|
569
|
+
fake_zot.children.side_effect = Exception("Child fetch error")
|
570
|
+
mock_zotero_class.return_value = fake_zot
|
571
|
+
mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}
|
572
|
+
|
573
|
+
tool_call_id = "test_pdf_children_exception"
|
574
|
+
tool_input = {
|
575
|
+
"query": "pdf test exception",
|
576
|
+
"only_articles": True,
|
577
|
+
"tool_call_id": tool_call_id,
|
578
|
+
"limit": 1,
|
579
|
+
}
|
580
|
+
result = zotero_read.run(tool_input)
|
581
|
+
filtered_papers = result.update["article_data"]
|
582
|
+
|
583
|
+
# Ensure no PDF-related keys are added
|
584
|
+
self.assertIn("paper1", filtered_papers)
|
585
|
+
self.assertNotIn("pdf_url", filtered_papers["paper1"])
|
586
|
+
self.assertNotIn("filename", filtered_papers["paper1"])
|
587
|
+
self.assertNotIn("attachment_key", filtered_papers["paper1"])
|
588
|
+
|
589
|
+
@patch(
|
590
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
591
|
+
)
|
592
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
593
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
594
|
+
@patch(
|
595
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
596
|
+
)
|
597
|
+
def test_pdf_attachment_missing_key(
|
598
|
+
self,
|
599
|
+
mock_hydra_init,
|
600
|
+
mock_hydra_compose,
|
601
|
+
mock_zotero_class,
|
602
|
+
mock_get_item_collections,
|
603
|
+
):
|
604
|
+
"""Test for pdf attachment missing"""
|
605
|
+
mock_hydra_compose.return_value = dummy_cfg
|
606
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
607
|
+
|
608
|
+
fake_zot = MagicMock()
|
609
|
+
fake_zot.items.return_value = [
|
610
|
+
{
|
611
|
+
"data": {
|
612
|
+
"key": "paper1",
|
613
|
+
"title": "Paper 1",
|
614
|
+
"abstractNote": "Abstract 1",
|
615
|
+
"date": "2021",
|
616
|
+
"url": "http://example.com",
|
617
|
+
"itemType": "journalArticle",
|
618
|
+
"creators": [
|
619
|
+
{
|
620
|
+
"firstName": "Alice",
|
621
|
+
"lastName": "Smith",
|
622
|
+
"creatorType": "author",
|
623
|
+
}
|
624
|
+
],
|
625
|
+
}
|
626
|
+
},
|
627
|
+
]
|
628
|
+
|
629
|
+
fake_pdf_child = {
|
630
|
+
"data": {
|
631
|
+
"filename": "no_key.pdf",
|
632
|
+
"contentType": "application/pdf",
|
633
|
+
}
|
634
|
+
}
|
635
|
+
fake_zot.children.return_value = [fake_pdf_child]
|
636
|
+
mock_zotero_class.return_value = fake_zot
|
637
|
+
mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}
|
638
|
+
|
639
|
+
tool_input = {
|
640
|
+
"query": "missing key test",
|
641
|
+
"only_articles": True,
|
642
|
+
"tool_call_id": "test_pdf_missing_key",
|
643
|
+
"limit": 1,
|
644
|
+
}
|
645
|
+
|
646
|
+
result = zotero_read.run(tool_input)
|
647
|
+
paper = result.update["article_data"]["paper1"]
|
648
|
+
|
649
|
+
self.assertNotIn("pdf_url", paper)
|
650
|
+
self.assertNotIn("attachment_key", paper)
|
651
|
+
|
652
|
+
@patch(
|
653
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
654
|
+
)
|
655
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
656
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
657
|
+
@patch(
|
658
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
659
|
+
)
|
660
|
+
def test_pdf_attachment_outer_exception(
|
661
|
+
self,
|
662
|
+
mock_hydra_init,
|
663
|
+
mock_hydra_compose,
|
664
|
+
mock_zotero_class,
|
665
|
+
mock_get_item_collections,
|
666
|
+
):
|
667
|
+
"""Test that if children() returns a non-iterable (causing an exception),
|
668
|
+
PDF info is not added."""
|
669
|
+
mock_hydra_compose.return_value = dummy_cfg
|
670
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
671
|
+
|
672
|
+
fake_zot = MagicMock()
|
673
|
+
fake_items = [
|
674
|
+
{
|
675
|
+
"data": {
|
676
|
+
"key": "paper1",
|
677
|
+
"title": "Paper 1",
|
678
|
+
"abstractNote": "Abstract 1",
|
679
|
+
"date": "2021",
|
680
|
+
"url": "http://example.com",
|
681
|
+
"itemType": "journalArticle",
|
682
|
+
"creators": [
|
683
|
+
{
|
684
|
+
"firstName": "Bob",
|
685
|
+
"lastName": "Jones",
|
686
|
+
"creatorType": "author",
|
687
|
+
}
|
688
|
+
],
|
689
|
+
}
|
690
|
+
},
|
691
|
+
]
|
692
|
+
fake_zot.items.return_value = fake_items
|
693
|
+
|
694
|
+
# Simulate children() returning None to trigger an exception in list comprehension.
|
695
|
+
fake_zot.children.return_value = None
|
696
|
+
|
697
|
+
mock_zotero_class.return_value = fake_zot
|
698
|
+
mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}
|
699
|
+
|
700
|
+
tool_call_id = "test_pdf_outer_exception"
|
701
|
+
tool_input = {
|
702
|
+
"query": "outer exception test",
|
703
|
+
"only_articles": True,
|
704
|
+
"tool_call_id": tool_call_id,
|
705
|
+
"limit": 1,
|
706
|
+
}
|
707
|
+
result = zotero_read.run(tool_input)
|
708
|
+
filtered_papers = result.update["article_data"]
|
709
|
+
|
710
|
+
# Ensure no PDF-related keys are added if an exception occurs
|
711
|
+
self.assertIn("paper1", filtered_papers)
|
712
|
+
self.assertNotIn("pdf_url", filtered_papers["paper1"])
|
713
|
+
self.assertNotIn("filename", filtered_papers["paper1"])
|
714
|
+
self.assertNotIn("attachment_key", filtered_papers["paper1"])
|
715
|
+
|
716
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.requests.get")
|
717
|
+
@patch(
|
718
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
719
|
+
)
|
720
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
721
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
722
|
+
@patch(
|
723
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
724
|
+
)
|
725
|
+
def test_download_zotero_pdf_exception(
|
726
|
+
self,
|
727
|
+
mock_hydra_init,
|
728
|
+
mock_hydra_compose,
|
729
|
+
mock_zotero_class,
|
730
|
+
mock_get_item_collections,
|
731
|
+
mock_requests_get,
|
732
|
+
):
|
733
|
+
"""Test that _download_zotero_pdf returns None and logs error on request exception."""
|
734
|
+
# Setup mocks for config and Zotero client
|
735
|
+
mock_hydra_compose.return_value = dummy_cfg
|
736
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
737
|
+
mock_zotero_class.return_value = MagicMock()
|
738
|
+
mock_get_item_collections.return_value = {}
|
739
|
+
|
740
|
+
# Simulate a request exception during PDF download
|
741
|
+
mock_requests_get.side_effect = Exception("Simulated download failure")
|
742
|
+
|
743
|
+
zotero_search = ZoteroSearchData(
|
744
|
+
query="test", only_articles=False, limit=1, tool_call_id="test123"
|
745
|
+
)
|
746
|
+
|
747
|
+
result = zotero_search._download_zotero_pdf("FAKE_ATTACHMENT_KEY")
|
748
|
+
|
749
|
+
self.assertIsNone(result)
|
750
|
+
|
751
|
+
@patch(
|
752
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
|
753
|
+
)
|
754
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
|
755
|
+
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
|
756
|
+
@patch(
|
757
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
|
758
|
+
)
|
759
|
+
def test_download_pdf_exception_logging(
|
760
|
+
self,
|
761
|
+
mock_hydra_init,
|
762
|
+
mock_hydra_compose,
|
763
|
+
mock_zotero_class,
|
764
|
+
mock_get_item_collections,
|
765
|
+
):
|
766
|
+
"""Test that a failed download logs the error and does not break the pipeline."""
|
767
|
+
mock_hydra_compose.return_value = dummy_cfg
|
768
|
+
mock_hydra_init.return_value.__enter__.return_value = None
|
769
|
+
|
770
|
+
fake_zot = MagicMock()
|
771
|
+
fake_zot.items.return_value = [
|
772
|
+
{
|
773
|
+
"data": {
|
774
|
+
"key": "paper1",
|
775
|
+
"title": "Fake Title",
|
776
|
+
"itemType": "journalArticle",
|
777
|
+
}
|
778
|
+
}
|
779
|
+
]
|
780
|
+
# Simulate an attachment
|
781
|
+
fake_zot.children.return_value = [
|
782
|
+
{
|
783
|
+
"data": {
|
784
|
+
"key": "attachment1",
|
785
|
+
"filename": "file1.pdf",
|
786
|
+
"contentType": "application/pdf",
|
787
|
+
}
|
788
|
+
}
|
789
|
+
]
|
790
|
+
|
791
|
+
mock_zotero_class.return_value = fake_zot
|
792
|
+
mock_get_item_collections.return_value = {"paper1": ["/Fake Collection"]}
|
793
|
+
|
794
|
+
# Patch just the internal _download_zotero_pdf to raise an exception
|
795
|
+
with patch(
|
796
|
+
"aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper."
|
797
|
+
"ZoteroSearchData._download_zotero_pdf"
|
798
|
+
) as mock_download_pdf:
|
799
|
+
mock_download_pdf.side_effect = Exception("Simulated download error")
|
800
|
+
|
801
|
+
search = ZoteroSearchData(
|
802
|
+
query="failure test",
|
803
|
+
only_articles=True,
|
804
|
+
limit=1,
|
805
|
+
tool_call_id="fail_test",
|
806
|
+
)
|
807
|
+
search.process_search()
|
808
|
+
|
809
|
+
article_data = search.get_search_results()["article_data"]
|
810
|
+
assert "paper1" in article_data
|
811
|
+
assert "pdf_url" not in article_data["paper1"] # download failed, no URL
|
@@ -4,14 +4,8 @@ This package provides modules for fetching and downloading academic papers from
|
|
4
4
|
"""
|
5
5
|
|
6
6
|
# Import modules
|
7
|
-
from . import abstract_downloader
|
8
|
-
from . import arxiv_downloader
|
9
7
|
from . import download_arxiv_input
|
10
|
-
from .download_arxiv_input import download_arxiv_paper
|
11
8
|
|
12
9
|
__all__ = [
|
13
|
-
"abstract_downloader",
|
14
|
-
"arxiv_downloader",
|
15
10
|
"download_arxiv_input",
|
16
|
-
"download_arxiv_paper",
|
17
11
|
]
|
@@ -1,64 +1,122 @@
|
|
1
|
-
|
1
|
+
#!/usr/bin/env python3
|
2
2
|
"""
|
3
|
-
|
4
|
-
`ArxivPaperDownloader` class to fetch and download academic papers from arXiv
|
5
|
-
based on their unique arXiv ID.
|
3
|
+
Tool for downloading arXiv paper metadata and retrieving the PDF URL.
|
6
4
|
"""
|
5
|
+
|
6
|
+
import logging
|
7
|
+
import xml.etree.ElementTree as ET
|
7
8
|
from typing import Annotated, Any
|
8
|
-
|
9
|
-
|
9
|
+
|
10
|
+
import hydra
|
11
|
+
import requests
|
10
12
|
from langchain_core.messages import ToolMessage
|
13
|
+
from langchain_core.tools import tool
|
11
14
|
from langchain_core.tools.base import InjectedToolCallId
|
12
15
|
from langgraph.types import Command
|
16
|
+
from pydantic import BaseModel, Field
|
13
17
|
|
14
|
-
#
|
15
|
-
|
18
|
+
# Configure logging
|
19
|
+
logging.basicConfig(level=logging.INFO)
|
20
|
+
logger = logging.getLogger(__name__)
|
16
21
|
|
17
22
|
|
18
23
|
class DownloadArxivPaperInput(BaseModel):
|
19
|
-
"""
|
20
|
-
Input schema for the arXiv paper download tool.
|
21
|
-
(Optional: if you decide to keep Pydantic validation in the future)
|
22
|
-
"""
|
24
|
+
"""Input schema for the arXiv paper download tool."""
|
23
25
|
|
24
26
|
arxiv_id: str = Field(
|
25
|
-
description="The arXiv paper ID used to retrieve the paper details and PDF."
|
27
|
+
description="The arXiv paper ID used to retrieve the paper details and PDF URL."
|
26
28
|
)
|
27
29
|
tool_call_id: Annotated[str, InjectedToolCallId]
|
28
30
|
|
29
31
|
|
32
|
+
def fetch_arxiv_metadata(
|
33
|
+
api_url: str, arxiv_id: str, request_timeout: int
|
34
|
+
) -> ET.Element:
|
35
|
+
"""Fetch and parse metadata from the arXiv API."""
|
36
|
+
query_url = f"{api_url}?search_query=id:{arxiv_id}&start=0&max_results=1"
|
37
|
+
response = requests.get(query_url, timeout=request_timeout)
|
38
|
+
response.raise_for_status()
|
39
|
+
return ET.fromstring(response.text)
|
40
|
+
|
41
|
+
|
42
|
+
def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
|
43
|
+
"""Extract metadata from the XML entry."""
|
44
|
+
title_elem = entry.find("atom:title", ns)
|
45
|
+
title = title_elem.text.strip() if title_elem is not None else "N/A"
|
46
|
+
|
47
|
+
authors = [
|
48
|
+
author_elem.find("atom:name", ns).text.strip()
|
49
|
+
for author_elem in entry.findall("atom:author", ns)
|
50
|
+
if author_elem.find("atom:name", ns) is not None
|
51
|
+
]
|
52
|
+
|
53
|
+
summary_elem = entry.find("atom:summary", ns)
|
54
|
+
abstract = summary_elem.text.strip() if summary_elem is not None else "N/A"
|
55
|
+
|
56
|
+
published_elem = entry.find("atom:published", ns)
|
57
|
+
pub_date = published_elem.text.strip() if published_elem is not None else "N/A"
|
58
|
+
|
59
|
+
pdf_url = next(
|
60
|
+
(
|
61
|
+
link.attrib.get("href")
|
62
|
+
for link in entry.findall("atom:link", ns)
|
63
|
+
if link.attrib.get("title") == "pdf"
|
64
|
+
),
|
65
|
+
None,
|
66
|
+
)
|
67
|
+
if not pdf_url:
|
68
|
+
raise RuntimeError(f"Could not find PDF URL for arXiv ID {arxiv_id}")
|
69
|
+
|
70
|
+
return {
|
71
|
+
"Title": title,
|
72
|
+
"Authors": authors,
|
73
|
+
"Abstract": abstract,
|
74
|
+
"Publication Date": pub_date,
|
75
|
+
"URL": pdf_url,
|
76
|
+
"pdf_url": pdf_url,
|
77
|
+
"filename": f"{arxiv_id}.pdf",
|
78
|
+
"source": "arxiv",
|
79
|
+
"arxiv_id": arxiv_id,
|
80
|
+
}
|
81
|
+
|
82
|
+
|
30
83
|
@tool(args_schema=DownloadArxivPaperInput, parse_docstring=True)
|
31
84
|
def download_arxiv_paper(
|
32
85
|
arxiv_id: str,
|
33
86
|
tool_call_id: Annotated[str, InjectedToolCallId],
|
34
87
|
) -> Command[Any]:
|
35
88
|
"""
|
36
|
-
|
89
|
+
Get metadata and PDF URL for an arXiv paper using its unique arXiv ID.
|
90
|
+
"""
|
91
|
+
logger.info("Fetching metadata from arXiv for paper ID: %s", arxiv_id)
|
37
92
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
93
|
+
# Load configuration
|
94
|
+
with hydra.initialize(version_base=None, config_path="../../configs"):
|
95
|
+
cfg = hydra.compose(
|
96
|
+
config_name="config", overrides=["tools/download_arxiv_paper=default"]
|
97
|
+
)
|
98
|
+
api_url = cfg.tools.download_arxiv_paper.api_url
|
99
|
+
request_timeout = cfg.tools.download_arxiv_paper.request_timeout
|
43
100
|
|
44
|
-
|
45
|
-
|
46
|
-
|
101
|
+
# Fetch and parse metadata
|
102
|
+
root = fetch_arxiv_metadata(api_url, arxiv_id, request_timeout)
|
103
|
+
ns = {"atom": "http://www.w3.org/2005/Atom"}
|
47
104
|
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
105
|
+
entry = root.find("atom:entry", ns)
|
106
|
+
if entry is None:
|
107
|
+
raise ValueError(f"No entry found for arXiv ID {arxiv_id}")
|
108
|
+
|
109
|
+
# Extract metadata
|
110
|
+
metadata = extract_metadata(entry, ns, arxiv_id)
|
52
111
|
|
53
|
-
#
|
54
|
-
|
55
|
-
# print (pdf_data)
|
112
|
+
# Create article_data entry with the paper ID as the key
|
113
|
+
article_data = {arxiv_id: metadata}
|
56
114
|
|
57
|
-
content = f"Successfully
|
115
|
+
content = f"Successfully retrieved metadata and PDF URL for arXiv ID {arxiv_id}"
|
58
116
|
|
59
117
|
return Command(
|
60
118
|
update={
|
61
|
-
"
|
119
|
+
"article_data": article_data,
|
62
120
|
"messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
|
63
121
|
}
|
64
122
|
)
|