aiagents4pharma 1.31.0__py3-none-any.whl → 1.33.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +1 -0
  2. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +44 -0
  3. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +1 -0
  4. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +90 -0
  5. aiagents4pharma/talk2scholars/agents/main_agent.py +4 -3
  6. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +3 -4
  7. aiagents4pharma/talk2scholars/agents/pdf_agent.py +6 -7
  8. aiagents4pharma/talk2scholars/agents/s2_agent.py +23 -20
  9. aiagents4pharma/talk2scholars/agents/zotero_agent.py +11 -11
  10. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +19 -19
  11. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +20 -15
  12. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +27 -6
  13. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +7 -7
  14. aiagents4pharma/talk2scholars/tests/test_main_agent.py +16 -16
  15. aiagents4pharma/talk2scholars/tests/test_paper_download_agent.py +17 -24
  16. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +152 -135
  17. aiagents4pharma/talk2scholars/tests/test_pdf_agent.py +9 -16
  18. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +790 -218
  19. aiagents4pharma/talk2scholars/tests/test_s2_agent.py +9 -9
  20. aiagents4pharma/talk2scholars/tests/test_s2_display.py +8 -8
  21. aiagents4pharma/talk2scholars/tests/test_s2_query.py +8 -8
  22. aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +12 -12
  23. aiagents4pharma/talk2scholars/tests/test_zotero_path.py +11 -12
  24. aiagents4pharma/talk2scholars/tests/test_zotero_read.py +400 -22
  25. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +0 -6
  26. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +89 -31
  27. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +540 -156
  28. aiagents4pharma/talk2scholars/tools/s2/__init__.py +4 -4
  29. aiagents4pharma/talk2scholars/tools/s2/{display_results.py → display_dataframe.py} +19 -21
  30. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +71 -0
  31. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +213 -35
  32. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +3 -3
  33. {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/METADATA +3 -1
  34. {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/RECORD +37 -37
  35. {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/WHEEL +1 -1
  36. aiagents4pharma/talk2scholars/tools/paper_download/abstract_downloader.py +0 -45
  37. aiagents4pharma/talk2scholars/tools/paper_download/arxiv_downloader.py +0 -115
  38. aiagents4pharma/talk2scholars/tools/s2/query_results.py +0 -61
  39. {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/licenses/LICENSE +0 -0
  40. {aiagents4pharma-1.31.0.dist-info → aiagents4pharma-1.33.0.dist-info}/top_level.txt +0 -0
@@ -7,7 +7,12 @@ import unittest
7
7
  from unittest.mock import patch, MagicMock
8
8
  from langgraph.types import Command
9
9
  from aiagents4pharma.talk2scholars.tools.zotero.zotero_read import zotero_read
10
+ from aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper import (
11
+ ZoteroSearchData,
12
+ )
10
13
 
14
+ # pylint: disable=protected-access
15
+ # pylint: disable=protected-access, too-many-arguments, too-many-positional-arguments
11
16
 
12
17
  # Dummy Hydra configuration to be used in tests
13
18
  dummy_zotero_read_config = SimpleNamespace(
@@ -92,11 +97,11 @@ class TestZoteroSearchTool(unittest.TestCase):
92
97
  # Verify the Command update structure and contents
93
98
  self.assertIsInstance(result, Command)
94
99
  update = result.update
95
- self.assertIn("zotero_read", update)
100
+ self.assertIn("article_data", update)
96
101
  self.assertIn("last_displayed_papers", update)
97
102
  self.assertIn("messages", update)
98
103
 
99
- filtered_papers = update["zotero_read"]
104
+ filtered_papers = update["article_data"]
100
105
  self.assertIn("paper1", filtered_papers)
101
106
  self.assertIn("paper2", filtered_papers)
102
107
  message_content = update["messages"][0].content
@@ -149,7 +154,7 @@ class TestZoteroSearchTool(unittest.TestCase):
149
154
  result = zotero_read.run(tool_input)
150
155
 
151
156
  update = result.update
152
- filtered_papers = update["zotero_read"]
157
+ filtered_papers = update["article_data"]
153
158
  self.assertIn("paper1", filtered_papers)
154
159
  fake_zot.items.assert_called_with(
155
160
  limit=dummy_cfg.tools.zotero_read.zotero.max_limit
@@ -198,17 +203,19 @@ class TestZoteroSearchTool(unittest.TestCase):
198
203
  @patch(
199
204
  "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
200
205
  )
206
+ @patch(
207
+ "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper."
208
+ "ZoteroSearchData._download_pdfs_in_parallel"
209
+ )
201
210
  def test_filtering_no_matching_papers(
202
211
  self,
212
+ mock_batch_download,
203
213
  mock_hydra_init,
204
214
  mock_hydra_compose,
205
215
  mock_zotero_class,
206
216
  mock_get_item_collections,
207
217
  ):
208
- """
209
- Test that when non-research items (e.g. attachments, notes) are returned,
210
- they are still included since filtering is disabled.
211
- """
218
+ """Testing filtering when no paper matching"""
212
219
  mock_hydra_compose.return_value = dummy_cfg
213
220
  mock_hydra_init.return_value.__enter__.return_value = None
214
221
 
@@ -222,6 +229,8 @@ class TestZoteroSearchTool(unittest.TestCase):
222
229
  "date": "2021",
223
230
  "url": "http://example.com",
224
231
  "itemType": "attachment",
232
+ "contentType": "application/pdf", # orphaned
233
+ "filename": "paper1.pdf",
225
234
  }
226
235
  },
227
236
  {
@@ -242,20 +251,25 @@ class TestZoteroSearchTool(unittest.TestCase):
242
251
  "paper2": ["/Test Collection"],
243
252
  }
244
253
 
245
- tool_call_id = "test_id_4"
254
+ mock_batch_download.return_value = {
255
+ "paper1": ("/tmp/fake_path.pdf", "paper1.pdf", "paper1")
256
+ }
257
+
246
258
  tool_input = {
247
259
  "query": "test",
248
- "only_articles": True,
249
- "tool_call_id": tool_call_id,
260
+ "only_articles": False,
261
+ "tool_call_id": "test_id_4",
250
262
  "limit": 2,
251
263
  }
252
- # Instead of expecting a RuntimeError, we now expect both items to be returned.
264
+
253
265
  result = zotero_read.run(tool_input)
254
- update = result.update
255
- filtered_papers = update["zotero_read"]
266
+ filtered_papers = result.update["article_data"]
267
+
256
268
  self.assertIn("paper1", filtered_papers)
257
269
  self.assertIn("paper2", filtered_papers)
258
- self.assertEqual(len(filtered_papers), 2)
270
+ self.assertEqual(filtered_papers["paper1"]["filename"], "paper1.pdf")
271
+ self.assertEqual(filtered_papers["paper1"]["pdf_url"], "/tmp/fake_path.pdf")
272
+ self.assertEqual(filtered_papers["paper1"]["source"], "zotero")
259
273
 
260
274
  @patch(
261
275
  "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
@@ -349,7 +363,7 @@ class TestZoteroSearchTool(unittest.TestCase):
349
363
  result = zotero_read.run(tool_input)
350
364
 
351
365
  update = result.update
352
- filtered_papers = update["zotero_read"]
366
+ filtered_papers = update["article_data"]
353
367
  self.assertIn("paper_valid", filtered_papers)
354
368
  self.assertEqual(len(filtered_papers), 1)
355
369
 
@@ -408,26 +422,390 @@ class TestZoteroSearchTool(unittest.TestCase):
408
422
  mock_zotero_class,
409
423
  mock_get_item_collections,
410
424
  ):
411
- """
412
- Test that if an item has a 'data' field that is not a dict, it is skipped.
413
- """
425
+ """Test for no dict"""
414
426
  mock_hydra_compose.return_value = dummy_cfg
415
427
  mock_hydra_init.return_value.__enter__.return_value = None
416
428
 
417
429
  fake_zot = MagicMock()
418
- # Supply one item whose "data" field is not a dict.
419
- fake_items = [{"data": "this is not a dict"}]
430
+ # Make the item itself non-dict (not just `data`)
431
+ fake_items = ["this is not a dict"]
420
432
  fake_zot.items.return_value = fake_items
421
433
  mock_zotero_class.return_value = fake_zot
422
434
  mock_get_item_collections.return_value = {}
423
435
 
424
- tool_call_id = "test_id_8"
425
436
  tool_input = {
426
437
  "query": "dummy",
427
438
  "only_articles": True,
428
- "tool_call_id": tool_call_id,
439
+ "tool_call_id": "test_id_8",
429
440
  "limit": 2,
430
441
  }
442
+
431
443
  with self.assertRaises(RuntimeError) as context:
432
444
  zotero_read.run(tool_input)
433
445
  self.assertIn("No matching papers returned from Zotero", str(context.exception))
446
+
447
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.requests.Session.get"
)
def test_pdf_attachment_success(
    self,
    mock_session_get,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
):
    """A paper with a PDF child attachment gets PDF details in ``article_data``.

    One journal article is returned by the fake Zotero client, its single
    child is a PDF attachment, and the patched session ``get`` hands back a
    successful response. The resulting entry must carry a ``pdf_url`` ending
    in ``.pdf``, the attachment's ``filename`` and its ``attachment_key``.
    """
    mock_hydra_compose.return_value = dummy_cfg
    mock_hydra_init.return_value.__enter__.return_value = None

    # Stand-in for the HTTP response of a successful PDF download.
    pdf_response = MagicMock()
    pdf_response.status_code = 200
    pdf_response.iter_content = lambda chunk_size: [b"fake pdf content"]
    pdf_response.headers = {
        "Content-Disposition": 'attachment; filename="file1.pdf"'
    }
    pdf_response.raise_for_status = lambda: None
    mock_session_get.return_value = pdf_response

    parent_item = {
        "data": {
            "key": "paper1",
            "title": "Paper 1",
            "abstractNote": "Abstract 1",
            "date": "2021",
            "url": "http://example.com",
            "itemType": "journalArticle",
            "creators": [
                {
                    "firstName": "John",
                    "lastName": "Doe",
                    "creatorType": "author",
                }
            ],
        }
    }
    pdf_child = {
        "data": {
            "key": "attachment1",
            "filename": "file1.pdf",
            "contentType": "application/pdf",
        }
    }

    zot_client = MagicMock()
    zot_client.items.return_value = [parent_item]
    zot_client.children.return_value = [pdf_child]
    mock_zotero_class.return_value = zot_client
    mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}

    result = zotero_read.run(
        {
            "query": "pdf test",
            "only_articles": True,
            "tool_call_id": "test_pdf_success",
            "limit": 1,
        }
    )
    paper = result.update["article_data"]["paper1"]

    self.assertIn("pdf_url", paper)
    self.assertTrue(paper["pdf_url"].endswith(".pdf"))
    self.assertEqual(paper["filename"], "file1.pdf")
    self.assertEqual(paper["attachment_key"], "attachment1")
526
+
527
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
def test_pdf_attachment_children_exception(
    self,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
):
    """If ``children()`` raises, the paper is still returned but without PDF fields."""
    mock_hydra_compose.return_value = dummy_cfg
    mock_hydra_init.return_value.__enter__.return_value = None

    article = {
        "data": {
            "key": "paper1",
            "title": "Paper 1",
            "abstractNote": "Abstract 1",
            "date": "2021",
            "url": "http://example.com",
            "itemType": "journalArticle",
            "creators": [
                {
                    "firstName": "John",
                    "lastName": "Doe",
                    "creatorType": "author",
                }
            ],
        }
    }

    zot_client = MagicMock()
    zot_client.items.return_value = [article]
    # Looking up the attachments of the paper blows up.
    zot_client.children.side_effect = Exception("Child fetch error")
    mock_zotero_class.return_value = zot_client
    mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}

    result = zotero_read.run(
        {
            "query": "pdf test exception",
            "only_articles": True,
            "tool_call_id": "test_pdf_children_exception",
            "limit": 1,
        }
    )
    papers = result.update["article_data"]

    # The paper survives; none of the PDF-related keys were attached.
    self.assertIn("paper1", papers)
    for pdf_key in ("pdf_url", "filename", "attachment_key"):
        self.assertNotIn(pdf_key, papers["paper1"])
588
+
589
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
def test_pdf_attachment_missing_key(
    self,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
):
    """A PDF child whose data has no ``key`` yields no ``pdf_url``/``attachment_key``."""
    mock_hydra_compose.return_value = dummy_cfg
    mock_hydra_init.return_value.__enter__.return_value = None

    article = {
        "data": {
            "key": "paper1",
            "title": "Paper 1",
            "abstractNote": "Abstract 1",
            "date": "2021",
            "url": "http://example.com",
            "itemType": "journalArticle",
            "creators": [
                {
                    "firstName": "Alice",
                    "lastName": "Smith",
                    "creatorType": "author",
                }
            ],
        }
    }
    # The attachment deliberately lacks the "key" entry.
    keyless_pdf_child = {
        "data": {
            "filename": "no_key.pdf",
            "contentType": "application/pdf",
        }
    }

    zot_client = MagicMock()
    zot_client.items.return_value = [article]
    zot_client.children.return_value = [keyless_pdf_child]
    mock_zotero_class.return_value = zot_client
    mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}

    result = zotero_read.run(
        {
            "query": "missing key test",
            "only_articles": True,
            "tool_call_id": "test_pdf_missing_key",
            "limit": 1,
        }
    )
    paper = result.update["article_data"]["paper1"]

    self.assertNotIn("pdf_url", paper)
    self.assertNotIn("attachment_key", paper)
651
+
652
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
def test_pdf_attachment_outer_exception(
    self,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
):
    """If ``children()`` hands back ``None``, the paper is kept without PDF fields."""
    mock_hydra_compose.return_value = dummy_cfg
    mock_hydra_init.return_value.__enter__.return_value = None

    article = {
        "data": {
            "key": "paper1",
            "title": "Paper 1",
            "abstractNote": "Abstract 1",
            "date": "2021",
            "url": "http://example.com",
            "itemType": "journalArticle",
            "creators": [
                {
                    "firstName": "Bob",
                    "lastName": "Jones",
                    "creatorType": "author",
                }
            ],
        }
    }

    zot_client = MagicMock()
    zot_client.items.return_value = [article]
    # A non-iterable result stands in for a malformed children() response.
    zot_client.children.return_value = None
    mock_zotero_class.return_value = zot_client
    mock_get_item_collections.return_value = {"paper1": ["/Test Collection"]}

    result = zotero_read.run(
        {
            "query": "outer exception test",
            "only_articles": True,
            "tool_call_id": "test_pdf_outer_exception",
            "limit": 1,
        }
    )
    papers = result.update["article_data"]

    # The paper survives; none of the PDF-related keys were attached.
    self.assertIn("paper1", papers)
    for pdf_key in ("pdf_url", "filename", "attachment_key"):
        self.assertNotIn(pdf_key, papers["paper1"])
715
+
716
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.requests.get")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
def test_download_zotero_pdf_exception(
    self,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
    mock_requests_get,
):
    """``_download_zotero_pdf`` yields ``None`` for an attachment it cannot fetch."""
    # Configuration and the Zotero client are both replaced by mocks.
    mock_hydra_init.return_value.__enter__.return_value = None
    mock_hydra_compose.return_value = dummy_cfg
    mock_get_item_collections.return_value = {}
    mock_zotero_class.return_value = MagicMock()

    # Any call to the patched requests.get fails.
    mock_requests_get.side_effect = Exception("Simulated download failure")

    searcher = ZoteroSearchData(
        query="test", only_articles=False, limit=1, tool_call_id="test123"
    )
    downloaded = searcher._download_zotero_pdf("FAKE_ATTACHMENT_KEY")

    self.assertIsNone(downloaded)
750
+
751
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.zotero_path.get_item_collections"
)
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.zotero.Zotero")
@patch("aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.compose")
@patch(
    "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper.hydra.initialize"
)
def test_download_pdf_exception_logging(
    self,
    mock_hydra_init,
    mock_hydra_compose,
    mock_zotero_class,
    mock_get_item_collections,
):
    """A failing PDF download must not break the search pipeline.

    ``_download_zotero_pdf`` is patched to raise; the paper must still be
    present in ``article_data`` and must not carry a ``pdf_url``.
    """
    mock_hydra_compose.return_value = dummy_cfg
    mock_hydra_init.return_value.__enter__.return_value = None

    fake_zot = MagicMock()
    fake_zot.items.return_value = [
        {
            "data": {
                "key": "paper1",
                "title": "Fake Title",
                "itemType": "journalArticle",
            }
        }
    ]
    # Simulate an attachment
    fake_zot.children.return_value = [
        {
            "data": {
                "key": "attachment1",
                "filename": "file1.pdf",
                "contentType": "application/pdf",
            }
        }
    ]

    mock_zotero_class.return_value = fake_zot
    mock_get_item_collections.return_value = {"paper1": ["/Fake Collection"]}

    # Patch just the internal _download_zotero_pdf to raise an exception
    with patch(
        "aiagents4pharma.talk2scholars.tools.zotero.utils.read_helper."
        "ZoteroSearchData._download_zotero_pdf"
    ) as mock_download_pdf:
        mock_download_pdf.side_effect = Exception("Simulated download error")

        search = ZoteroSearchData(
            query="failure test",
            only_articles=True,
            limit=1,
            tool_call_id="fail_test",
        )
        search.process_search()

        article_data = search.get_search_results()["article_data"]
        # unittest assertions (not bare ``assert``): they are not stripped
        # under ``python -O`` and report the offending values on failure.
        self.assertIn("paper1", article_data)
        # download failed, no URL
        self.assertNotIn("pdf_url", article_data["paper1"])
@@ -4,14 +4,8 @@ This package provides modules for fetching and downloading academic papers from
4
4
  """
5
5
 
6
6
  # Import modules
7
- from . import abstract_downloader
8
- from . import arxiv_downloader
9
7
  from . import download_arxiv_input
10
- from .download_arxiv_input import download_arxiv_paper
11
8
 
12
9
  __all__ = [
13
- "abstract_downloader",
14
- "arxiv_downloader",
15
10
  "download_arxiv_input",
16
- "download_arxiv_paper",
17
11
  ]
@@ -1,64 +1,122 @@
1
- # File: aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py
1
+ #!/usr/bin/env python3
2
2
  """
3
- This module defines the `download_arxiv_paper` tool, which leverages the
4
- `ArxivPaperDownloader` class to fetch and download academic papers from arXiv
5
- based on their unique arXiv ID.
3
+ Tool for downloading arXiv paper metadata and retrieving the PDF URL.
6
4
  """
5
+
6
+ import logging
7
+ import xml.etree.ElementTree as ET
7
8
  from typing import Annotated, Any
8
- from pydantic import BaseModel, Field
9
- from langchain_core.tools import tool
9
+
10
+ import hydra
11
+ import requests
10
12
  from langchain_core.messages import ToolMessage
13
+ from langchain_core.tools import tool
11
14
  from langchain_core.tools.base import InjectedToolCallId
12
15
  from langgraph.types import Command
16
+ from pydantic import BaseModel, Field
13
17
 
14
- # Local import from the same package:
15
- from .arxiv_downloader import ArxivPaperDownloader
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
16
21
 
17
22
 
18
23
  class DownloadArxivPaperInput(BaseModel):
19
- """
20
- Input schema for the arXiv paper download tool.
21
- (Optional: if you decide to keep Pydantic validation in the future)
22
- """
24
+ """Input schema for the arXiv paper download tool."""
23
25
 
24
26
  arxiv_id: str = Field(
25
- description="The arXiv paper ID used to retrieve the paper details and PDF."
27
+ description="The arXiv paper ID used to retrieve the paper details and PDF URL."
26
28
  )
27
29
  tool_call_id: Annotated[str, InjectedToolCallId]
28
30
 
29
31
 
32
def fetch_arxiv_metadata(
    api_url: str, arxiv_id: str, request_timeout: int
) -> ET.Element:
    """Fetch and parse metadata from the arXiv API.

    Args:
        api_url: Base URL of the arXiv query endpoint.
        arxiv_id: arXiv identifier to search for; it is percent-encoded before
            being placed in the query string.
        request_timeout: Value passed as ``timeout`` to ``requests.get``.

    Returns:
        The root element of the parsed XML response.

    Raises:
        requests.HTTPError: If the response status is an error
            (via ``raise_for_status``).
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # The ID comes from tool input; encode it so characters such as "&", "#"
    # or spaces cannot corrupt the query or inject extra parameters. "/" is
    # kept because old-style IDs (e.g. "hep-th/9901001") contain it, and
    # ordinary IDs are left unchanged by the encoding.
    encoded_id = quote(arxiv_id, safe="/")
    query_url = f"{api_url}?search_query=id:{encoded_id}&start=0&max_results=1"
    response = requests.get(query_url, timeout=request_timeout)
    response.raise_for_status()
    return ET.fromstring(response.text)
40
+
41
+
42
def extract_metadata(entry: ET.Element, ns: dict, arxiv_id: str) -> dict:
    """Extract metadata from the XML entry.

    Args:
        entry: The ``atom:entry`` element describing one paper.
        ns: Namespace mapping used to resolve the ``atom:`` prefix.
        arxiv_id: The arXiv ID the entry was requested for; used for the
            ``filename`` and ``arxiv_id`` fields and in the error message.

    Returns:
        A dict with the keys ``Title``, ``Authors``, ``Abstract``,
        ``Publication Date``, ``URL``, ``pdf_url``, ``filename``, ``source``
        and ``arxiv_id``. Text fields fall back to ``"N/A"`` when the element
        is missing or empty.

    Raises:
        RuntimeError: If the entry has no link whose ``title`` is ``"pdf"``.
    """

    def _text_or_na(tag: str) -> str:
        # An element that exists but is empty (e.g. <title/>) has text None;
        # treat it like a missing element instead of failing on .strip().
        elem = entry.find(tag, ns)
        if elem is None or elem.text is None:
            return "N/A"
        return elem.text.strip()

    authors = []
    for author_elem in entry.findall("atom:author", ns):
        name_elem = author_elem.find("atom:name", ns)
        # Skip authors without a usable name rather than crashing on None.
        if name_elem is not None and name_elem.text is not None:
            authors.append(name_elem.text.strip())

    # First link flagged as the PDF, if any.
    pdf_url = next(
        (
            link.attrib.get("href")
            for link in entry.findall("atom:link", ns)
            if link.attrib.get("title") == "pdf"
        ),
        None,
    )
    if not pdf_url:
        raise RuntimeError(f"Could not find PDF URL for arXiv ID {arxiv_id}")

    return {
        "Title": _text_or_na("atom:title"),
        "Authors": authors,
        "Abstract": _text_or_na("atom:summary"),
        "Publication Date": _text_or_na("atom:published"),
        "URL": pdf_url,
        "pdf_url": pdf_url,
        "filename": f"{arxiv_id}.pdf",
        "source": "arxiv",
        "arxiv_id": arxiv_id,
    }
81
+
82
+
30
83
  @tool(args_schema=DownloadArxivPaperInput, parse_docstring=True)
31
84
  def download_arxiv_paper(
32
85
  arxiv_id: str,
33
86
  tool_call_id: Annotated[str, InjectedToolCallId],
34
87
  ) -> Command[Any]:
35
88
  """
36
- Download an arXiv paper's PDF using its unique arXiv ID.
89
+ Get metadata and PDF URL for an arXiv paper using its unique arXiv ID.
90
+ """
91
+ logger.info("Fetching metadata from arXiv for paper ID: %s", arxiv_id)
37
92
 
38
- This function:
39
- 1. Creates an `ArxivPaperDownloader` instance.
40
- 2. Fetches metadata from arXiv using the provided `arxiv_id`.
41
- 3. Downloads the PDF from the returned link.
42
- 4. Returns a `Command` object containing the PDF data and a success message.
93
+ # Load configuration
94
+ with hydra.initialize(version_base=None, config_path="../../configs"):
95
+ cfg = hydra.compose(
96
+ config_name="config", overrides=["tools/download_arxiv_paper=default"]
97
+ )
98
+ api_url = cfg.tools.download_arxiv_paper.api_url
99
+ request_timeout = cfg.tools.download_arxiv_paper.request_timeout
43
100
 
44
- Args:
45
- arxiv_id (str): The unique arXiv paper ID.
46
- tool_call_id (InjectedToolCallId): A unique identifier for tracking this tool call.
101
+ # Fetch and parse metadata
102
+ root = fetch_arxiv_metadata(api_url, arxiv_id, request_timeout)
103
+ ns = {"atom": "http://www.w3.org/2005/Atom"}
47
104
 
48
- Returns:
49
- Command[Any]: Contains metadata and messages about the success of the operation.
50
- """
51
- downloader = ArxivPaperDownloader()
105
+ entry = root.find("atom:entry", ns)
106
+ if entry is None:
107
+ raise ValueError(f"No entry found for arXiv ID {arxiv_id}")
108
+
109
+ # Extract metadata
110
+ metadata = extract_metadata(entry, ns, arxiv_id)
52
111
 
53
- # If the downloader fails or the arxiv_id is invalid, this might raise an error
54
- pdf_data = downloader.download_pdf(arxiv_id)
55
- # print (pdf_data)
112
+ # Create article_data entry with the paper ID as the key
113
+ article_data = {arxiv_id: metadata}
56
114
 
57
- content = f"Successfully downloaded PDF for arXiv ID {arxiv_id}"
115
+ content = f"Successfully retrieved metadata and PDF URL for arXiv ID {arxiv_id}"
58
116
 
59
117
  return Command(
60
118
  update={
61
- "pdf_data": pdf_data,
119
+ "article_data": article_data,
62
120
  "messages": [ToolMessage(content=content, tool_call_id=tool_call_id)],
63
121
  }
64
122
  )