aiagents4pharma 1.40.1__py3-none-any.whl → 1.41.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (46)
  1. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
  2. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
  3. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
  4. aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
  5. aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
  6. aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
  7. aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
  8. aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
  9. aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
  10. aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
  11. aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
  12. aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
  13. aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
  14. aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
  15. aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
  16. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
  17. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
  18. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
  19. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
  20. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
  21. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
  22. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  23. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
  24. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  25. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  26. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
  27. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
  35. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
  36. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
  37. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
  38. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
  39. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
  40. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
  41. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.41.0.dist-info}/METADATA +27 -115
  42. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.41.0.dist-info}/RECORD +45 -23
  43. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
  44. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.41.0.dist-info}/WHEEL +0 -0
  45. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.41.0.dist-info}/licenses/LICENSE +0 -0
  46. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.41.0.dist-info}/top_level.txt +0 -0
@@ -3,3 +3,7 @@ s2_agent: |
3
3
  You are the S2 Agent.
4
4
 
5
5
  You are responsible for searching academic papers, getting recommendations based on the searched articles, and displaying the results.
6
+
7
+ IMPORTANT INSTRUCTION FOR AGENT BEHAVIOR:
8
+ If the user's request involves extracting paper IDs to download papers, your task is only to extract those IDs using the `query_dataframe`. Do not attempt to download the paper yourself or call any other tools after extracting the IDs.
9
+ Once the IDs are successfully extracted, immediately pause execution and return control to the main agent. The main agent is responsible for invoking the appropriate tool or sub-agent to handle the paper download.
@@ -1,17 +1,45 @@
1
- # Default configuration for the PDF question_and_answer Tool
1
+ # Configuration for the PDF question_and_answer Tool - Traditional RAG Pipeline with GPU Support
2
+
3
+ # Milvus vector database settings
4
+ milvus:
5
+ # Connection settings
6
+ host: ${oc.env:MILVUS_HOST,localhost} # Changed default from 127.0.0.1 to localhost
7
+ port: ${oc.env:MILVUS_PORT,19530}
8
+
9
+ # Database and collection settings
10
+ db_name: ${oc.env:MILVUS_DB_NAME,pdf_rag_db}
11
+ collection_name: ${oc.env:MILVUS_COLLECTION_NAME,pdf_rag_documents}
12
+
13
+ # Ensure collection persists across restarts
14
+ consistency_level: "Strong"
15
+ embedding_dim: 768
16
+
17
+ # Document processing settings
2
18
  chunk_size: 1200 # Number of characters per text chunk
3
19
  chunk_overlap: 200 # Overlap between adjacent chunks
4
- top_k_papers: 5 # Number of papers to rank and retrieve
5
- top_k_chunks: 25 # Number of chunks to retrieve
20
+
21
+ # Parallel processing settings
22
+ embedding_batch_size: 1500 # Number of chunks to embed in a single API call
23
+ max_parallel_pdfs: 10 # Maximum number of PDFs to process in parallel
24
+
25
+ # Traditional RAG Pipeline Settings
26
+ # Step 1: Initial retrieval (cast wide net)
27
+ initial_retrieval_k: 100 # Number of chunks to retrieve before reranking
28
+ mmr_diversity: 0.8 # MMR diversity parameter (0=max diversity, 1=max relevance)
29
+
30
+ # Step 2: Reranking settings
31
+ top_k_chunks: 25 # Final number of chunks after reranking
6
32
  reranker:
7
33
  model: "nvidia/nv-rerankqa-mistral-4b-v3"
8
34
  api_key: ${oc.env:NVIDIA_API_KEY}
35
+
36
+ # Answer generation settings
9
37
  prompt_template: |
10
38
  You are a scientific research assistant specialized in reading and extracting information from research papers.
11
39
  Your role is to answer questions by retrieving relevant information from the provided context.
12
40
 
13
41
  - Provide detailed, structured, and well-argued explanations—not just brief summaries.
14
- - Cite specific sources using onky the title of the paper.
42
+ - Cite specific sources using only the title of the paper.
15
43
  - If the context is insufficient, clearly state that more information is needed.
16
44
 
17
45
  Context:
@@ -20,3 +48,15 @@ prompt_template: |
20
48
  Question: {question}
21
49
 
22
50
  Your answer should be comprehensive, accurate, and clearly structured for a scientific audience.
51
+
52
+ # GPU Detection and Performance Settings
53
+ gpu_detection:
54
+ # Timeout for GPU detection command (seconds)
55
+ detection_timeout: 10
56
+
57
+ # Log GPU detection results
58
+ log_detection: true
59
+
60
+ # Force CPU mode even if GPU is detected (for testing CPU Milvus)
61
+ # Uncomment the line below to force CPU mode:
62
+ # force_cpu_mode: true
@@ -0,0 +1,127 @@
1
+ """
2
+ Unit tests for NVIDIA NIM reranker error handling in nvidia_nim_reranker.py
3
+ """
4
+
5
+ from unittest.mock import MagicMock, patch
6
+
7
+ import pytest
8
+ from langchain_core.documents import Document
9
+
10
+ from aiagents4pharma.talk2scholars.tools.pdf.utils import nvidia_nim_reranker
11
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker import (
12
+ rerank_chunks,
13
+ )
14
+
15
+
16
+ @pytest.fixture(name="chunks_fixture")
17
+ def fixture_chunks():
18
+ """chunks_fixture fixture to simulate PDF chunks."""
19
+ return [
20
+ Document(
21
+ page_content=f"chunk {i}",
22
+ metadata={"paper_id": f"P{i%2}", "relevance_score": 0.9 - 0.01 * i},
23
+ )
24
+ for i in range(10)
25
+ ]
26
+
27
+
28
+ def test_rerank_chunks_short_input(chunks_fixture):
29
+ """rerank_chunks with fewer chunks than top_k should return original."""
30
+ result = rerank_chunks(
31
+ chunks_fixture[:3], "What is cancer?", config=MagicMock(), top_k=5
32
+ )
33
+ assert result == chunks_fixture[:3]
34
+
35
+
36
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.logger")
37
+ def test_rerank_chunks_missing_api_key_logs_and_raises(mock_logger, chunks_fixture):
38
+ """
39
+ If config.reranker.api_key is None:
40
+ - logger.error(...) should be called
41
+ - rerank_chunks should raise ValueError
42
+ """
43
+ mock_config = MagicMock()
44
+ mock_config.reranker.api_key = None
45
+
46
+ with pytest.raises(
47
+ ValueError,
48
+ match="Configuration 'reranker.api_key' must be set for reranking",
49
+ ):
50
+ rerank_chunks(chunks_fixture, "What is cancer?", config=mock_config, top_k=5)
51
+
52
+ mock_logger.error.assert_called_once_with(
53
+ "No NVIDIA API key found in configuration for reranking"
54
+ )
55
+
56
+
57
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
58
+ def test_rerank_chunks_success(mock_reranker_cls, chunks_fixture):
59
+ """rerank_chunks with successful reranking."""
60
+ reranker_instance = MagicMock()
61
+ reranker_instance.compress_documents.return_value = list(reversed(chunks_fixture))
62
+ mock_reranker_cls.return_value = reranker_instance
63
+
64
+ mock_config = MagicMock()
65
+ mock_config.reranker.api_key = "test_key"
66
+ mock_config.reranker.model = "test_model"
67
+
68
+ result = rerank_chunks(
69
+ chunks_fixture, "Explain mitochondria.", config=mock_config, top_k=5
70
+ )
71
+
72
+ assert isinstance(result, list)
73
+ assert result == list(reversed(chunks_fixture))[:5]
74
+ reranker_instance.compress_documents.assert_called_once_with(
75
+ query="Explain mitochondria.", documents=chunks_fixture
76
+ )
77
+
78
+
79
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
80
+ def test_rerank_chunks_reranker_fails_raises_and_calls_compress(
81
+ mock_reranker_cls, chunks_fixture
82
+ ):
83
+ """
84
+ If NVIDIARerank.compress_documents raises RuntimeError:
85
+ - rerank_chunks should propagate the RuntimeError
86
+ - and compress_documents should have been called
87
+ """
88
+ reranker_instance = MagicMock()
89
+ reranker_instance.compress_documents.side_effect = RuntimeError("API failure")
90
+ mock_reranker_cls.return_value = reranker_instance
91
+
92
+ mock_config = MagicMock()
93
+ mock_config.reranker.api_key = "valid_key"
94
+ mock_config.reranker.model = "reranker"
95
+
96
+ with pytest.raises(RuntimeError, match="API failure"):
97
+ rerank_chunks(
98
+ chunks_fixture, "How does light affect plants?", config=mock_config, top_k=3
99
+ )
100
+
101
+ reranker_instance.compress_documents.assert_called_once_with(
102
+ query="How does light affect plants?", documents=chunks_fixture
103
+ )
104
+
105
+
106
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.logger")
107
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
108
+ def test_rerank_chunks_debug_block_triggered(
109
+ mock_reranker_cls, mock_logger, chunks_fixture
110
+ ):
111
+ """rerank_chunks should log debug info if debug logging is enabled."""
112
+ mock_logger.isEnabledFor.return_value = True
113
+
114
+ reranker_instance = MagicMock()
115
+ reranker_instance.compress_documents.return_value = chunks_fixture
116
+ mock_reranker_cls.return_value = reranker_instance
117
+
118
+ mock_config = MagicMock()
119
+ mock_config.reranker.api_key = "abc"
120
+ mock_config.reranker.model = "mymodel"
121
+
122
+ result = nvidia_nim_reranker.rerank_chunks(
123
+ chunks_fixture * 2, "Test query", mock_config, top_k=3
124
+ )
125
+
126
+ assert result == chunks_fixture[:3]
127
+ assert mock_logger.debug.called
@@ -0,0 +1,66 @@
1
+ """answer_formatter tests."""
2
+
3
+ from unittest.mock import patch
4
+ import pytest
5
+
6
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter import format_answer
7
+
8
+
9
+ @pytest.fixture(name="base_args")
10
+ def _base_args():
11
+ """base_args fixture to provide common arguments for tests."""
12
+ return {
13
+ "question": "What is the conclusion?",
14
+ "chunks": [{"content": "chunk1"}, {"content": "chunk2"}],
15
+ "llm": "mock_llm",
16
+ "articles": {
17
+ "paper1": {"Title": "Paper One"},
18
+ "paper2": {"Title": "Paper Two"},
19
+ },
20
+ "config": {"key": "value"},
21
+ "call_id": "test_call_123",
22
+ "has_gpu": True,
23
+ }
24
+
25
+
26
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
27
+ def test_format_answer_with_sources(mock_generate_answer, base_args):
28
+ """test format_answer with sources."""
29
+ mock_generate_answer.return_value = {
30
+ "output_text": "This is the generated answer.",
31
+ "papers_used": ["paper1", "paper2"],
32
+ }
33
+
34
+ result = format_answer(**base_args)
35
+
36
+ assert "This is the generated answer." in result
37
+ assert "Sources:" in result
38
+ assert "- Paper One" in result
39
+ assert "- Paper Two" in result
40
+ mock_generate_answer.assert_called_once()
41
+
42
+
43
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
44
+ def test_format_answer_no_sources(mock_generate_answer, base_args):
45
+ """test format_answer with no sources."""
46
+ mock_generate_answer.return_value = {
47
+ "output_text": "No sources were used.",
48
+ "papers_used": [], # No papers used
49
+ }
50
+
51
+ result = format_answer(**base_args)
52
+
53
+ assert result == "No sources were used." # No sources section expected
54
+ mock_generate_answer.assert_called_once()
55
+
56
+
57
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
58
+ def test_format_answer_missing_output_text(mock_generate_answer, base_args):
59
+ """test format_answer with missing output text."""
60
+ mock_generate_answer.return_value = {"papers_used": ["paper1"]}
61
+
62
+ result = format_answer(**base_args)
63
+
64
+ assert result.startswith("No answer generated.")
65
+ assert "Sources:" in result
66
+ mock_generate_answer.assert_called_once()
@@ -0,0 +1,101 @@
1
+ """Tests for the PDF batch processor module."""
2
+
3
+ from unittest.mock import MagicMock, patch
4
+ import pytest
5
+
6
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor import (
7
+ add_papers_batch,
8
+ )
9
+
10
+
11
+ @pytest.fixture(name="args_fixture")
12
+ def _args_fixture():
13
+ """Provides common arguments for tests."""
14
+ return {
15
+ "vector_store": MagicMock(),
16
+ "loaded_papers": set(),
17
+ "paper_metadata": {},
18
+ "documents": {},
19
+ "config": {"param": "value"},
20
+ "metadata_fields": ["Title", "Author"],
21
+ "has_gpu": False,
22
+ }
23
+
24
+
25
+ @patch(
26
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
27
+ )
28
+ def test_no_papers_to_add(mock_loader, args_fixture):
29
+ """Test case where no papers are provided to add."""
30
+ add_papers_batch(papers_to_add=[], **args_fixture)
31
+ mock_loader.assert_not_called()
32
+
33
+
34
+ @patch(
35
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
36
+ )
37
+ def test_all_papers_already_loaded(mock_loader, args_fixture):
38
+ """Test case where all papers are already loaded."""
39
+ args_fixture["loaded_papers"].update(["p1", "p2"])
40
+ add_papers_batch(
41
+ papers_to_add=[("p1", "url1", {}), ("p2", "url2", {})],
42
+ **args_fixture,
43
+ )
44
+ mock_loader.assert_not_called()
45
+
46
+
47
+ @patch(
48
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
49
+ )
50
+ def test_successful_batch_embedding(mock_loader, args_fixture):
51
+ """Test case where papers are successfully loaded and embedded."""
52
+ mock_loader.return_value = [
53
+ MagicMock(page_content="Page 1"),
54
+ MagicMock(page_content="Page 2"),
55
+ ]
56
+
57
+ mock_collection = MagicMock()
58
+ mock_collection.num_entities = 2
59
+ mock_collection.query.return_value = [{"paper_id": "p1"}]
60
+ args_fixture["vector_store"].col = mock_collection
61
+
62
+ add_papers_batch(
63
+ papers_to_add=[("p1", "url1", {"Title": "Paper One"})],
64
+ **args_fixture,
65
+ )
66
+
67
+ assert "p1" in args_fixture["paper_metadata"]
68
+ assert "p1" in args_fixture["loaded_papers"]
69
+ args_fixture["vector_store"].add_documents.assert_called_once()
70
+ mock_collection.flush.assert_called()
71
+
72
+
73
+ @patch(
74
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
75
+ )
76
+ def test_empty_chunks_after_loading(mock_loader, args_fixture):
77
+ """Test case where no chunks are returned after loading PDF."""
78
+ mock_loader.return_value = []
79
+
80
+ add_papers_batch(papers_to_add=[("p1", "url1", {})], **args_fixture)
81
+
82
+ args_fixture["vector_store"].add_documents.assert_not_called()
83
+
84
+
85
+ @patch(
86
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
87
+ )
88
+ def test_vector_store_insert_failure(mock_loader, args_fixture):
89
+ """Test case where vector store insertion fails."""
90
+ mock_loader.return_value = [MagicMock(page_content="page")]
91
+
92
+ def raise_error(*_, **__):
93
+ raise RuntimeError("Vector store failed")
94
+
95
+ args_fixture["vector_store"].add_documents.side_effect = raise_error
96
+
97
+ mock_collection = MagicMock()
98
+ args_fixture["vector_store"].col = mock_collection
99
+
100
+ with pytest.raises(RuntimeError, match="Vector store failed"):
101
+ add_papers_batch(papers_to_add=[("p1", "url1", {})], **args_fixture)
@@ -0,0 +1,150 @@
1
+ """collection_manager for managing Milvus collections for PDF chunks."""
2
+
3
+ from unittest.mock import MagicMock, patch
4
+ from dataclasses import dataclass, field
5
+ import pytest
6
+
7
+ from aiagents4pharma.talk2scholars.tools.pdf.utils import collection_manager
8
+
9
+
10
+ # -- Fixtures --
11
+
12
+
13
+ @pytest.fixture
14
+ def config_mock():
15
+ """Dataclass config fixture to simulate Milvus config."""
16
+
17
+ @dataclass
18
+ class MilvusConfig:
19
+ """Simulated Milvus inner config."""
20
+
21
+ embedding_dim: int = 768
22
+
23
+ @dataclass
24
+ class Config:
25
+ """Simulated outer config."""
26
+
27
+ milvus: MilvusConfig = field(default_factory=MilvusConfig)
28
+
29
+ return Config()
30
+
31
+
32
+ @pytest.fixture
33
+ def index_params():
34
+ """Fixture to provide index parameters for tests."""
35
+ return {"index_type": "IVF_FLAT", "params": {"nlist": 128}, "metric_type": "L2"}
36
+
37
+
38
+ # -- Safe collection_cache access --
39
+
40
+
41
+ def set_collection_cache(key, value):
42
+ """Set a mocked collection into the cache."""
43
+ getattr(collection_manager, "_collection_cache")[key] = value
44
+
45
+
46
+ def clear_collection_cache(key):
47
+ """Remove a mocked collection from the cache."""
48
+ getattr(collection_manager, "_collection_cache").pop(key, None)
49
+
50
+
51
+ # -- Tests --
52
+
53
+
54
+ def test_cached_collection_returned(request):
55
+ """Check if cached collection is returned."""
56
+ config = request.getfixturevalue("config_mock")
57
+ index = request.getfixturevalue("index_params")
58
+ mock_collection = MagicMock()
59
+ collection_name = "test_cached"
60
+
61
+ set_collection_cache(collection_name, mock_collection)
62
+
63
+ result = collection_manager.ensure_collection_exists(
64
+ collection_name, config, index, has_gpu=False
65
+ )
66
+
67
+ assert result == mock_collection
68
+ clear_collection_cache(collection_name)
69
+
70
+
71
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
72
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
73
+ def test_create_new_collection(mock_utility, mock_collection_cls, request):
74
+ """Check if new collection is created when it does not exist."""
75
+ config = request.getfixturevalue("config_mock")
76
+ index = request.getfixturevalue("index_params")
77
+ mock_utility.list_collections.return_value = []
78
+
79
+ mock_collection = MagicMock()
80
+ mock_collection_cls.return_value = mock_collection
81
+ mock_collection.indexes = [MagicMock(field_name="embedding")]
82
+ mock_collection.num_entities = 5
83
+
84
+ result = collection_manager.ensure_collection_exists(
85
+ "new_collection", config, index, has_gpu=True
86
+ )
87
+
88
+ assert mock_collection.create_index.called
89
+ assert mock_collection.load.called
90
+ assert result == mock_collection
91
+
92
+
93
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
94
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
95
+ def test_load_existing_collection(mock_utility, mock_collection_cls, request):
96
+ """Test loading an existing collection."""
97
+ config = request.getfixturevalue("config_mock")
98
+ index = request.getfixturevalue("index_params")
99
+ mock_utility.list_collections.return_value = ["existing_collection"]
100
+
101
+ mock_collection = MagicMock()
102
+ mock_collection_cls.return_value = mock_collection
103
+ mock_collection.indexes = []
104
+ mock_collection.num_entities = 0
105
+
106
+ result = collection_manager.ensure_collection_exists(
107
+ "existing_collection", config, index, has_gpu=False
108
+ )
109
+
110
+ mock_collection.load.assert_called_once()
111
+ assert result == mock_collection
112
+
113
+
114
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
115
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
116
+ def test_debug_collection_state_failure(mock_utility, mock_collection_cls, request):
117
+ """debug_collection_state should log but not raise on failure."""
118
+ config = request.getfixturevalue("config_mock")
119
+ index = request.getfixturevalue("index_params")
120
+ mock_utility.list_collections.return_value = ["bad_collection"]
121
+
122
+ mock_collection = MagicMock()
123
+ mock_collection_cls.return_value = mock_collection
124
+ mock_collection.indexes = []
125
+ mock_collection.num_entities = 10
126
+
127
+ mock_collection.schema = property(
128
+ lambda _: (_ for _ in ()).throw(Exception("bad schema"))
129
+ )
130
+
131
+ result = collection_manager.ensure_collection_exists(
132
+ "bad_collection", config, index, has_gpu=True
133
+ )
134
+
135
+ assert result == mock_collection
136
+
137
+
138
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
139
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
140
+ def test_ensure_collection_exception(mock_utility, mock_collection_cls, request):
141
+ """ensure_collection_exists should raise on utility failure."""
142
+ config = request.getfixturevalue("config_mock")
143
+ index = request.getfixturevalue("index_params")
144
+ mock_utility.list_collections.side_effect = RuntimeError("milvus failure")
145
+ mock_collection_cls.return_value = MagicMock()
146
+
147
+ with pytest.raises(RuntimeError, match="milvus failure"):
148
+ collection_manager.ensure_collection_exists(
149
+ "fail_collection", config, index, has_gpu=False
150
+ )
@@ -0,0 +1,69 @@
1
+ """Unit tests for PDF document processing utilities."""
2
+
3
+ from unittest.mock import MagicMock, patch
4
+ import pytest
5
+
6
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor import (
7
+ load_and_split_pdf,
8
+ )
9
+
10
+
11
+ @pytest.fixture(name="base_args_params")
12
+ def _base_args_params():
13
+ """base_args_params fixture to provide common arguments for tests."""
14
+ return {
15
+ "paper_id": "P123",
16
+ "pdf_url": "mock/path/to/paper.pdf",
17
+ "paper_metadata": {"Title": "Test Paper", "Author": "A. Researcher"},
18
+ "config": type("Config", (), {"chunk_size": 1000, "chunk_overlap": 200})(),
19
+ "metadata_fields": ["Author"],
20
+ "documents_dict": {},
21
+ }
22
+
23
+
24
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor.PyPDFLoader")
25
+ @patch(
26
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor."
27
+ "RecursiveCharacterTextSplitter"
28
+ )
29
+ def test_load_and_split_pdf_success(
30
+ mock_splitter_cls, mock_loader_cls, base_args_params
31
+ ):
32
+ """load_and_split_pdf should load and split PDF correctly."""
33
+ mock_doc = MagicMock()
34
+ mock_doc.metadata = {"page": 1}
35
+ mock_loader = MagicMock()
36
+ mock_loader.load.return_value = [mock_doc]
37
+ mock_loader_cls.return_value = mock_loader
38
+
39
+ mock_splitter = MagicMock()
40
+ chunk1 = MagicMock()
41
+ chunk1.metadata = {"page": 1}
42
+ mock_splitter.split_documents.return_value = [chunk1]
43
+ mock_splitter_cls.return_value = mock_splitter
44
+
45
+ chunks = load_and_split_pdf(**base_args_params)
46
+
47
+ assert len(chunks) == 1
48
+ assert "P123_0" in base_args_params["documents_dict"]
49
+ stored_chunk = base_args_params["documents_dict"]["P123_0"]
50
+ assert stored_chunk.metadata["paper_id"] == "P123"
51
+ assert stored_chunk.metadata["title"] == "Test Paper"
52
+ assert stored_chunk.metadata["chunk_id"] == 0
53
+ assert stored_chunk.metadata["page"] == 1
54
+ assert stored_chunk.metadata["source"] == base_args_params["pdf_url"]
55
+ assert stored_chunk.metadata["Author"] == "A. Researcher"
56
+
57
+
58
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor.PyPDFLoader")
59
+ def test_load_and_split_pdf_raises_if_config_missing(mock_loader_cls, base_args_params):
60
+ """load_and_split_pdf should raise ValueError if config is None."""
61
+ mock_loader = MagicMock()
62
+ mock_loader.load.return_value = [MagicMock()]
63
+ mock_loader_cls.return_value = mock_loader
64
+
65
+ base_args_params["config"] = None
66
+ with pytest.raises(
67
+ ValueError, match="Configuration is required for text splitting in Vectorstore."
68
+ ):
69
+ load_and_split_pdf(**base_args_params)
@@ -0,0 +1,75 @@
1
+ """generate_answer tests for the PDF tool"""
2
+
3
+ from unittest.mock import MagicMock
4
+ import pytest
5
+
6
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import (
7
+ _build_context_and_sources,
8
+ generate_answer,
9
+ )
10
+
11
+
12
+ @pytest.fixture(name="chunks_fixture")
13
+ def _chunks_fixture():
14
+ """Fixture providing sample document chunks."""
15
+ doc1 = MagicMock()
16
+ doc1.page_content = "This is chunk one."
17
+ doc1.metadata = {"paper_id": "P1", "title": "Title 1", "page": 1}
18
+
19
+ doc2 = MagicMock()
20
+ doc2.page_content = "This is chunk two."
21
+ doc2.metadata = {"paper_id": "P1", "title": "Title 1", "page": 2}
22
+
23
+ doc3 = MagicMock()
24
+ doc3.page_content = "This is chunk three."
25
+ doc3.metadata = {"paper_id": "P2", "title": "Title 2", "page": 1}
26
+
27
+ return [doc1, doc2, doc3]
28
+
29
+
30
+ def test_build_context_and_sources_formatting(chunks_fixture):
31
+ """_build_context_and_sources should format context and sources correctly."""
32
+ context, sources = _build_context_and_sources(chunks_fixture)
33
+
34
+ assert "[Document 1] From: 'Title 1' (ID: P1)" in context
35
+ assert "Page 1: This is chunk one." in context
36
+ assert "Page 2: This is chunk two." in context
37
+ assert "[Document 2] From: 'Title 2' (ID: P2)" in context
38
+ assert "Page 1: This is chunk three." in context
39
+ assert sources == {"P1", "P2"}
40
+
41
+
42
+ def test_generate_answer_success(chunks_fixture):
43
+ """generate_answer should return formatted answer and sources."""
44
+ mock_llm = MagicMock()
45
+ mock_llm.invoke.return_value.content = "The answer is XYZ."
46
+
47
+ config = {
48
+ "prompt_template": "Answer the question based on the context."
49
+ "\n\n{context}\n\nQ: {question}\nA:"
50
+ }
51
+
52
+ result = generate_answer("What is the result?", chunks_fixture, mock_llm, config)
53
+
54
+ assert result["output_text"] == "The answer is XYZ."
55
+ assert len(result["sources"]) == 3
56
+ assert result["num_sources"] == 3
57
+ assert set(result["papers_used"]) == {"P1", "P2"}
58
+
59
+
60
+ def test_generate_answer_raises_for_none_config(chunks_fixture):
61
+ """generate_answer should raise ValueError for None config."""
62
+ mock_llm = MagicMock()
63
+ with pytest.raises(
64
+ ValueError, match="Configuration for generate_answer is required."
65
+ ):
66
+ generate_answer("Why?", chunks_fixture, mock_llm, config=None)
67
+
68
+
69
+ def test_generate_answer_raises_for_missing_template(chunks_fixture):
70
+ """generate_answer should raise ValueError for missing prompt_template in config."""
71
+ mock_llm = MagicMock()
72
+ with pytest.raises(
73
+ ValueError, match="The prompt_template is missing from the configuration."
74
+ ):
75
+ generate_answer("Why?", chunks_fixture, mock_llm, config={})