aiagents4pharma 1.40.1__py3-none-any.whl → 1.42.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
- aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
- aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
- aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
- aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
- aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
- {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/METADATA +52 -126
- {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/RECORD +52 -25
- aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
- {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/top_level.txt +0 -0
@@ -3,3 +3,7 @@ s2_agent: |
|
|
3
3
|
You are the S2 Agent.
|
4
4
|
|
5
5
|
You are responsible for searching academic papers, getting recommendations based on the searched articles, and displaying the results.
|
6
|
+
|
7
|
+
IMPORTANT INSTRUCTION FOR AGENT BEHAVIOR:
|
8
|
+
If the user's request involves extracting paper IDs to download papers, your task is only to extract those IDs using the `query_dataframe`. Do not attempt to download the paper yourself or call any other tools after extracting the IDs.
|
9
|
+
Once the IDs are successfully extracted, immediately pause execution and return control to the main agent. The main agent is responsible for invoking the appropriate tool or sub-agent to handle the paper download.
|
@@ -1,17 +1,45 @@
|
|
1
|
-
#
|
1
|
+
# Configuration for the PDF question_and_answer Tool - Traditional RAG Pipeline with GPU Support
|
2
|
+
|
3
|
+
# Milvus vector database settings
|
4
|
+
milvus:
|
5
|
+
# Connection settings
|
6
|
+
host: ${oc.env:MILVUS_HOST,localhost} # Changed default from 127.0.0.1 to localhost
|
7
|
+
port: ${oc.env:MILVUS_PORT,19530}
|
8
|
+
|
9
|
+
# Database and collection settings
|
10
|
+
db_name: ${oc.env:MILVUS_DB_NAME,pdf_rag_db}
|
11
|
+
collection_name: ${oc.env:MILVUS_COLLECTION_NAME,pdf_rag_documents}
|
12
|
+
|
13
|
+
# Collection consistency and embedding settings ("Strong" makes reads see the latest writes)
|
14
|
+
consistency_level: "Strong"
|
15
|
+
embedding_dim: 768
|
16
|
+
|
17
|
+
# Document processing settings
|
2
18
|
chunk_size: 1200 # Number of characters per text chunk
|
3
19
|
chunk_overlap: 200 # Overlap between adjacent chunks
|
4
|
-
|
5
|
-
|
20
|
+
|
21
|
+
# Parallel processing settings
|
22
|
+
embedding_batch_size: 1500 # Number of chunks to embed in a single API call
|
23
|
+
max_parallel_pdfs: 10 # Maximum number of PDFs to process in parallel
|
24
|
+
|
25
|
+
# Traditional RAG Pipeline Settings
|
26
|
+
# Step 1: Initial retrieval (cast wide net)
|
27
|
+
initial_retrieval_k: 100 # Number of chunks to retrieve before reranking
|
28
|
+
mmr_diversity: 0.8 # MMR diversity parameter (0=max diversity, 1=max relevance)
|
29
|
+
|
30
|
+
# Step 2: Reranking settings
|
31
|
+
top_k_chunks: 25 # Final number of chunks after reranking
|
6
32
|
reranker:
|
7
33
|
model: "nvidia/nv-rerankqa-mistral-4b-v3"
|
8
34
|
api_key: ${oc.env:NVIDIA_API_KEY}
|
35
|
+
|
36
|
+
# Answer generation settings
|
9
37
|
prompt_template: |
|
10
38
|
You are a scientific research assistant specialized in reading and extracting information from research papers.
|
11
39
|
Your role is to answer questions by retrieving relevant information from the provided context.
|
12
40
|
|
13
41
|
- Provide detailed, structured, and well-argued explanations—not just brief summaries.
|
14
|
-
- Cite specific sources using
|
42
|
+
- Cite specific sources using only the title of the paper.
|
15
43
|
- If the context is insufficient, clearly state that more information is needed.
|
16
44
|
|
17
45
|
Context:
|
@@ -20,3 +48,15 @@ prompt_template: |
|
|
20
48
|
Question: {question}
|
21
49
|
|
22
50
|
Your answer should be comprehensive, accurate, and clearly structured for a scientific audience.
|
51
|
+
|
52
|
+
# GPU Detection and Performance Settings
|
53
|
+
gpu_detection:
|
54
|
+
# Timeout for GPU detection command (seconds)
|
55
|
+
detection_timeout: 10
|
56
|
+
|
57
|
+
# Log GPU detection results
|
58
|
+
log_detection: true
|
59
|
+
|
60
|
+
# Force CPU mode even if GPU is detected (for testing CPU Milvus)
|
61
|
+
# Uncomment the line below to force CPU mode:
|
62
|
+
# force_cpu_mode: true
|
@@ -0,0 +1,127 @@
|
|
1
|
+
"""
|
2
|
+
Unit tests for rerank_chunks in nvidia_nim_reranker.py (success, short-input, debug-logging, and error paths)
|
3
|
+
"""
|
4
|
+
|
5
|
+
from unittest.mock import MagicMock, patch
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from langchain_core.documents import Document
|
9
|
+
|
10
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils import nvidia_nim_reranker
|
11
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker import (
|
12
|
+
rerank_chunks,
|
13
|
+
)
|
14
|
+
|
15
|
+
|
16
|
+
@pytest.fixture(name="chunks_fixture")
|
17
|
+
def fixture_chunks():
|
18
|
+
"""Fixture providing ten simulated PDF chunks alternating between two paper IDs."""
|
19
|
+
return [
|
20
|
+
Document(
|
21
|
+
page_content=f"chunk {i}",
|
22
|
+
metadata={"paper_id": f"P{i%2}", "relevance_score": 0.9 - 0.01 * i},
|
23
|
+
)
|
24
|
+
for i in range(10)
|
25
|
+
]
|
26
|
+
|
27
|
+
|
28
|
+
def test_rerank_chunks_short_input(chunks_fixture):
|
29
|
+
"""rerank_chunks with fewer chunks than top_k should return original."""
|
30
|
+
result = rerank_chunks(
|
31
|
+
chunks_fixture[:3], "What is cancer?", config=MagicMock(), top_k=5
|
32
|
+
)
|
33
|
+
assert result == chunks_fixture[:3]
|
34
|
+
|
35
|
+
|
36
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.logger")
|
37
|
+
def test_rerank_chunks_missing_api_key_logs_and_raises(mock_logger, chunks_fixture):
|
38
|
+
"""
|
39
|
+
If config.reranker.api_key is None:
|
40
|
+
- logger.error(...) should be called
|
41
|
+
- rerank_chunks should raise ValueError
|
42
|
+
"""
|
43
|
+
mock_config = MagicMock()
|
44
|
+
mock_config.reranker.api_key = None
|
45
|
+
|
46
|
+
with pytest.raises(
|
47
|
+
ValueError,
|
48
|
+
match="Configuration 'reranker.api_key' must be set for reranking",
|
49
|
+
):
|
50
|
+
rerank_chunks(chunks_fixture, "What is cancer?", config=mock_config, top_k=5)
|
51
|
+
|
52
|
+
mock_logger.error.assert_called_once_with(
|
53
|
+
"No NVIDIA API key found in configuration for reranking"
|
54
|
+
)
|
55
|
+
|
56
|
+
|
57
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
|
58
|
+
def test_rerank_chunks_success(mock_reranker_cls, chunks_fixture):
|
59
|
+
"""rerank_chunks with successful reranking."""
|
60
|
+
reranker_instance = MagicMock()
|
61
|
+
reranker_instance.compress_documents.return_value = list(reversed(chunks_fixture))
|
62
|
+
mock_reranker_cls.return_value = reranker_instance
|
63
|
+
|
64
|
+
mock_config = MagicMock()
|
65
|
+
mock_config.reranker.api_key = "test_key"
|
66
|
+
mock_config.reranker.model = "test_model"
|
67
|
+
|
68
|
+
result = rerank_chunks(
|
69
|
+
chunks_fixture, "Explain mitochondria.", config=mock_config, top_k=5
|
70
|
+
)
|
71
|
+
|
72
|
+
assert isinstance(result, list)
|
73
|
+
assert result == list(reversed(chunks_fixture))[:5]
|
74
|
+
reranker_instance.compress_documents.assert_called_once_with(
|
75
|
+
query="Explain mitochondria.", documents=chunks_fixture
|
76
|
+
)
|
77
|
+
|
78
|
+
|
79
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
|
80
|
+
def test_rerank_chunks_reranker_fails_raises_and_calls_compress(
|
81
|
+
mock_reranker_cls, chunks_fixture
|
82
|
+
):
|
83
|
+
"""
|
84
|
+
If NVIDIARerank.compress_documents raises RuntimeError:
|
85
|
+
- rerank_chunks should propagate the RuntimeError
|
86
|
+
- and compress_documents should have been called
|
87
|
+
"""
|
88
|
+
reranker_instance = MagicMock()
|
89
|
+
reranker_instance.compress_documents.side_effect = RuntimeError("API failure")
|
90
|
+
mock_reranker_cls.return_value = reranker_instance
|
91
|
+
|
92
|
+
mock_config = MagicMock()
|
93
|
+
mock_config.reranker.api_key = "valid_key"
|
94
|
+
mock_config.reranker.model = "reranker"
|
95
|
+
|
96
|
+
with pytest.raises(RuntimeError, match="API failure"):
|
97
|
+
rerank_chunks(
|
98
|
+
chunks_fixture, "How does light affect plants?", config=mock_config, top_k=3
|
99
|
+
)
|
100
|
+
|
101
|
+
reranker_instance.compress_documents.assert_called_once_with(
|
102
|
+
query="How does light affect plants?", documents=chunks_fixture
|
103
|
+
)
|
104
|
+
|
105
|
+
|
106
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.logger")
|
107
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank")
|
108
|
+
def test_rerank_chunks_debug_block_triggered(
|
109
|
+
mock_reranker_cls, mock_logger, chunks_fixture
|
110
|
+
):
|
111
|
+
"""rerank_chunks should log debug info if debug logging is enabled."""
|
112
|
+
mock_logger.isEnabledFor.return_value = True
|
113
|
+
|
114
|
+
reranker_instance = MagicMock()
|
115
|
+
reranker_instance.compress_documents.return_value = chunks_fixture
|
116
|
+
mock_reranker_cls.return_value = reranker_instance
|
117
|
+
|
118
|
+
mock_config = MagicMock()
|
119
|
+
mock_config.reranker.api_key = "abc"
|
120
|
+
mock_config.reranker.model = "mymodel"
|
121
|
+
|
122
|
+
result = nvidia_nim_reranker.rerank_chunks(
|
123
|
+
chunks_fixture * 2, "Test query", mock_config, top_k=3
|
124
|
+
)
|
125
|
+
|
126
|
+
assert result == chunks_fixture[:3]
|
127
|
+
assert mock_logger.debug.called
|
@@ -0,0 +1,66 @@
|
|
1
|
+
"""answer_formatter tests."""
|
2
|
+
|
3
|
+
from unittest.mock import patch
|
4
|
+
import pytest
|
5
|
+
|
6
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter import format_answer
|
7
|
+
|
8
|
+
|
9
|
+
@pytest.fixture(name="base_args")
|
10
|
+
def _base_args():
|
11
|
+
"""base_args fixture to provide common arguments for tests."""
|
12
|
+
return {
|
13
|
+
"question": "What is the conclusion?",
|
14
|
+
"chunks": [{"content": "chunk1"}, {"content": "chunk2"}],
|
15
|
+
"llm": "mock_llm",
|
16
|
+
"articles": {
|
17
|
+
"paper1": {"Title": "Paper One"},
|
18
|
+
"paper2": {"Title": "Paper Two"},
|
19
|
+
},
|
20
|
+
"config": {"key": "value"},
|
21
|
+
"call_id": "test_call_123",
|
22
|
+
"has_gpu": True,
|
23
|
+
}
|
24
|
+
|
25
|
+
|
26
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
|
27
|
+
def test_format_answer_with_sources(mock_generate_answer, base_args):
|
28
|
+
"""test format_answer with sources."""
|
29
|
+
mock_generate_answer.return_value = {
|
30
|
+
"output_text": "This is the generated answer.",
|
31
|
+
"papers_used": ["paper1", "paper2"],
|
32
|
+
}
|
33
|
+
|
34
|
+
result = format_answer(**base_args)
|
35
|
+
|
36
|
+
assert "This is the generated answer." in result
|
37
|
+
assert "Sources:" in result
|
38
|
+
assert "- Paper One" in result
|
39
|
+
assert "- Paper Two" in result
|
40
|
+
mock_generate_answer.assert_called_once()
|
41
|
+
|
42
|
+
|
43
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
|
44
|
+
def test_format_answer_no_sources(mock_generate_answer, base_args):
|
45
|
+
"""test format_answer with no sources."""
|
46
|
+
mock_generate_answer.return_value = {
|
47
|
+
"output_text": "No sources were used.",
|
48
|
+
"papers_used": [], # No papers used
|
49
|
+
}
|
50
|
+
|
51
|
+
result = format_answer(**base_args)
|
52
|
+
|
53
|
+
assert result == "No sources were used." # No sources section expected
|
54
|
+
mock_generate_answer.assert_called_once()
|
55
|
+
|
56
|
+
|
57
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.answer_formatter.generate_answer")
|
58
|
+
def test_format_answer_missing_output_text(mock_generate_answer, base_args):
|
59
|
+
"""test format_answer with missing output text."""
|
60
|
+
mock_generate_answer.return_value = {"papers_used": ["paper1"]}
|
61
|
+
|
62
|
+
result = format_answer(**base_args)
|
63
|
+
|
64
|
+
assert result.startswith("No answer generated.")
|
65
|
+
assert "Sources:" in result
|
66
|
+
mock_generate_answer.assert_called_once()
|
@@ -0,0 +1,101 @@
|
|
1
|
+
"""Tests for the PDF batch processor module."""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
import pytest
|
5
|
+
|
6
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor import (
|
7
|
+
add_papers_batch,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
@pytest.fixture(name="args_fixture")
|
12
|
+
def _args_fixture():
|
13
|
+
"""Provides common arguments for tests."""
|
14
|
+
return {
|
15
|
+
"vector_store": MagicMock(),
|
16
|
+
"loaded_papers": set(),
|
17
|
+
"paper_metadata": {},
|
18
|
+
"documents": {},
|
19
|
+
"config": {"param": "value"},
|
20
|
+
"metadata_fields": ["Title", "Author"],
|
21
|
+
"has_gpu": False,
|
22
|
+
}
|
23
|
+
|
24
|
+
|
25
|
+
@patch(
|
26
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
|
27
|
+
)
|
28
|
+
def test_no_papers_to_add(mock_loader, args_fixture):
|
29
|
+
"""Test case where no papers are provided to add."""
|
30
|
+
add_papers_batch(papers_to_add=[], **args_fixture)
|
31
|
+
mock_loader.assert_not_called()
|
32
|
+
|
33
|
+
|
34
|
+
@patch(
|
35
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
|
36
|
+
)
|
37
|
+
def test_all_papers_already_loaded(mock_loader, args_fixture):
|
38
|
+
"""Test case where all papers are already loaded."""
|
39
|
+
args_fixture["loaded_papers"].update(["p1", "p2"])
|
40
|
+
add_papers_batch(
|
41
|
+
papers_to_add=[("p1", "url1", {}), ("p2", "url2", {})],
|
42
|
+
**args_fixture,
|
43
|
+
)
|
44
|
+
mock_loader.assert_not_called()
|
45
|
+
|
46
|
+
|
47
|
+
@patch(
|
48
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
|
49
|
+
)
|
50
|
+
def test_successful_batch_embedding(mock_loader, args_fixture):
|
51
|
+
"""Test case where papers are successfully loaded and embedded."""
|
52
|
+
mock_loader.return_value = [
|
53
|
+
MagicMock(page_content="Page 1"),
|
54
|
+
MagicMock(page_content="Page 2"),
|
55
|
+
]
|
56
|
+
|
57
|
+
mock_collection = MagicMock()
|
58
|
+
mock_collection.num_entities = 2
|
59
|
+
mock_collection.query.return_value = [{"paper_id": "p1"}]
|
60
|
+
args_fixture["vector_store"].col = mock_collection
|
61
|
+
|
62
|
+
add_papers_batch(
|
63
|
+
papers_to_add=[("p1", "url1", {"Title": "Paper One"})],
|
64
|
+
**args_fixture,
|
65
|
+
)
|
66
|
+
|
67
|
+
assert "p1" in args_fixture["paper_metadata"]
|
68
|
+
assert "p1" in args_fixture["loaded_papers"]
|
69
|
+
args_fixture["vector_store"].add_documents.assert_called_once()
|
70
|
+
mock_collection.flush.assert_called()
|
71
|
+
|
72
|
+
|
73
|
+
@patch(
|
74
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
|
75
|
+
)
|
76
|
+
def test_empty_chunks_after_loading(mock_loader, args_fixture):
|
77
|
+
"""Test case where no chunks are returned after loading PDF."""
|
78
|
+
mock_loader.return_value = []
|
79
|
+
|
80
|
+
add_papers_batch(papers_to_add=[("p1", "url1", {})], **args_fixture)
|
81
|
+
|
82
|
+
args_fixture["vector_store"].add_documents.assert_not_called()
|
83
|
+
|
84
|
+
|
85
|
+
@patch(
|
86
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor.load_and_split_pdf"
|
87
|
+
)
|
88
|
+
def test_vector_store_insert_failure(mock_loader, args_fixture):
|
89
|
+
"""Test case where vector store insertion fails."""
|
90
|
+
mock_loader.return_value = [MagicMock(page_content="page")]
|
91
|
+
|
92
|
+
def raise_error(*_, **__):
|
93
|
+
raise RuntimeError("Vector store failed")
|
94
|
+
|
95
|
+
args_fixture["vector_store"].add_documents.side_effect = raise_error
|
96
|
+
|
97
|
+
mock_collection = MagicMock()
|
98
|
+
args_fixture["vector_store"].col = mock_collection
|
99
|
+
|
100
|
+
with pytest.raises(RuntimeError, match="Vector store failed"):
|
101
|
+
add_papers_batch(papers_to_add=[("p1", "url1", {})], **args_fixture)
|
@@ -0,0 +1,150 @@
|
|
1
|
+
"""Tests for collection_manager, which manages Milvus collections for PDF chunks."""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
from dataclasses import dataclass, field
|
5
|
+
import pytest
|
6
|
+
|
7
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils import collection_manager
|
8
|
+
|
9
|
+
|
10
|
+
# -- Fixtures --
|
11
|
+
|
12
|
+
|
13
|
+
@pytest.fixture
|
14
|
+
def config_mock():
|
15
|
+
"""Dataclass config fixture to simulate Milvus config."""
|
16
|
+
|
17
|
+
@dataclass
|
18
|
+
class MilvusConfig:
|
19
|
+
"""Simulated Milvus inner config."""
|
20
|
+
|
21
|
+
embedding_dim: int = 768
|
22
|
+
|
23
|
+
@dataclass
|
24
|
+
class Config:
|
25
|
+
"""Simulated outer config."""
|
26
|
+
|
27
|
+
milvus: MilvusConfig = field(default_factory=MilvusConfig)
|
28
|
+
|
29
|
+
return Config()
|
30
|
+
|
31
|
+
|
32
|
+
@pytest.fixture
|
33
|
+
def index_params():
|
34
|
+
"""Fixture to provide index parameters for tests."""
|
35
|
+
return {"index_type": "IVF_FLAT", "params": {"nlist": 128}, "metric_type": "L2"}
|
36
|
+
|
37
|
+
|
38
|
+
# -- Safe collection_cache access --
|
39
|
+
|
40
|
+
|
41
|
+
def set_collection_cache(key, value):
|
42
|
+
"""Set a mocked collection into the cache."""
|
43
|
+
getattr(collection_manager, "_collection_cache")[key] = value
|
44
|
+
|
45
|
+
|
46
|
+
def clear_collection_cache(key):
|
47
|
+
"""Remove a mocked collection from the cache."""
|
48
|
+
getattr(collection_manager, "_collection_cache").pop(key, None)
|
49
|
+
|
50
|
+
|
51
|
+
# -- Tests --
|
52
|
+
|
53
|
+
|
54
|
+
def test_cached_collection_returned(request):
|
55
|
+
"""Check if cached collection is returned."""
|
56
|
+
config = request.getfixturevalue("config_mock")
|
57
|
+
index = request.getfixturevalue("index_params")
|
58
|
+
mock_collection = MagicMock()
|
59
|
+
collection_name = "test_cached"
|
60
|
+
|
61
|
+
set_collection_cache(collection_name, mock_collection)
|
62
|
+
|
63
|
+
result = collection_manager.ensure_collection_exists(
|
64
|
+
collection_name, config, index, has_gpu=False
|
65
|
+
)
|
66
|
+
|
67
|
+
assert result == mock_collection
|
68
|
+
clear_collection_cache(collection_name)
|
69
|
+
|
70
|
+
|
71
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
|
72
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
|
73
|
+
def test_create_new_collection(mock_utility, mock_collection_cls, request):
|
74
|
+
"""Check if new collection is created when it does not exist."""
|
75
|
+
config = request.getfixturevalue("config_mock")
|
76
|
+
index = request.getfixturevalue("index_params")
|
77
|
+
mock_utility.list_collections.return_value = []
|
78
|
+
|
79
|
+
mock_collection = MagicMock()
|
80
|
+
mock_collection_cls.return_value = mock_collection
|
81
|
+
mock_collection.indexes = [MagicMock(field_name="embedding")]
|
82
|
+
mock_collection.num_entities = 5
|
83
|
+
|
84
|
+
result = collection_manager.ensure_collection_exists(
|
85
|
+
"new_collection", config, index, has_gpu=True
|
86
|
+
)
|
87
|
+
|
88
|
+
assert mock_collection.create_index.called
|
89
|
+
assert mock_collection.load.called
|
90
|
+
assert result == mock_collection
|
91
|
+
|
92
|
+
|
93
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
|
94
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
|
95
|
+
def test_load_existing_collection(mock_utility, mock_collection_cls, request):
|
96
|
+
"""Test loading an existing collection."""
|
97
|
+
config = request.getfixturevalue("config_mock")
|
98
|
+
index = request.getfixturevalue("index_params")
|
99
|
+
mock_utility.list_collections.return_value = ["existing_collection"]
|
100
|
+
|
101
|
+
mock_collection = MagicMock()
|
102
|
+
mock_collection_cls.return_value = mock_collection
|
103
|
+
mock_collection.indexes = []
|
104
|
+
mock_collection.num_entities = 0
|
105
|
+
|
106
|
+
result = collection_manager.ensure_collection_exists(
|
107
|
+
"existing_collection", config, index, has_gpu=False
|
108
|
+
)
|
109
|
+
|
110
|
+
mock_collection.load.assert_called_once()
|
111
|
+
assert result == mock_collection
|
112
|
+
|
113
|
+
|
114
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
|
115
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
|
116
|
+
def test_debug_collection_state_failure(mock_utility, mock_collection_cls, request):
|
117
|
+
"""debug_collection_state should log but not raise on failure."""
|
118
|
+
config = request.getfixturevalue("config_mock")
|
119
|
+
index = request.getfixturevalue("index_params")
|
120
|
+
mock_utility.list_collections.return_value = ["bad_collection"]
|
121
|
+
|
122
|
+
mock_collection = MagicMock()
|
123
|
+
mock_collection_cls.return_value = mock_collection
|
124
|
+
mock_collection.indexes = []
|
125
|
+
mock_collection.num_entities = 10
|
126
|
+
|
127
|
+
mock_collection.schema = property(
|
128
|
+
lambda _: (_ for _ in ()).throw(Exception("bad schema"))
|
129
|
+
)
|
130
|
+
|
131
|
+
result = collection_manager.ensure_collection_exists(
|
132
|
+
"bad_collection", config, index, has_gpu=True
|
133
|
+
)
|
134
|
+
|
135
|
+
assert result == mock_collection
|
136
|
+
|
137
|
+
|
138
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.Collection")
|
139
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager.utility")
|
140
|
+
def test_ensure_collection_exception(mock_utility, mock_collection_cls, request):
|
141
|
+
"""ensure_collection_exists should raise on utility failure."""
|
142
|
+
config = request.getfixturevalue("config_mock")
|
143
|
+
index = request.getfixturevalue("index_params")
|
144
|
+
mock_utility.list_collections.side_effect = RuntimeError("milvus failure")
|
145
|
+
mock_collection_cls.return_value = MagicMock()
|
146
|
+
|
147
|
+
with pytest.raises(RuntimeError, match="milvus failure"):
|
148
|
+
collection_manager.ensure_collection_exists(
|
149
|
+
"fail_collection", config, index, has_gpu=False
|
150
|
+
)
|
@@ -0,0 +1,69 @@
|
|
1
|
+
"""Unit tests for PDF document processing utilities."""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
import pytest
|
5
|
+
|
6
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor import (
|
7
|
+
load_and_split_pdf,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
@pytest.fixture(name="base_args_params")
|
12
|
+
def _base_args_params():
|
13
|
+
"""base_args_params fixture to provide common arguments for tests."""
|
14
|
+
return {
|
15
|
+
"paper_id": "P123",
|
16
|
+
"pdf_url": "mock/path/to/paper.pdf",
|
17
|
+
"paper_metadata": {"Title": "Test Paper", "Author": "A. Researcher"},
|
18
|
+
"config": type("Config", (), {"chunk_size": 1000, "chunk_overlap": 200})(),
|
19
|
+
"metadata_fields": ["Author"],
|
20
|
+
"documents_dict": {},
|
21
|
+
}
|
22
|
+
|
23
|
+
|
24
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor.PyPDFLoader")
|
25
|
+
@patch(
|
26
|
+
"aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor."
|
27
|
+
"RecursiveCharacterTextSplitter"
|
28
|
+
)
|
29
|
+
def test_load_and_split_pdf_success(
|
30
|
+
mock_splitter_cls, mock_loader_cls, base_args_params
|
31
|
+
):
|
32
|
+
"""load_and_split_pdf should load and split PDF correctly."""
|
33
|
+
mock_doc = MagicMock()
|
34
|
+
mock_doc.metadata = {"page": 1}
|
35
|
+
mock_loader = MagicMock()
|
36
|
+
mock_loader.load.return_value = [mock_doc]
|
37
|
+
mock_loader_cls.return_value = mock_loader
|
38
|
+
|
39
|
+
mock_splitter = MagicMock()
|
40
|
+
chunk1 = MagicMock()
|
41
|
+
chunk1.metadata = {"page": 1}
|
42
|
+
mock_splitter.split_documents.return_value = [chunk1]
|
43
|
+
mock_splitter_cls.return_value = mock_splitter
|
44
|
+
|
45
|
+
chunks = load_and_split_pdf(**base_args_params)
|
46
|
+
|
47
|
+
assert len(chunks) == 1
|
48
|
+
assert "P123_0" in base_args_params["documents_dict"]
|
49
|
+
stored_chunk = base_args_params["documents_dict"]["P123_0"]
|
50
|
+
assert stored_chunk.metadata["paper_id"] == "P123"
|
51
|
+
assert stored_chunk.metadata["title"] == "Test Paper"
|
52
|
+
assert stored_chunk.metadata["chunk_id"] == 0
|
53
|
+
assert stored_chunk.metadata["page"] == 1
|
54
|
+
assert stored_chunk.metadata["source"] == base_args_params["pdf_url"]
|
55
|
+
assert stored_chunk.metadata["Author"] == "A. Researcher"
|
56
|
+
|
57
|
+
|
58
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor.PyPDFLoader")
|
59
|
+
def test_load_and_split_pdf_raises_if_config_missing(mock_loader_cls, base_args_params):
|
60
|
+
"""load_and_split_pdf should raise ValueError if config is None."""
|
61
|
+
mock_loader = MagicMock()
|
62
|
+
mock_loader.load.return_value = [MagicMock()]
|
63
|
+
mock_loader_cls.return_value = mock_loader
|
64
|
+
|
65
|
+
base_args_params["config"] = None
|
66
|
+
with pytest.raises(
|
67
|
+
ValueError, match="Configuration is required for text splitting in Vectorstore."
|
68
|
+
):
|
69
|
+
load_and_split_pdf(**base_args_params)
|
@@ -0,0 +1,75 @@
|
|
1
|
+
"""generate_answer tests for the PDF tool"""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock
|
4
|
+
import pytest
|
5
|
+
|
6
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import (
|
7
|
+
_build_context_and_sources,
|
8
|
+
generate_answer,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture(name="chunks_fixture")
|
13
|
+
def _chunks_fixture():
|
14
|
+
"""Fixture providing sample document chunks."""
|
15
|
+
doc1 = MagicMock()
|
16
|
+
doc1.page_content = "This is chunk one."
|
17
|
+
doc1.metadata = {"paper_id": "P1", "title": "Title 1", "page": 1}
|
18
|
+
|
19
|
+
doc2 = MagicMock()
|
20
|
+
doc2.page_content = "This is chunk two."
|
21
|
+
doc2.metadata = {"paper_id": "P1", "title": "Title 1", "page": 2}
|
22
|
+
|
23
|
+
doc3 = MagicMock()
|
24
|
+
doc3.page_content = "This is chunk three."
|
25
|
+
doc3.metadata = {"paper_id": "P2", "title": "Title 2", "page": 1}
|
26
|
+
|
27
|
+
return [doc1, doc2, doc3]
|
28
|
+
|
29
|
+
|
30
|
+
def test_build_context_and_sources_formatting(chunks_fixture):
|
31
|
+
"""_build_context_and_sources should format context and sources correctly."""
|
32
|
+
context, sources = _build_context_and_sources(chunks_fixture)
|
33
|
+
|
34
|
+
assert "[Document 1] From: 'Title 1' (ID: P1)" in context
|
35
|
+
assert "Page 1: This is chunk one." in context
|
36
|
+
assert "Page 2: This is chunk two." in context
|
37
|
+
assert "[Document 2] From: 'Title 2' (ID: P2)" in context
|
38
|
+
assert "Page 1: This is chunk three." in context
|
39
|
+
assert sources == {"P1", "P2"}
|
40
|
+
|
41
|
+
|
42
|
+
def test_generate_answer_success(chunks_fixture):
|
43
|
+
"""generate_answer should return formatted answer and sources."""
|
44
|
+
mock_llm = MagicMock()
|
45
|
+
mock_llm.invoke.return_value.content = "The answer is XYZ."
|
46
|
+
|
47
|
+
config = {
|
48
|
+
"prompt_template": "Answer the question based on the context."
|
49
|
+
"\n\n{context}\n\nQ: {question}\nA:"
|
50
|
+
}
|
51
|
+
|
52
|
+
result = generate_answer("What is the result?", chunks_fixture, mock_llm, config)
|
53
|
+
|
54
|
+
assert result["output_text"] == "The answer is XYZ."
|
55
|
+
assert len(result["sources"]) == 3
|
56
|
+
assert result["num_sources"] == 3
|
57
|
+
assert set(result["papers_used"]) == {"P1", "P2"}
|
58
|
+
|
59
|
+
|
60
|
+
def test_generate_answer_raises_for_none_config(chunks_fixture):
|
61
|
+
"""generate_answer should raise ValueError for None config."""
|
62
|
+
mock_llm = MagicMock()
|
63
|
+
with pytest.raises(
|
64
|
+
ValueError, match="Configuration for generate_answer is required."
|
65
|
+
):
|
66
|
+
generate_answer("Why?", chunks_fixture, mock_llm, config=None)
|
67
|
+
|
68
|
+
|
69
|
+
def test_generate_answer_raises_for_missing_template(chunks_fixture):
|
70
|
+
"""generate_answer should raise ValueError for missing prompt_template in config."""
|
71
|
+
mock_llm = MagicMock()
|
72
|
+
with pytest.raises(
|
73
|
+
ValueError, match="The prompt_template is missing from the configuration."
|
74
|
+
):
|
75
|
+
generate_answer("Why?", chunks_fixture, mock_llm, config={})
|