aiagents4pharma 1.40.0__py3-none-any.whl → 1.41.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
- aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
- aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
- aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
- aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
- aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
- aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
- aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
- aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
- aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
- aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
- aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
- aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
- aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
- aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
- aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
- aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
- {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/METADATA +27 -115
- {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/RECORD +45 -23
- aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
- {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/WHEEL +0 -0
- {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/licenses/LICENSE +0 -0
- {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,140 @@
|
|
1
|
+
"""gpu detection and index configuration tests."""
|
2
|
+
|
3
|
+
import subprocess
|
4
|
+
from types import SimpleNamespace
|
5
|
+
from unittest.mock import MagicMock, patch
|
6
|
+
|
7
|
+
|
8
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection import (
|
9
|
+
detect_nvidia_gpu,
|
10
|
+
get_optimal_index_config,
|
11
|
+
log_index_configuration,
|
12
|
+
)
|
13
|
+
|
14
|
+
# === detect_nvidia_gpu ===
|
15
|
+
|
16
|
+
|
17
|
+
def test_detect_nvidia_gpu_force_cpu_from_config():
    """A config whose gpu_detection.force_cpu_mode is True must yield False."""
    gpu_section = SimpleNamespace(force_cpu_mode=True)
    cfg = SimpleNamespace(gpu_detection=gpu_section)

    detected = detect_nvidia_gpu(cfg)

    assert detected is False
|
21
|
+
|
22
|
+
|
23
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.subprocess.run")
def test_detect_nvidia_gpu_success(mock_run):
    """Exit code 0 with GPU names on stdout must be reported as True."""
    completed = MagicMock(returncode=0, stdout="NVIDIA A100\nNVIDIA RTX 3090")
    mock_run.return_value = completed

    gpu_found = detect_nvidia_gpu()

    assert gpu_found is True
    # The patched subprocess.run has to be invoked exactly one time.
    mock_run.assert_called_once()
|
32
|
+
|
33
|
+
|
34
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.subprocess.run")
def test_detect_nvidia_gpu_no_output(mock_run):
    """Exit code 0 with an empty stdout must be reported as False."""
    empty_result = MagicMock(returncode=0, stdout="")
    mock_run.return_value = empty_result

    gpu_found = detect_nvidia_gpu()

    assert gpu_found is False
|
40
|
+
|
41
|
+
|
42
|
+
# === get_optimal_index_config ===
|
43
|
+
|
44
|
+
|
45
|
+
def test_get_optimal_index_config_gpu():
    """With has_gpu=True the index type must be GPU_CAGRA with GPU params."""
    gpu_index, gpu_search = get_optimal_index_config(has_gpu=True, embedding_dim=768)

    assert gpu_index["index_type"] == "GPU_CAGRA"

    build_params = gpu_index["params"]
    assert "cache_dataset_on_device" in build_params

    query_params = gpu_search["params"]
    assert query_params["search_width"] == 16
|
54
|
+
|
55
|
+
|
56
|
+
def test_get_optimal_index_config_cpu():
    """With has_gpu=False the index type must be IVF_FLAT with CPU params."""
    cpu_index, cpu_search = get_optimal_index_config(has_gpu=False, embedding_dim=768)

    assert cpu_index["index_type"] == "IVF_FLAT"

    build_params = cpu_index["params"]
    assert build_params["nlist"] == 96  # expected value equals 768 / 8

    query_params = cpu_search["params"]
    assert query_params["nprobe"] == 16
|
65
|
+
|
66
|
+
|
67
|
+
# === log_index_configuration ===
|
68
|
+
|
69
|
+
|
70
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.logger")
def test_log_index_configuration_logs_all(mock_logger):
    """Logging an IVF_FLAT configuration must emit at least five info records."""
    search_cfg = {"metric_type": "COSINE", "params": {"nprobe": 16}}
    index_cfg = {
        "index_type": "IVF_FLAT",
        "metric_type": "COSINE",
        "params": {"nlist": 128},
    }

    log_index_configuration(index_cfg, search_cfg)

    info_calls = mock_logger.info.call_count
    assert info_calls >= 5
|
83
|
+
|
84
|
+
|
85
|
+
def test_get_optimal_index_config_gpu_without_cosine():
    """With use_cosine=False the GPU index and search metrics must both be IP."""
    gpu_index, gpu_search = get_optimal_index_config(
        has_gpu=True,
        embedding_dim=768,
        use_cosine=False,
    )

    assert gpu_index["index_type"] == "GPU_CAGRA"
    # Both halves of the configuration have to agree on the metric.
    assert gpu_index["metric_type"] == "IP"
    assert gpu_search["metric_type"] == "IP"
|
94
|
+
|
95
|
+
|
96
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.logger")
def test_log_index_configuration_logs_cosine_simulation_note(mock_logger):
    """A GPU_CAGRA/IP config logged with use_cosine=True must mention the COSINE note."""
    search_cfg = {"metric_type": "IP", "params": {"search_width": 16}}
    index_cfg = {
        "index_type": "GPU_CAGRA",
        "metric_type": "IP",
        "params": {"itopk_size": 128},
    }

    log_index_configuration(index_cfg, search_cfg, use_cosine=True)

    # Collect the first positional argument of every logger.info call.
    logged = [str(entry.args[0]) for entry in mock_logger.info.call_args_list]
    note_found = False
    for text in logged:
        if "simulate COSINE for GPU" in text:
            note_found = True
            break
    assert note_found
|
113
|
+
|
114
|
+
|
115
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.logger")
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.subprocess.run")
def test_detect_nvidia_gpu_timeout_raises_false(mock_run, mock_logger):
    """A subprocess.TimeoutExpired from subprocess.run must give False plus an info log."""
    # Make the patched subprocess.run raise a timeout.
    timeout_error = subprocess.TimeoutExpired(cmd="nvidia-smi", timeout=10)
    mock_run.side_effect = timeout_error

    outcome = detect_nvidia_gpu()

    assert outcome is False
    mock_logger.info.assert_called_with(
        "NVIDIA GPU detection failed: %s", timeout_error
    )
|
127
|
+
|
128
|
+
|
129
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.logger")
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.gpu_detection.subprocess.run")
def test_detect_nvidia_gpu_file_not_found_raises_false(mock_run, mock_logger):
    """A FileNotFoundError from subprocess.run must give False plus an info log."""
    # Make the patched subprocess.run behave as if nvidia-smi were absent.
    missing_binary = FileNotFoundError("nvidia-smi not found")
    mock_run.side_effect = missing_binary

    outcome = detect_nvidia_gpu()

    assert outcome is False
    mock_logger.info.assert_called_with(
        "NVIDIA GPU detection failed: %s", missing_binary
    )
|
@@ -0,0 +1,116 @@
|
|
1
|
+
"""paper_loader tests for the load_all_papers function."""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
|
7
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.paper_loader import (
|
8
|
+
load_all_papers,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture
def articles():
    """Provide three sample articles: p1 and p2 carry a pdf_url, p3 does not."""
    first_with_pdf = {"pdf_url": "http://example.com/p1.pdf", "title": "Paper 1"}
    second_with_pdf = {"pdf_url": "http://example.com/p2.pdf", "title": "Paper 2"}
    without_pdf = {"title": "No PDF paper"}
    return {"p1": first_with_pdf, "p2": second_with_pdf, "p3": without_pdf}
|
20
|
+
|
21
|
+
|
22
|
+
@pytest.fixture
def mock_vector_store():
    """Provide a MagicMock vector store pre-populated with the attributes set below."""
    store_attributes = {
        "loaded_papers": {"p1"},
        "paper_metadata": {},
        "documents": {},
        "metadata_fields": ["title"],
        "config": {"embedding_batch_size": 1234},
        "has_gpu": False,
        "vector_store": MagicMock(),
    }
    return MagicMock(**store_attributes)
|
34
|
+
|
35
|
+
|
36
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.paper_loader.add_papers_batch")
def test_all_papers_loaded_returns_early(mock_batch, request):
    """No batch insert may happen when every article id is already in loaded_papers."""
    papers = request.getfixturevalue("articles")
    store = request.getfixturevalue("mock_vector_store")
    # Mark every article as already loaded.
    store.loaded_papers = set(papers)

    load_kwargs = {
        "vector_store": store,
        "articles": papers,
        "call_id": "test_call",
        "config": {"embedding_batch_size": 1000},
        "has_gpu": False,
    }
    load_all_papers(**load_kwargs)

    mock_batch.assert_not_called()
|
52
|
+
|
53
|
+
|
54
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.paper_loader.add_papers_batch")
def test_skips_papers_without_pdf(mock_batch, request):
    """Only p1 may be queued: p2 is marked loaded and p3 has no pdf_url."""
    papers = request.getfixturevalue("articles")
    store = request.getfixturevalue("mock_vector_store")
    store.loaded_papers = {"p2"}

    load_kwargs = {
        "vector_store": store,
        "articles": papers,
        "call_id": "test_call",
        "config": {"embedding_batch_size": 1000},
        "has_gpu": False,
    }
    load_all_papers(**load_kwargs)

    assert mock_batch.call_count == 1

    _, batch_kwargs = mock_batch.call_args
    queued = batch_kwargs["papers_to_add"]
    assert len(queued) == 1

    first_entry = queued[0]
    assert first_entry[0] == "p1"
|
73
|
+
|
74
|
+
|
75
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.paper_loader.add_papers_batch")
def test_gpu_parameters_used(mock_batch, request):
    """With has_gpu=True the batch call must receive the GPU flag, batch size and workers."""
    papers = request.getfixturevalue("articles")
    store = request.getfixturevalue("mock_vector_store")
    store.loaded_papers = set()
    store.has_gpu = True

    load_kwargs = {
        "vector_store": store,
        "articles": papers,
        "call_id": "gpu_call",
        "config": {"embedding_batch_size": 2048},
        "has_gpu": True,
    }
    load_all_papers(**load_kwargs)

    _, batch_kwargs = mock_batch.call_args
    assert batch_kwargs["has_gpu"] is True
    assert batch_kwargs["batch_size"] == 2048
    assert batch_kwargs["max_workers"] >= 4
|
95
|
+
|
96
|
+
|
97
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.paper_loader.add_papers_batch")
def test_cpu_parameters_used(mock_batch, request):
    """With has_gpu=False the batch call must receive the CPU flag, batch size and workers."""
    papers = request.getfixturevalue("articles")
    store = request.getfixturevalue("mock_vector_store")
    store.loaded_papers = set()
    store.has_gpu = False

    load_kwargs = {
        "vector_store": store,
        "articles": papers,
        "call_id": "cpu_call",
        "config": {"embedding_batch_size": 512},
        "has_gpu": False,
    }
    load_all_papers(**load_kwargs)

    _, batch_kwargs = mock_batch.call_args
    assert batch_kwargs["has_gpu"] is False
    assert batch_kwargs["batch_size"] == 512
    assert batch_kwargs["max_workers"] >= 3
|
@@ -0,0 +1,98 @@
|
|
1
|
+
"""pdf rag pipeline tests."""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
import pytest
|
5
|
+
from langchain_core.documents import Document
|
6
|
+
|
7
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline import (
|
8
|
+
retrieve_and_rerank_chunks,
|
9
|
+
)
|
10
|
+
|
11
|
+
|
12
|
+
@pytest.fixture(name="base_config_fixture")
def _base_config_fixture():
    """Provide a MagicMock config whose get() serves two fixed keys and top_k_chunks is 5."""
    known_values = {
        "initial_retrieval_k": 120,
        "mmr_diversity": 0.7,
    }

    def _lookup(key, default=None):
        # Mirror dict.get so unknown keys fall back to the supplied default.
        return known_values.get(key, default)

    cfg = MagicMock()
    cfg.get.side_effect = _lookup
    cfg.top_k_chunks = 5
    return cfg
|
22
|
+
|
23
|
+
|
24
|
+
@pytest.fixture(name="mock_docs_fixture")
def _mock_docs_fixture():
    """Provide ten Document chunks whose paper_id alternates between P0 and P1."""
    docs = []
    for i in range(10):
        chunk = Document(page_content=f"chunk {i}", metadata={"paper_id": f"P{i % 2}"})
        docs.append(chunk)
    return docs
|
31
|
+
|
32
|
+
|
33
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.rerank_chunks")
@patch(
    "aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.retrieve_relevant_chunks"
)
def test_rag_pipeline_gpu_path(
    mock_retrieve, mock_rerank, base_config_fixture, mock_docs_fixture
):
    """With has_gpu=True the pipeline must return the reranked chunks, calling each stage once."""
    reranked = mock_docs_fixture[:5]
    mock_retrieve.return_value = mock_docs_fixture
    mock_rerank.return_value = reranked

    pipeline_kwargs = {
        "vector_store": MagicMock(),
        "query": "Explain AI.",
        "config": base_config_fixture,
        "call_id": "gpu_test",
        "has_gpu": True,
    }
    outcome = retrieve_and_rerank_chunks(**pipeline_kwargs)

    assert outcome == reranked
    mock_retrieve.assert_called_once()
    mock_rerank.assert_called_once()
|
55
|
+
|
56
|
+
|
57
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.rerank_chunks")
@patch(
    "aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.retrieve_relevant_chunks"
)
def test_rag_pipeline_cpu_path(
    mock_retrieve, mock_rerank, base_config_fixture, mock_docs_fixture
):
    """With has_gpu=False the pipeline must return the reranked chunks, calling each stage once."""
    reranked = mock_docs_fixture[:5]
    mock_retrieve.return_value = mock_docs_fixture
    mock_rerank.return_value = reranked

    pipeline_kwargs = {
        "vector_store": MagicMock(),
        "query": "Explain quantum physics.",
        "config": base_config_fixture,
        "call_id": "cpu_test",
        "has_gpu": False,
    }
    outcome = retrieve_and_rerank_chunks(**pipeline_kwargs)

    assert outcome == reranked
    mock_retrieve.assert_called_once()
    mock_rerank.assert_called_once()
|
79
|
+
|
80
|
+
|
81
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.rerank_chunks")
@patch(
    "aiagents4pharma.talk2scholars.tools.pdf.utils.rag_pipeline.retrieve_relevant_chunks"
)
def test_rag_pipeline_empty_results(mock_retrieve, mock_rerank, base_config_fixture):
    """An empty retrieval must produce an empty result without invoking the reranker."""
    mock_retrieve.return_value = []

    pipeline_kwargs = {
        "vector_store": MagicMock(),
        "query": "No match?",
        "config": base_config_fixture,
        "call_id": "empty_test",
        "has_gpu": False,
    }
    outcome = retrieve_and_rerank_chunks(**pipeline_kwargs)

    assert outcome == []
    mock_rerank.assert_not_called()
|
@@ -0,0 +1,197 @@
|
|
1
|
+
"""retrieve_chunks for PDF tool tests"""
|
2
|
+
|
3
|
+
from unittest.mock import MagicMock, patch
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
from langchain_core.documents import Document
|
7
|
+
|
8
|
+
from aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks import (
|
9
|
+
retrieve_relevant_chunks,
|
10
|
+
retrieve_relevant_chunks_with_scores,
|
11
|
+
)
|
12
|
+
|
13
|
+
|
14
|
+
@pytest.fixture
def mock_vector_store():
    """Provide a bare MagicMock standing in for a vector store."""
    store = MagicMock()
    return store
|
18
|
+
|
19
|
+
|
20
|
+
@pytest.fixture
def mock_chunks():
    """Provide five Document chunks whose paper_id alternates between P0 and P1."""
    chunks = []
    for i in range(5):
        chunk = Document(page_content=f"chunk {i}", metadata={"paper_id": f"P{i%2}"})
        chunks.append(chunk)
    return chunks
|
27
|
+
|
28
|
+
|
29
|
+
@pytest.fixture
def mock_scored_chunks():
    """Provide four (Document, score) pairs with scores 0.9, 0.8, 0.4 and 0.95."""
    scores = [0.9, 0.8, 0.4, 0.95]
    scored = []
    for i, score in enumerate(scores):
        doc = Document(page_content=f"chunk {i}", metadata={})
        scored.append((doc, score))
    return scored
|
36
|
+
|
37
|
+
|
38
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_cpu_success(mock_logger, request):
    """With has_gpu=False the MMR search result must be returned after a single call."""
    store = request.getfixturevalue("mock_vector_store")
    expected = request.getfixturevalue("mock_chunks")
    store.has_gpu = False
    mock_logger.debug = MagicMock()
    mmr_search = store.max_marginal_relevance_search
    mmr_search.return_value = expected

    found = retrieve_relevant_chunks(store, query="AI", top_k=5)

    assert found == expected
    mmr_search.assert_called_once()
|
51
|
+
|
52
|
+
|
53
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_gpu_success(mock_logger, request):
    """With has_gpu=True the MMR search result must be returned after a single call."""
    store = request.getfixturevalue("mock_vector_store")
    expected = request.getfixturevalue("mock_chunks")
    store.has_gpu = True
    mock_logger.debug = MagicMock()
    mmr_search = store.max_marginal_relevance_search
    mmr_search.return_value = expected

    found = retrieve_relevant_chunks(store, query="AI", top_k=5)

    assert found == expected
    mmr_search.assert_called_once()
|
66
|
+
|
67
|
+
|
68
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_filter(mock_logger, request):
    """Passing paper_ids must forward a paper_id filter as a keyword argument."""
    store = request.getfixturevalue("mock_vector_store")
    expected = request.getfixturevalue("mock_chunks")
    store.has_gpu = False
    mock_logger.debug = MagicMock()
    mmr_search = store.max_marginal_relevance_search
    mmr_search.return_value = expected

    found = retrieve_relevant_chunks(
        store,
        query="filter test",
        paper_ids=["P1"],
        top_k=3,
    )

    assert found == expected

    positional, keyword = mmr_search.call_args
    # The search must be invoked with keyword arguments only.
    assert len(positional) == 0
    assert keyword["filter"] == {"paper_id": ["P1"]}
|
84
|
+
|
85
|
+
|
86
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_no_vector_store(mock_logger):
    """A None vector store must give an empty list and an error log."""
    outcome = retrieve_relevant_chunks(vector_store=None, query="irrelevant")

    assert outcome == []
    mock_logger.error.assert_called_with("Vector store is not initialized")
|
92
|
+
|
93
|
+
|
94
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_scores_no_vector_store(mock_logger):
    """The scored variant must also give an empty list and an error log for a None store."""
    outcome = retrieve_relevant_chunks_with_scores(vector_store=None, query="none")

    assert outcome == []
    mock_logger.error.assert_called_with("Vector store is not initialized")
|
100
|
+
|
101
|
+
|
102
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_default_search_params(mock_logger, request):
    """A store lacking search_params must trigger the default-parameters debug message."""
    store = request.getfixturevalue("mock_vector_store")
    expected = request.getfixturevalue("mock_chunks")
    store.has_gpu = False
    # Deleting the attribute makes later access on the mock raise AttributeError.
    del store.search_params
    store.max_marginal_relevance_search.return_value = expected

    found = retrieve_relevant_chunks(
        store,
        query="default search param test",
        top_k=5,
    )

    assert found == expected
    mock_logger.debug.assert_any_call(
        "Using default search parameters (no hardware optimization)"
    )
|
121
|
+
|
122
|
+
|
123
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_scores_paper_filter(mock_logger, request):
    """The scored variant must forward paper_ids as a paper_id filter keyword."""
    store = request.getfixturevalue("mock_vector_store")
    scored = request.getfixturevalue("mock_scored_chunks")
    scored_search = store.similarity_search_with_score
    scored_search.return_value = scored
    mock_logger.debug = MagicMock()

    found = retrieve_relevant_chunks_with_scores(
        vector_store=store,
        query="filtered score",
        paper_ids=["P123"],
        top_k=5,
        score_threshold=0.0,
    )

    assert isinstance(found, list)

    _, keyword = scored_search.call_args
    assert keyword["filter"] == {"paper_id": ["P123"]}
|
143
|
+
|
144
|
+
|
145
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_scores_gpu_debug(mock_logger, request):
    """With has_gpu=True the scored search must return all pairs and log the GPU message."""
    store = request.getfixturevalue("mock_vector_store")
    scored = request.getfixturevalue("mock_scored_chunks")
    store.has_gpu = True
    store.similarity_search_with_score.return_value = scored
    mock_logger.debug = MagicMock()

    found = retrieve_relevant_chunks_with_scores(
        vector_store=store,
        query="gpu test",
        top_k=4,
        score_threshold=0.0,
    )

    # The threshold is 0.0, so every scored pair is expected back.
    assert found == scored
    mock_logger.debug.assert_called_with("GPU-accelerated similarity search enabled")
|
161
|
+
|
162
|
+
|
163
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_scores_cpu_debug(mock_logger, request):
    """With has_gpu=False the scored search must return all pairs and log the CPU message."""
    store = request.getfixturevalue("mock_vector_store")
    scored = request.getfixturevalue("mock_scored_chunks")
    store.has_gpu = False
    store.similarity_search_with_score.return_value = scored
    mock_logger.debug = MagicMock()

    found = retrieve_relevant_chunks_with_scores(
        vector_store=store,
        query="cpu test",
        top_k=2,
        score_threshold=0.0,
    )

    assert found == scored
    mock_logger.debug.assert_called_with("Standard CPU similarity search")
|
178
|
+
|
179
|
+
|
180
|
+
@patch("aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.logger")
def test_retrieve_chunks_with_scores_not_implemented(mock_logger, request):
    """A store without similarity_search_with_score must raise NotImplementedError."""
    store = request.getfixturevalue("mock_vector_store")
    store.has_gpu = True
    # Deleting the method makes later access on the mock raise AttributeError.
    del store.similarity_search_with_score
    mock_logger.debug = MagicMock()

    with pytest.raises(NotImplementedError) as raised:
        retrieve_relevant_chunks_with_scores(
            vector_store=store,
            query="fail test",
            top_k=1,
            score_threshold=0.0,
        )

    error_text = str(raised.value)
    assert "Vector store does not support similarity_search_with_score" in error_text
    mock_logger.debug.assert_called_with("GPU-accelerated similarity search enabled")
|