aiagents4pharma 1.39.0__py3-none-any.whl → 1.39.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48):
  1. aiagents4pharma/talk2scholars/agents/main_agent.py +7 -7
  2. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +88 -12
  3. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +5 -0
  4. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +5 -0
  5. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +1 -20
  6. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +1 -26
  7. aiagents4pharma/talk2scholars/configs/tools/download_arxiv_paper/default.yaml +4 -0
  8. aiagents4pharma/talk2scholars/configs/tools/download_biorxiv_paper/default.yaml +2 -0
  9. aiagents4pharma/talk2scholars/configs/tools/download_medrxiv_paper/default.yaml +2 -0
  10. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +22 -0
  11. aiagents4pharma/talk2scholars/tests/test_main_agent.py +20 -2
  12. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +28 -0
  13. aiagents4pharma/talk2scholars/tests/test_paper_download_tools.py +107 -29
  14. aiagents4pharma/talk2scholars/tests/test_pdf_agent.py +2 -3
  15. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +194 -543
  16. aiagents4pharma/talk2scholars/tests/test_s2_agent.py +2 -2
  17. aiagents4pharma/talk2scholars/tests/{test_s2_display.py → test_s2_display_dataframe.py} +2 -3
  18. aiagents4pharma/talk2scholars/tests/test_s2_query_dataframe.py +201 -0
  19. aiagents4pharma/talk2scholars/tests/test_s2_retrieve.py +7 -6
  20. aiagents4pharma/talk2scholars/tests/test_s2_utils_ext_ids.py +413 -0
  21. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +140 -0
  22. aiagents4pharma/talk2scholars/tests/test_zotero_agent.py +0 -1
  23. aiagents4pharma/talk2scholars/tests/test_zotero_read.py +16 -18
  24. aiagents4pharma/talk2scholars/tools/paper_download/download_arxiv_input.py +92 -37
  25. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +73 -575
  26. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +10 -0
  27. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +77 -0
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +83 -0
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +125 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +162 -0
  32. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +33 -10
  33. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +39 -16
  34. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +124 -10
  35. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +49 -17
  36. aiagents4pharma/talk2scholars/tools/s2/search.py +39 -16
  37. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +34 -16
  38. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +49 -16
  39. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +51 -16
  40. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +50 -17
  41. {aiagents4pharma-1.39.0.dist-info → aiagents4pharma-1.39.2.dist-info}/METADATA +58 -105
  42. {aiagents4pharma-1.39.0.dist-info → aiagents4pharma-1.39.2.dist-info}/RECORD +45 -32
  43. aiagents4pharma/talk2scholars/tests/test_llm_main_integration.py +0 -89
  44. aiagents4pharma/talk2scholars/tests/test_routing_logic.py +0 -74
  45. aiagents4pharma/talk2scholars/tests/test_s2_query.py +0 -95
  46. {aiagents4pharma-1.39.0.dist-info → aiagents4pharma-1.39.2.dist-info}/WHEEL +0 -0
  47. {aiagents4pharma-1.39.0.dist-info → aiagents4pharma-1.39.2.dist-info}/licenses/LICENSE +0 -0
  48. {aiagents4pharma-1.39.0.dist-info → aiagents4pharma-1.39.2.dist-info}/top_level.txt +0 -0
@@ -8,20 +8,27 @@ from unittest.mock import MagicMock, patch
8
8
 
9
9
  from langchain_core.documents import Document
10
10
  from langchain_core.embeddings import Embeddings
11
- from langchain_core.messages import ToolMessage
12
11
 
13
- import aiagents4pharma.talk2scholars.tools.pdf.question_and_answer as qa_module
14
12
  from aiagents4pharma.talk2scholars.tools.pdf.question_and_answer import (
15
- Vectorstore,
16
- generate_answer,
17
13
  question_and_answer,
18
14
  )
15
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import (
16
+ generate_answer,
17
+ load_hydra_config,
18
+ )
19
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker import (
20
+ rank_papers_by_query,
21
+ )
22
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks import (
23
+ retrieve_relevant_chunks,
24
+ )
25
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store import Vectorstore
19
26
 
20
27
 
21
28
  class TestQuestionAndAnswerTool(unittest.TestCase):
22
29
  """tests for question_and_answer tool functionality."""
23
30
 
24
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.PyPDFLoader")
31
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store.PyPDFLoader")
25
32
  def test_add_paper(self, mock_pypdf_loader):
26
33
  """test adding a paper to the vector store."""
27
34
  # Mock the PDF loader
@@ -32,7 +39,10 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
32
39
  mock_embedding_model = MagicMock(spec=Embeddings)
33
40
 
34
41
  # Initialize Vectorstore
35
- vector_store = Vectorstore(embedding_model=mock_embedding_model)
42
+ vector_store = Vectorstore(
43
+ embedding_model=mock_embedding_model,
44
+ config=load_hydra_config(),
45
+ )
36
46
 
37
47
  # Add a paper
38
48
  vector_store.add_paper(
@@ -44,7 +54,7 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
44
54
  # Check if the paper was added
45
55
  self.assertIn("test_paper_0", vector_store.documents)
46
56
 
47
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.PyPDFLoader")
57
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store.PyPDFLoader")
48
58
  def test_add_paper_already_loaded(self, mock_pypdf_loader):
49
59
  """Test that adding a paper that is already loaded does not re-load or add new documents."""
50
60
  # Mock the PDF loader (it should not be used when the paper is already loaded)
@@ -55,7 +65,10 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
55
65
  mock_embedding_model = MagicMock(spec=Embeddings)
56
66
 
57
67
  # Initialize Vectorstore
58
- vector_store = Vectorstore(embedding_model=mock_embedding_model)
68
+ vector_store = Vectorstore(
69
+ embedding_model=mock_embedding_model,
70
+ config=load_hydra_config(),
71
+ )
59
72
 
60
73
  # Simulate the paper already being loaded.
61
74
  vector_store.loaded_papers.add("test_paper")
@@ -98,18 +111,16 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
98
111
  self.assertIsNotNone(vector_store.vector_store)
99
112
 
100
113
  @patch(
101
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.load_hydra_config"
114
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.nvidia_nim_reranker.NVIDIARerank"
102
115
  )
103
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.NVIDIARerank")
104
- def test_rank_papers_by_query(self, mock_nvidia_rerank, mock_load_config):
116
+ def test_rank_papers_by_query(self, mock_nvidia_rerank):
105
117
  """test ranking papers by query."""
106
- # Create a mock config object with attributes
107
- mock_config = MagicMock()
108
- mock_config.reranker.model = "nvidia/llama-3.2-nv-rerankqa-1b-v2"
109
- mock_config.reranker.api_key = "dummy_api_key"
110
-
111
- # Patch load_hydra_config to return the mock config object
112
- mock_load_config.return_value = mock_config
118
+ # Create a mock config object with the top_k_papers attribute
119
+ # Create a mock config object with required reranker settings and top_k_papers
120
+ mock_config = SimpleNamespace(
121
+ reranker=SimpleNamespace(model="dummy", api_key="key"),
122
+ top_k_papers=1,
123
+ )
113
124
 
114
125
  # Mock the re-ranker instance.
115
126
  mock_reranker = mock_nvidia_rerank.return_value
@@ -130,14 +141,16 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
130
141
  page_content="Test content", metadata={"paper_id": "test_paper"}
131
142
  )
132
143
 
133
- # Rank papers.
134
- ranked_papers = vector_store.rank_papers_by_query(query="test query")
144
+ # Rank papers using the standalone function
145
+ ranked_papers = rank_papers_by_query(
146
+ vector_store, "test query", mock_config, top_k=mock_config.top_k_papers
147
+ )
135
148
 
136
149
  # Check if the ranking is correct (updated expectation: a list of paper IDs)
137
150
  self.assertEqual(ranked_papers[0], "test_paper")
138
151
 
139
152
  @patch(
140
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.maximal_marginal_relevance"
153
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.retrieve_chunks.maximal_marginal_relevance"
141
154
  )
142
155
  def test_retrieve_relevant_chunks(self, mock_mmr):
143
156
  """Test retrieving relevant chunks without filters."""
@@ -150,14 +163,17 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
150
163
  vector_store.vector_store = True
151
164
  # Add a document chunk with required metadata including chunk_id
152
165
  vector_store.documents["test_doc"] = Document(
153
- page_content="Test content", metadata={"paper_id": "test_paper", "chunk_id": 0}
166
+ page_content="Test content",
167
+ metadata={"paper_id": "test_paper", "chunk_id": 0},
154
168
  )
155
169
 
156
- results = vector_store.retrieve_relevant_chunks(query="test query")
170
+ results = retrieve_relevant_chunks(vector_store, query="test query")
157
171
  assert len(results) == 1
158
172
  assert results[0].metadata["paper_id"] == "test_paper"
159
173
 
160
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.BaseChatModel")
174
+ @patch(
175
+ "aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer.BaseChatModel"
176
+ )
161
177
  def test_generate_answer(self, mock_base_chat_model):
162
178
  """test generating an answer."""
163
179
  # Mock the language model
@@ -169,17 +185,19 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
169
185
  page_content="Test content", metadata={"paper_id": "test_paper"}
170
186
  )
171
187
 
172
- # Generate answer
188
+ # Generate answer with dummy config
189
+ config = {"prompt_template": "{context} {question}"}
173
190
  result = generate_answer(
174
191
  question="What is the test?",
175
192
  retrieved_chunks=[mock_document],
176
193
  llm_model=mock_llm,
194
+ config=config,
177
195
  )
178
196
 
179
197
  # Check if the answer is generated correctly
180
198
  self.assertEqual(result["output_text"], "Generated answer")
181
199
 
182
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.PyPDFLoader")
200
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store.PyPDFLoader")
183
201
  def test_add_paper_exception_handling(self, mock_pypdf_loader):
184
202
  """Test exception handling when adding a paper."""
185
203
  # Mock the PDF loader to raise an exception.
@@ -203,6 +221,31 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
203
221
  # Verify that the exception message is as expected.
204
222
  self.assertEqual(str(context.exception), "Loading error")
205
223
 
224
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store.PyPDFLoader")
225
+ def test_add_paper_missing_config(self, mock_pypdf_loader):
226
+ """Test that add_paper raises ValueError when config is missing."""
227
+ # Mock the PDF loader to return a single page
228
+ mock_loader = mock_pypdf_loader.return_value
229
+ mock_loader.load.return_value = [Document(page_content="Page content")]
230
+
231
+ # Mock embedding model
232
+ mock_embedding_model = MagicMock(spec=Embeddings)
233
+
234
+ # Initialize Vectorstore without config (default None)
235
+ vector_store = Vectorstore(embedding_model=mock_embedding_model)
236
+
237
+ # Attempt to add a paper and expect a configuration error
238
+ with self.assertRaises(ValueError) as cm:
239
+ vector_store.add_paper(
240
+ paper_id="test_paper",
241
+ pdf_url="http://example.com/test.pdf",
242
+ paper_metadata={"Title": "Test Paper"},
243
+ )
244
+ self.assertEqual(
245
+ str(cm.exception),
246
+ "Configuration is required for text splitting in Vectorstore.",
247
+ )
248
+
206
249
  def test_build_vector_store_no_documents(self):
207
250
  """Test building vector store with no documents results in an unchanged vector_store."""
208
251
  # Mock embedding model
@@ -251,7 +294,7 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
251
294
  vector_store = Vectorstore(embedding_model=mock_embedding_model)
252
295
 
253
296
  # Attempt to retrieve relevant chunks (vector_store.vector_store is None)
254
- result = vector_store.retrieve_relevant_chunks(query="test query")
297
+ result = retrieve_relevant_chunks(vector_store, query="test query")
255
298
 
256
299
  # Verify that an empty list is returned since the vector store is not built.
257
300
  self.assertEqual(result, [])
@@ -275,364 +318,16 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
275
318
 
276
319
  # Call retrieve_relevant_chunks with specific paper_ids
277
320
  paper_ids = ["paper1"]
278
- result = vector_store.retrieve_relevant_chunks(
279
- query="test query", paper_ids=paper_ids
321
+ # Use module-level retrieve_relevant_chunks
322
+
323
+ result = retrieve_relevant_chunks(
324
+ vector_store, query="test query", paper_ids=paper_ids
280
325
  )
281
326
 
282
327
  # Verify that an empty list is returned since the vector store is not built.
283
328
  self.assertEqual(result, [])
284
329
 
285
- @patch(
286
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.generate_answer"
287
- )
288
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.Vectorstore")
289
- def test_question_and_answer_success(self, mock_vectorstore, mock_generate_answer):
290
- """test the main functionality of the question_and_answer tool."""
291
- # Create a dummy document to simulate a retrieved chunk
292
- dummy_doc = Document(
293
- page_content="Dummy content",
294
- metadata={"paper_id": "paper1", "title": "Paper One", "page": 1},
295
- )
296
-
297
- # Configure generate_answer to return a dummy answer result
298
- mock_generate_answer.return_value = {
299
- "output_text": "Test Answer",
300
- "papers_used": ["paper1"],
301
- }
302
-
303
- # Create a dummy embedding model
304
- dummy_embedding_model = MagicMock(spec=Embeddings)
305
-
306
- # Create a dummy vector store and simulate that it is already built and has the paper loaded
307
- dummy_vector_store = Vectorstore(embedding_model=dummy_embedding_model)
308
- dummy_vector_store.vector_store = (
309
- True # Simulate that the vector store is built
310
- )
311
- dummy_vector_store.loaded_papers.add("paper1")
312
- dummy_vector_store.retrieve_relevant_chunks = MagicMock(
313
- return_value=[dummy_doc]
314
- )
315
- # Return our dummy vector store when Vectorstore() is instantiated
316
- mock_vectorstore.return_value = dummy_vector_store
317
-
318
- # Create a dummy LLM model
319
- dummy_llm_model = MagicMock()
320
-
321
- # Construct the state with required keys
322
- state = {
323
- "article_data": {
324
- "paper1": {
325
- "pdf_url": "http://example.com/paper1.pdf",
326
- "Title": "Paper One",
327
- }
328
- },
329
- "text_embedding_model": dummy_embedding_model,
330
- "llm_model": dummy_llm_model,
331
- "vector_store": dummy_vector_store,
332
- }
333
-
334
- input_data = {
335
- "question": "What is the content?",
336
- "paper_ids": ["paper1"],
337
- "use_all_papers": False,
338
- "tool_call_id": "test_tool_call",
339
- "state": state,
340
- }
341
- result = question_and_answer.run(input_data)
342
-
343
- # Verify that generate_answer was called with expected arguments
344
- mock_generate_answer.assert_called_once()
345
- args, _ = mock_generate_answer.call_args
346
- self.assertEqual(args[0], "What is the content?")
347
- self.assertEqual(args[2], dummy_llm_model)
348
-
349
- # Verify the final response content and tool_call_id in the returned Command
350
- response_message = result.update["messages"][0]
351
- expected_output = "Test Answer\n\nSources:\n- Paper One"
352
- self.assertEqual(response_message.content, expected_output)
353
- self.assertEqual(response_message.tool_call_id, "test_tool_call")
354
-
355
- @patch(
356
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.generate_answer"
357
- )
358
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.Vectorstore")
359
- def test_question_and_answer_semantic_branch(
360
- self, mock_vectorstore, mock_generate_answer
361
- ):
362
- """test the semantic ranking branch of the question_and_answer tool."""
363
- # Create a dummy document to simulate a retrieved chunk from semantic ranking
364
- dummy_doc = Document(
365
- page_content="Semantic chunk",
366
- metadata={"paper_id": "paper_sem", "title": "Paper Semantic", "page": 2},
367
- )
368
-
369
- # Configure generate_answer to return a dummy answer result
370
- mock_generate_answer.return_value = {
371
- "output_text": "Semantic Answer",
372
- "papers_used": ["paper_sem"],
373
- }
374
-
375
- # Create a dummy Vectorstore instance to simulate the semantic branch behavior
376
- dummy_vs = MagicMock()
377
- # Initially, no papers are loaded
378
- dummy_vs.loaded_papers = set()
379
- # Explicitly set vector_store to None so that the build_vector_store branch is taken
380
- dummy_vs.vector_store = None
381
- # When build_vector_store is called, simulate that the vector store is built
382
- dummy_vs.build_vector_store.side_effect = lambda: setattr(
383
- dummy_vs, "vector_store", True
384
- )
385
- # Simulate ranking: return a single paper id with score as a tuple for unpacking
386
- dummy_vs.rank_papers_by_query.return_value = [("paper_sem", 1.0)]
387
- # Simulate retrieval: return our dummy document
388
- dummy_vs.retrieve_relevant_chunks.return_value = [dummy_doc]
389
- # Ensure add_paper is available (it may be called more than once)
390
- dummy_vs.add_paper.return_value = None
391
-
392
- # When the tool instantiates Vectorstore, return our dummy instance
393
- mock_vectorstore.return_value = dummy_vs
394
-
395
- # Create dummy embedding and LLM models
396
- dummy_embedding_model = MagicMock(spec=Embeddings)
397
- dummy_llm_model = MagicMock()
398
-
399
- # Construct the state WITHOUT a vector_store to force creation,
400
- # and without explicit paper_ids so the semantic branch is taken.
401
- state = {
402
- "article_data": {
403
- "paper_sem": {
404
- "pdf_url": "http://example.com/paper_sem.pdf",
405
- "Title": "Paper Semantic",
406
- }
407
- },
408
- "text_embedding_model": dummy_embedding_model,
409
- "llm_model": dummy_llm_model,
410
- # Note: "vector_store" key is omitted intentionally
411
- }
412
-
413
- input_data = {
414
- "question": "What is semantic content?",
415
- "paper_ids": None,
416
- "use_all_papers": False,
417
- "tool_call_id": "test_semantic_tool_call",
418
- "state": state,
419
- }
420
- result = question_and_answer.run(input_data)
421
-
422
- # Instead of checking that 'vector_store' was added to the original state dict,
423
- # verify that a new vector store was created by checking that Vectorstore was instantiated.
424
- mock_vectorstore.assert_called_once_with(embedding_model=dummy_embedding_model)
425
-
426
- # Check that add_paper was called at least once (semantic branch should load the paper)
427
- self.assertTrue(dummy_vs.add_paper.call_count >= 1)
428
-
429
- # Verify that build_vector_store was called to set up the store
430
- dummy_vs.build_vector_store.assert_called()
431
-
432
- # Verify that rank_papers_by_query was called with the expected question and top_k=3
433
- dummy_vs.rank_papers_by_query.assert_called_with(
434
- "What is semantic content?", top_k=40
435
- )
436
-
437
- # Verify that retrieve_relevant_chunks was called with the selected paper id.
438
- dummy_vs.retrieve_relevant_chunks.assert_called_with(
439
- query="What is semantic content?", paper_ids=["paper_sem"], top_k=25
440
- )
441
-
442
- # Verify that generate_answer was called with the expected arguments
443
- mock_generate_answer.assert_called_once()
444
- args, _ = mock_generate_answer.call_args
445
- self.assertEqual(args[0], "What is semantic content?")
446
- self.assertEqual(args[2], dummy_llm_model)
447
-
448
- # Verify that the final response message is correctly
449
- # formatted with answer and source attribution
450
- response_message = result.update["messages"][0]
451
- expected_output = "Semantic Answer\n\nSources:\n- Paper Semantic"
452
- self.assertEqual(response_message.content, expected_output)
453
- self.assertEqual(response_message.tool_call_id, "test_semantic_tool_call")
454
-
455
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.Vectorstore")
456
- def test_question_and_answer_fallback_no_relevant_chunks(self, mock_vectorstore):
457
- """Test the fallback branch of the question_and_answer
458
- tool when no relevant chunks are found."""
459
- # Create a dummy Vectorstore instance to simulate fallback and error conditions.
460
- dummy_vs = MagicMock()
461
- # Ensure no papers are loaded initially.
462
- dummy_vs.loaded_papers = set()
463
- # Simulate that the vector store is not built.
464
- dummy_vs.vector_store = None
465
- # Simulate ranking returning an empty list to force the fallback branch.
466
- dummy_vs.rank_papers_by_query.return_value = []
467
- # In the "load selected papers" loop, simulate that add_paper raises an exception.
468
- dummy_vs.add_paper.side_effect = IOError("Test error")
469
- # When build_vector_store is called, simulate setting the vector store.
470
- dummy_vs.build_vector_store.side_effect = lambda: setattr(
471
- dummy_vs, "vector_store", True
472
- )
473
- # Simulate retrieval returning an empty list so that a RuntimeError is raised.
474
- dummy_vs.retrieve_relevant_chunks.return_value = []
475
- mock_vectorstore.return_value = dummy_vs
476
-
477
- # Create dummy embedding and LLM models.
478
- dummy_embedding_model = MagicMock(spec=Embeddings)
479
- dummy_llm_model = MagicMock()
480
-
481
- # Construct state with article_data containing one paper.
482
- state = {
483
- "article_data": {
484
- "paper1": {
485
- "pdf_url": "http://example.com/paper1.pdf",
486
- "Title": "Paper One",
487
- }
488
- },
489
- "text_embedding_model": dummy_embedding_model,
490
- "llm_model": dummy_llm_model,
491
- # "vector_store" key is omitted intentionally to force creation.
492
- }
493
-
494
- input_data = {
495
- "question": "What is fallback test?",
496
- # Provide paper_ids that do not match article_data so that the
497
- # fallback branch is triggered.
498
- "paper_ids": ["nonexistent"],
499
- "use_all_papers": False,
500
- "tool_call_id": "test_fallback_call",
501
- "state": state,
502
- }
503
-
504
- with self.assertRaises(RuntimeError) as context:
505
- question_and_answer.run(input_data)
506
-
507
- # Verify that build_vector_store was called to ensure the store is built.
508
- dummy_vs.build_vector_store.assert_called()
509
-
510
- # Verify that the RuntimeError contains the expected error message.
511
- self.assertIn(
512
- "I couldn't find relevant information to answer your question",
513
- str(context.exception),
514
- )
515
-
516
- @patch(
517
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.generate_answer"
518
- )
519
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.Vectorstore")
520
- def test_question_and_answer_use_all_papers(
521
- self, mock_vectorstore, mock_generate_answer
522
- ):
523
- """test the use_all_papers branch of the question_and_answer tool."""
524
- # Test the branch where use_all_papers is True.
525
- # Create a dummy document for retrieval.
526
- dummy_doc = Document(
527
- page_content="Content from all papers branch",
528
- metadata={"paper_id": "paper_all", "title": "Paper All", "page": 1},
529
- )
530
- # Configure generate_answer to return a dummy answer.
531
- mock_generate_answer.return_value = {
532
- "output_text": "Answer from all papers",
533
- "papers_used": ["paper_all"],
534
- }
535
-
536
- # Create a dummy vector store that is already built and already loaded with the paper.
537
- dummy_vs = MagicMock()
538
- dummy_vs.vector_store = True
539
- # Simulate that the paper is already loaded.
540
- dummy_vs.loaded_papers = {"paper_all"}
541
- # Simulate retrieval returning the dummy document.
542
- dummy_vs.retrieve_relevant_chunks.return_value = [dummy_doc]
543
- # No add_paper call should be needed.
544
- dummy_vs.add_paper.return_value = None
545
- # Return our dummy vector store when Vectorstore() is instantiated
546
- mock_vectorstore.return_value = dummy_vs
547
-
548
- # Construct state with article_data containing one paper and an existing vector_store.
549
- dummy_embedding_model = MagicMock(spec=Embeddings)
550
- dummy_llm_model = MagicMock()
551
- state = {
552
- "article_data": {
553
- "paper_all": {
554
- "pdf_url": "http://example.com/paper_all.pdf",
555
- "Title": "Paper All",
556
- }
557
- },
558
- "text_embedding_model": dummy_embedding_model,
559
- "llm_model": dummy_llm_model,
560
- "vector_store": dummy_vs, # Existing vector store
561
- }
562
-
563
- input_data = {
564
- "question": "What is the content from all papers?",
565
- "paper_ids": None,
566
- "use_all_papers": True,
567
- "tool_call_id": "test_use_all_papers",
568
- "state": state,
569
- }
570
- result = question_and_answer.run(input_data)
571
-
572
- # Verify that the use_all_papers branch was
573
- # taken by checking that all article keys were selected.
574
- # (This is logged; here we indirectly verify
575
- # that generate_answer was called with the dummy_llm_model.)
576
- mock_generate_answer.assert_called_once()
577
- args, _ = mock_generate_answer.call_args
578
- self.assertEqual(args[0], "What is the content from all papers?")
579
- self.assertEqual(args[2], dummy_llm_model)
580
-
581
- # Verify that the final response message includes the answer and source attribution.
582
- response_message = result.update["messages"][0]
583
- expected_output = "Answer from all papers\n\nSources:\n- Paper All"
584
- self.assertEqual(response_message.content, expected_output)
585
- self.assertEqual(response_message.tool_call_id, "test_use_all_papers")
586
-
587
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.Vectorstore")
588
- def test_question_and_answer_add_paper_exception(self, mock_vectorstore):
589
- """test exception handling when add_paper fails."""
590
- # Test that in the semantic ranking branch, if add_paper raises an exception,
591
- # the error is logged and then re-raised.
592
- dummy_vs = MagicMock()
593
- # No papers are loaded.
594
- dummy_vs.loaded_papers = set()
595
- # Simulate that the vector store is not built.
596
- dummy_vs.vector_store = None
597
- # In the semantic branch, when trying to load the paper, add_paper will raise an exception.
598
- dummy_vs.add_paper.side_effect = IOError("Add paper failure")
599
- # Simulate that build_vector_store would set the store
600
- # (if reached, but it won't in this test).
601
- dummy_vs.build_vector_store.side_effect = lambda: setattr(
602
- dummy_vs, "vector_store", True
603
- )
604
- # Ensure retrieval is never reached because add_paper fails.
605
- dummy_vs.retrieve_relevant_chunks.return_value = []
606
- mock_vectorstore.return_value = dummy_vs
607
-
608
- dummy_embedding_model = MagicMock(spec=Embeddings)
609
- dummy_llm_model = MagicMock()
610
- # Construct state with article_data containing one paper.
611
- state = {
612
- "article_data": {
613
- "paper_err": {
614
- "pdf_url": "http://example.com/paper_err.pdf",
615
- "Title": "Paper Error",
616
- }
617
- },
618
- "text_embedding_model": dummy_embedding_model,
619
- "llm_model": dummy_llm_model,
620
- # No vector_store key provided to force creation of a new one.
621
- }
622
-
623
- # Use paper_ids=None and use_all_papers=False to trigger semantic ranking branch.
624
- input_data = {
625
- "question": "What happens when add_paper fails?",
626
- "paper_ids": None,
627
- "use_all_papers": False,
628
- "tool_call_id": "test_add_paper_exception",
629
- "state": state,
630
- }
631
- with self.assertRaises(IOError) as context:
632
- question_and_answer.run(input_data)
633
- self.assertIn("Add paper failure", str(context.exception))
634
-
635
- @patch("aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.PyPDFLoader")
330
+ @patch("aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store.PyPDFLoader")
636
331
  def test_additional_metadata_field_added(self, mock_pypdf_loader):
637
332
  """test that additional metadata fields are added correctly."""
638
333
  # Setup the PDF loader to return a single document with empty metadata
@@ -647,7 +342,9 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
647
342
  # Define custom metadata fields including an additional field "custom_field"
648
343
  custom_fields = ["title", "paper_id", "page", "chunk_id", "custom_field"]
649
344
  vector_store = Vectorstore(
650
- embedding_model=dummy_embedding_model, metadata_fields=custom_fields
345
+ embedding_model=dummy_embedding_model,
346
+ metadata_fields=custom_fields,
347
+ config=load_hydra_config(),
651
348
  )
652
349
 
653
350
  # Paper metadata includes "Title" (for default title) and the additional "custom_field"
@@ -665,10 +362,7 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
665
362
  added_doc = vector_store.documents["test_paper_0"]
666
363
  self.assertEqual(added_doc.metadata.get("custom_field"), "custom_value")
667
364
 
668
- @patch(
669
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.load_hydra_config"
670
- )
671
- def test_generate_answer_missing_config_fields(self, mock_load_config):
365
+ def test_generate_answer_missing_config_fields(self):
672
366
  """test that generate_answer raises ValueError for missing config fields."""
673
367
  # Create a dummy document and dummy LLM model
674
368
  dummy_doc = Document(
@@ -677,133 +371,62 @@ class TestQuestionAndAnswerTool(unittest.TestCase):
677
371
  dummy_llm_model = MagicMock()
678
372
 
679
373
  # Case 1: Configuration is None, expect a ValueError
680
- mock_load_config.return_value = None
681
374
  with self.assertRaises(ValueError) as context_none:
682
- generate_answer("What is the test?", [dummy_doc], dummy_llm_model)
375
+ generate_answer("What is the test?", [dummy_doc], dummy_llm_model, None)
683
376
  self.assertEqual(
684
- str(context_none.exception), "Hydra config loading failed: config is None."
377
+ str(context_none.exception),
378
+ "Configuration for generate_answer is required.",
685
379
  )
686
380
 
687
381
  # Case 2: Configuration missing 'prompt_template', expect a ValueError
688
- mock_load_config.return_value = {}
689
382
  with self.assertRaises(ValueError) as context_missing:
690
- generate_answer("What is the test?", [dummy_doc], dummy_llm_model)
383
+ generate_answer("What is the test?", [dummy_doc], dummy_llm_model, {})
691
384
  self.assertEqual(
692
385
  str(context_missing.exception),
693
386
  "The prompt_template is missing from the configuration.",
694
387
  )
695
388
 
696
-
697
- class TestMissingState(unittest.TestCase):
698
- """Test error when missing from state."""
699
-
700
- def test_missing_text_embedding_model(self):
701
- """Test error when text_embedding_model is missing from state."""
702
- state = {
703
- # Missing text_embedding_model
704
- "llm_model": MagicMock(),
705
- "article_data": {
706
- "paper1": {
707
- "pdf_url": "http://example.com/test.pdf",
708
- "Title": "Test Paper",
709
- }
710
- },
711
- }
712
- tool_call_id = "test_call_2"
713
- question = "What is the conclusion?"
714
- tool_input = {
715
- "question": question,
716
- "tool_call_id": tool_call_id,
717
- "state": state,
718
- }
719
- with self.assertRaises(ValueError) as context:
720
- question_and_answer.run(tool_input)
721
- self.assertEqual(
722
- str(context.exception), "No text embedding model found in state."
723
- )
724
-
725
- def test_missing_llm_model(self):
726
- """Test error when llm_model is missing from state."""
727
- state = {
728
- "text_embedding_model": MagicMock(),
729
- # Missing llm_model
730
- "article_data": {
731
- "paper1": {
732
- "pdf_url": "http://example.com/test.pdf",
733
- "Title": "Test Paper",
734
- }
735
- },
736
- }
737
- tool_call_id = "test_call_3"
738
- question = "What is the conclusion?"
739
- tool_input = {
740
- "question": question,
741
- "tool_call_id": tool_call_id,
742
- "state": state,
743
- }
744
- with self.assertRaises(ValueError) as context:
745
- question_and_answer.run(tool_input)
746
- self.assertEqual(str(context.exception), "No LLM model found in state.")
747
-
748
- def test_missing_article_data(self):
749
- """Test error when article_data is missing from state."""
750
- state = {
751
- "text_embedding_model": MagicMock(),
752
- "llm_model": MagicMock(),
753
- # Missing article_data
754
- }
755
- tool_call_id = "test_call_4"
756
- question = "What is the conclusion?"
757
- tool_input = {
758
- "question": question,
759
- "tool_call_id": tool_call_id,
760
- "state": state,
761
- }
762
- with self.assertRaises(ValueError) as context:
763
- question_and_answer.run(tool_input)
764
- self.assertEqual(str(context.exception), "No article_data found in state.")
765
-
766
- def test_empty_article_data(self):
767
- """
768
- Test that when article_data exists but is empty (no paper keys), a ValueError is raised.
769
- """
770
- state = {
771
- "text_embedding_model": MagicMock(),
772
- "llm_model": MagicMock(),
773
- "article_data": {}, # empty dict
774
- }
775
- tool_call_id = "test_empty_article_data"
776
- question = "What is the summary?"
777
- tool_input = {
778
- "question": question,
779
- "tool_call_id": tool_call_id,
780
- "state": state,
781
- }
782
- with self.assertRaises(ValueError) as context:
783
- question_and_answer.run(tool_input)
784
- self.assertEqual(str(context.exception), "No article_data found in state.")
785
-
786
- @patch(
787
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.maximal_marginal_relevance"
788
- )
789
- def test_retrieve_relevant_chunks_with_filtering(self, mock_mmr):
389
+ def test_state_validation_errors(self):
390
+ """Test errors raised for missing state entries."""
391
+ valid_articles = {"paper1": {"pdf_url": "u", "Title": "T1"}}
392
+ cases = [
393
+ ({"llm_model": MagicMock(), "article_data": valid_articles},
394
+ "No text embedding model found in state."),
395
+ ({"text_embedding_model": MagicMock(), "article_data": valid_articles},
396
+ "No LLM model found in state."),
397
+ ({"text_embedding_model": MagicMock(), "llm_model": MagicMock()},
398
+ "No article_data found in state."),
399
+ ({"text_embedding_model": MagicMock(), "llm_model": MagicMock(), "article_data": {}},
400
+ "No article_data found in state."),
401
+ ]
402
+ for state_dict, expected_msg in cases:
403
+ with self.subTest(state=state_dict):
404
+ tool_input = {"question": "Q?", "state": state_dict, "tool_call_id": "id"}
405
+ with self.assertRaises(ValueError) as cm:
406
+ question_and_answer.run(tool_input)
407
+ self.assertEqual(str(cm.exception), expected_msg)
408
+
409
+ def test_retrieve_relevant_chunks_with_filtering(self):
790
410
  """Test that filtering works by paper_ids."""
791
- mock_mmr.return_value = [0]
792
- dummy_embedding = [0.1, 0.2, 0.3]
793
-
794
411
  mock_embedding_model = MagicMock(spec=Embeddings)
795
- mock_embedding_model.embed_query.return_value = dummy_embedding
796
- mock_embedding_model.embed_documents.return_value = [dummy_embedding]
412
+ mock_embedding_model.embed_query.return_value = [0.1, 0.2, 0.3]
413
+ mock_embedding_model.embed_documents.return_value = [[0.1, 0.2, 0.3]]
797
414
 
798
- vector_store = Vectorstore(embedding_model=mock_embedding_model)
415
+ vector_store = Vectorstore(
416
+ embedding_model=mock_embedding_model, config=load_hydra_config()
417
+ )
799
418
  vector_store.vector_store = True
800
419
  # Add document chunks with necessary metadata including chunk_ids
801
- doc1 = Document(page_content="Doc 1", metadata={"paper_id": "paper1", "chunk_id": 0})
802
- doc2 = Document(page_content="Doc 2", metadata={"paper_id": "paper2", "chunk_id": 1})
420
+ doc1 = Document(
421
+ page_content="Doc 1", metadata={"paper_id": "paper1", "chunk_id": 0}
422
+ )
423
+ doc2 = Document(
424
+ page_content="Doc 2", metadata={"paper_id": "paper2", "chunk_id": 1}
425
+ )
803
426
  vector_store.documents = {"doc1": doc1, "doc2": doc2}
804
427
 
805
- results = vector_store.retrieve_relevant_chunks(
806
- query="query", paper_ids=["paper1"]
428
+ results = retrieve_relevant_chunks(
429
+ vector_store, query="query", paper_ids=["paper1"]
807
430
  )
808
431
  assert len(results) == 1
809
432
  assert results[0].metadata["paper_id"] == "paper1"
@@ -814,65 +437,93 @@ class TestMissingState(unittest.TestCase):
814
437
  mock_embedding_model.embed_query.return_value = [0.1, 0.2, 0.3]
815
438
  mock_embedding_model.embed_documents.return_value = []
816
439
 
817
- vector_store = Vectorstore(embedding_model=mock_embedding_model)
440
+ vector_store = Vectorstore(
441
+ embedding_model=mock_embedding_model, config=load_hydra_config()
442
+ )
818
443
  vector_store.vector_store = True
819
444
  # Add doc with paper_id that won't match
820
445
  vector_store.documents["doc1"] = Document(
821
446
  page_content="No match", metadata={"paper_id": "unmatched_paper"}
822
447
  )
823
-
824
- results = vector_store.retrieve_relevant_chunks(
825
- query="test", paper_ids=["nonexistent_id"]
448
+ # Use util function for retrieval
449
+ results = retrieve_relevant_chunks(
450
+ vector_store, query="test", paper_ids=["nonexistent_id"]
826
451
  )
827
452
  assert results == []
828
453
 
829
454
  @patch(
830
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.load_hydra_config"
455
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer."
456
+ "helper.get_state_models_and_data"
831
457
  )
832
458
  @patch(
833
- "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.generate_answer"
459
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer."
460
+ "helper.init_vector_store"
834
461
  )
835
- def test_prebuilt_vector_store_branch(self, mock_generate, mock_load_config):
836
- """Test question_and_answer tool with a shared pre-built vector store branch."""
837
- # Mock configuration for tool-level thresholds
838
- config = SimpleNamespace(top_k_papers=1, top_k_chunks=1)
839
- mock_load_config.return_value = config
840
- # Mock generate_answer to return a simple response
841
- mock_generate.return_value = {"output_text": "Answer", "papers_used": ["p1"]}
842
-
843
- # Prepare a dummy pre-built vector store
844
- dummy_vs = SimpleNamespace(
845
- loaded_papers=set(),
846
- vector_store=True,
847
- retrieve_relevant_chunks=lambda *_args, **_kwargs: [
848
- Document(page_content="chunk", metadata={"paper_id": "p1"})
849
- ],
850
- )
851
- # Override the module-level prebuilt_vector_store
852
- qa_module.prebuilt_vector_store = dummy_vs
853
-
854
- # Prepare state with required models and article_data
855
- state = {
856
- "text_embedding_model": MagicMock(),
857
- "llm_model": MagicMock(),
858
- "article_data": {"p1": {"source": "upload"}},
859
- }
860
-
861
- # Invoke the tool-level function via .run with appropriate input schema
862
- input_data = {
863
- "question": "What?",
864
- "paper_ids": None,
865
- "use_all_papers": False,
866
- "tool_call_id": "testid",
867
- "state": state,
868
- }
869
- result = qa_module.question_and_answer.run(input_data)
870
-
871
- # Ensure the prebuilt branch was used and a Command is returned
872
- self.assertTrue(hasattr(result, "update"))
873
- messages = result.update.get("messages", [])
874
- self.assertEqual(len(messages), 1)
875
- self.assertIsInstance(messages[0], ToolMessage)
462
+ @patch(
463
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer."
464
+ "retrieve_relevant_chunks"
465
+ )
466
+ @patch.multiple(
467
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.helper",
468
+ run_reranker=lambda vs, query, candidates: ["p1"],
469
+ format_answer=lambda question, chunks, llm, articles: "formatted answer",
470
+ )
471
+ def test_question_and_answer_happy_path(
472
+ self, mock_retrieve, mock_init, mock_state
473
+ ):
474
+ """Test happy path for question_and_answer tool."""
475
+ # Setup helper and utility mocks
476
+ emb = object()
477
+ llm = object()
478
+ articles = {"p1": {"pdf_url": "u"}}
479
+ mock_state.return_value = (emb, llm, articles)
480
+ # Provide dummy vector store for loading
481
+ vs = SimpleNamespace(loaded_papers=set(), add_paper=MagicMock())
482
+ mock_init.return_value = vs
483
+ # Dummy chunk list for retrieval
484
+ dummy_chunk = Document(page_content="c", metadata={"paper_id": "p1"})
485
+ mock_retrieve.return_value = [dummy_chunk]
486
+
487
+ # Use module-level question_and_answer
488
+
489
+ state = {}
490
+ tool_input = {"question": "Q?", "state": state, "tool_call_id": "tid"}
491
+ result = question_and_answer.run(tool_input)
492
+ # Verify Command message content and tool_call_id
493
+ msgs = result.update.get("messages", [])
494
+ self.assertEqual(len(msgs), 1)
495
+ msg = msgs[0]
496
+ self.assertEqual(msg.content, "formatted answer")
497
+ self.assertEqual(msg.tool_call_id, "tid")
876
498
 
877
- # Clean up global override
878
- qa_module.prebuilt_vector_store = None
499
+ @patch(
500
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.helper."
501
+ "get_state_models_and_data"
502
+ )
503
+ @patch(
504
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.helper.init_vector_store"
505
+ )
506
+ @patch(
507
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.helper.run_reranker",
508
+ return_value=["p1"],
509
+ )
510
+ @patch(
511
+ "aiagents4pharma.talk2scholars.tools.pdf.question_and_answer.retrieve_relevant_chunks",
512
+ return_value=[],
513
+ )
514
+ def test_question_and_answer_no_chunks(
515
+ self, _mock_retrieve, _mock_rerank, mock_init, mock_state
516
+ ):
517
+ """Test that no chunks raises RuntimeError."""
518
+ emb = object()
519
+ llm = object()
520
+ articles = {"p1": {"pdf_url": "u"}}
521
+ mock_state.return_value = (emb, llm, articles)
522
+ # Provide dummy vector store to satisfy load_candidate_papers
523
+ vs = SimpleNamespace(loaded_papers=set(), add_paper=MagicMock())
524
+ mock_init.return_value = vs
525
+
526
+ tool_input = {"question": "Q?", "state": {}, "tool_call_id": "id"}
527
+ with self.assertRaises(RuntimeError) as cm:
528
+ question_and_answer.run(tool_input)
529
+ self.assertIn("No relevant chunks found for question", str(cm.exception))