aiagents4pharma 1.40.0__py3-none-any.whl → 1.41.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
  2. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
  3. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
  4. aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
  5. aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
  6. aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
  7. aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
  8. aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
  9. aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
  10. aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
  11. aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
  12. aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
  13. aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
  14. aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
  15. aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
  16. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
  17. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
  18. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
  19. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
  20. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
  21. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
  22. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  23. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
  24. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  25. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  26. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
  27. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
  35. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
  36. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
  37. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
  38. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
  39. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
  40. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
  41. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/METADATA +27 -115
  42. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/RECORD +45 -23
  43. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
  44. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/WHEEL +0 -0
  45. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/licenses/LICENSE +0 -0
  46. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,434 @@
1
+ """
2
+ Unit tests for the Vectorstore class with GPU support and embedding normalization.
3
+ """
4
+
5
+ from types import SimpleNamespace
6
+ from unittest.mock import MagicMock, patch
7
+
8
+ import pytest
9
+ from langchain_core.documents import Document
10
+ from langchain_core.embeddings import Embeddings
11
+
12
+ from aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store import Vectorstore
13
+
14
+ MODULE = "aiagents4pharma.talk2scholars.tools.pdf.utils.vector_store"
15
+
16
+
17
@pytest.fixture(name="mock_config")
def _mock_config():
    """
    Build a lightweight stand-in configuration for Vectorstore tests.

    The returned namespace exposes a ``milvus`` section (connection details,
    collection/database names, a 384-wide embedding dimension) and a
    ``gpu_detection`` section with ``force_cpu_mode`` switched off.
    """
    milvus_section = SimpleNamespace(
        host="localhost",
        port=19530,
        collection_name="test_collection",
        db_name="test_db",
        embedding_dim=384,
    )
    gpu_section = SimpleNamespace(force_cpu_mode=False)
    return SimpleNamespace(milvus=milvus_section, gpu_detection=gpu_section)
32
+
33
+
34
@pytest.fixture(name="mock_embedding")
def _mock_embedding():
    """
    Supply a MagicMock constrained to the Embeddings interface.
    """
    embedding_double = MagicMock(spec=Embeddings)
    return embedding_double
40
+
41
+
42
@pytest.fixture(name="dummy_embedding")
def _dummy_embedding():
    """
    Supply a second, independent MagicMock constrained to the Embeddings interface.
    """
    embedding_double = MagicMock(spec=Embeddings)
    return embedding_double
48
+
49
+
50
@pytest.fixture(name="dummy_config")
def _dummy_config():
    """
    Build a stand-in configuration using a 768-wide embedding dimension.

    Mirrors the layout of the ``mock_config`` fixture: a ``milvus`` section
    and a ``gpu_detection`` section with ``force_cpu_mode`` switched off.
    """
    milvus_section = SimpleNamespace(
        host="localhost",
        port=19530,
        collection_name="test_collection",
        db_name="test_db",
        embedding_dim=768,
    )
    gpu_section = SimpleNamespace(force_cpu_mode=False)
    return SimpleNamespace(milvus=milvus_section, gpu_detection=gpu_section)
65
+
66
+
67
@pytest.fixture(name="dummy_vectorstore_components")
def _dummy_vectorstore_components():
    """
    Patch the vector_store module's collaborators and yield the doubles.

    While the fixture is active, GPU detection reports True, the index
    configuration is a fixed IVF_FLAT/IP pair, ``ensure_collection_exists``
    returns a MagicMock, and ``VectorstoreSingleton()`` returns a mock whose
    vector store exposes an empty collection under both ``col`` and
    ``collection``.

    Yields:
        tuple: ``(singleton_mock, vector_store_mock)``.
    """
    index_params = {"index_type": "IVF_FLAT", "metric_type": "IP"}
    search_params = {"nprobe": 10}
    with (
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(
            f"{MODULE}.get_optimal_index_config",
            return_value=(index_params, search_params),
        ),
        patch(f"{MODULE}.ensure_collection_exists", return_value=MagicMock()),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
    ):
        # Empty collection shared by both attribute names the store may use.
        empty_collection = MagicMock(num_entities=0)
        empty_collection.flush.return_value = None
        store_double = MagicMock(col=empty_collection, collection=empty_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = store_double
        singleton_double.get_connection.return_value = "connected"
        singleton_cls.return_value = singleton_double

        yield singleton_double, store_double
95
+
96
+
97
def test_vectorstore_initialization(mock_config, mock_embedding):
    """
    Construct a Vectorstore with every collaborator patched and check the
    attributes it exposes afterwards (embedding model, collection, GPU flag,
    vector store).
    """
    index_config = ({"metric_type": "IP"}, {})
    with (
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.log_index_configuration"),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.ensure_collection_exists", return_value="mock_collection"),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
    ):
        # Empty collection reachable through both attribute names.
        empty_collection = MagicMock(num_entities=0)
        empty_collection.flush.return_value = None
        store_double = MagicMock(col=empty_collection, collection=empty_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = store_double
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)

        assert vs.embedding_model is mock_embedding
        assert vs.collection == "mock_collection"
        assert vs.has_gpu
        assert vs.vector_store is store_double
129
+
130
+
131
def test_get_embedding_info(mock_config, mock_embedding):
    """
    Check the mapping returned by ``get_embedding_info``: the GPU and cosine
    flags are truthy and the model-type / normalization keys are present.
    """
    index_config = ({"metric_type": "IP", "index_type": "IVF"}, {})
    with (
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.log_index_configuration"),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.ensure_collection_exists", return_value="mock_collection"),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
    ):
        # Empty collection reachable through both attribute names.
        empty_collection = MagicMock(num_entities=0)
        empty_collection.flush.return_value = None
        store_double = MagicMock(col=empty_collection, collection=empty_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = store_double
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)
        info = vs.get_embedding_info()

        assert info["has_gpu"]
        assert info["use_cosine"]
        for expected_key in (
            "original_model_type",
            "wrapped_model_type",
            "normalization_enabled",
        ):
            assert expected_key in info
165
+
166
+
167
def test_load_existing_papers_with_exception(mock_embedding, mock_config):
    """
    Check that an error raised by the collection's ``flush`` surfaces from
    ``_load_existing_paper_ids``.

    The Vectorstore is first built against a well-behaved store; the store
    is then swapped for one whose collection fails on flush.
    """
    index_config = ({"metric_type": "IP"}, {})
    with (
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
        patch(f"{MODULE}.ensure_collection_exists"),
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.log_index_configuration"),
    ):
        # Healthy doubles used only while the constructor runs.
        healthy_collection = MagicMock(num_entities=0)
        healthy_collection.flush.return_value = None
        healthy_store = MagicMock(col=healthy_collection, collection=healthy_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = healthy_store
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)

        # Swap in a store whose collection raises on flush.
        failing_collection = MagicMock(num_entities=0)
        failing_collection.flush.side_effect = Exception("flush failed")
        vs.vector_store = MagicMock(col=failing_collection, collection=failing_collection)

        with pytest.raises(Exception) as raised:
            getattr(vs, "_load_existing_paper_ids")()
        assert "flush failed" in str(raised.value)
207
+
208
+
209
def test_ensure_collection_loaded_with_entities(mock_embedding, mock_config):
    """
    Check that the collection's ``load`` has been called once
    ``_ensure_collection_loaded`` runs against a collection reporting
    five entities.
    """
    index_config = ({"metric_type": "IP"}, {})
    with (
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
        patch(f"{MODULE}.ensure_collection_exists"),
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.log_index_configuration"),
    ):
        # Non-empty collection reachable through both attribute names.
        populated_collection = MagicMock(num_entities=5)
        populated_collection.flush.return_value = None
        store_double = MagicMock(
            col=populated_collection, collection=populated_collection
        )

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = store_double
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)
        vs.vector_store = store_double
        getattr(vs, "_ensure_collection_loaded")()

        assert populated_collection.load.called
240
+
241
+
242
def test_ensure_collection_loaded_handles_exception(mock_embedding, mock_config):
    """
    Check that an error raised by the collection's ``flush`` surfaces from
    ``_ensure_collection_loaded``.

    The Vectorstore is first built against a well-behaved store; the store
    is then swapped for one whose collection fails on flush.
    """
    index_config = ({"metric_type": "IP"}, {})
    with (
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
        patch(f"{MODULE}.ensure_collection_exists"),
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.log_index_configuration"),
    ):
        # Healthy doubles used only while the constructor runs.
        healthy_collection = MagicMock(num_entities=0)
        healthy_collection.flush.return_value = None
        healthy_store = MagicMock(col=healthy_collection, collection=healthy_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = healthy_store
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)

        # Swap in a store whose collection raises on flush.
        failing_collection = MagicMock(num_entities=0)
        failing_collection.flush.side_effect = Exception("flush error")
        vs.vector_store = MagicMock(col=failing_collection, collection=failing_collection)

        with pytest.raises(Exception) as raised:
            getattr(vs, "_ensure_collection_loaded")()
        assert "flush error" in str(raised.value)
282
+
283
+
284
def test_force_cpu_mode_logs_override(mock_config, mock_embedding):
    """
    Check that ``has_gpu`` ends up falsy when the config sets
    ``gpu_detection.force_cpu_mode`` even though GPU detection is patched
    to report True.
    """
    mock_config.gpu_detection.force_cpu_mode = True
    index_config = ({"metric_type": "IP"}, {})
    with (
        patch(f"{MODULE}.wrap_embedding_model_if_needed", return_value=mock_embedding),
        patch(f"{MODULE}.VectorstoreSingleton") as singleton_cls,
        patch(f"{MODULE}.ensure_collection_exists", return_value="mock_collection"),
        patch(f"{MODULE}.detect_nvidia_gpu", return_value=True),
        patch(f"{MODULE}.get_optimal_index_config", return_value=index_config),
        patch(f"{MODULE}.log_index_configuration"),
    ):
        # Empty collection reachable through both attribute names.
        empty_collection = MagicMock(num_entities=0)
        empty_collection.flush.return_value = None
        store_double = MagicMock(col=empty_collection, collection=empty_collection)

        singleton_double = MagicMock()
        singleton_double.get_vector_store.return_value = store_double
        singleton_double.get_connection.return_value = None
        singleton_cls.return_value = singleton_double

        vs = Vectorstore(embedding_model=mock_embedding, config=mock_config)

        assert not vs.has_gpu
314
+
315
+
316
def test_similarity_metric_override(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Check that ``use_cosine`` is falsy when the config carries
    ``similarity_metric.use_cosine = False``.
    """
    metric_section = SimpleNamespace(use_cosine=False)
    dummy_config.similarity_metric = metric_section
    # The fixture is needed for its patches; its yielded doubles go unused.
    _unused_singleton, _unused_store = dummy_vectorstore_components
    vs = Vectorstore(dummy_embedding, config=dummy_config)
    assert not vs.use_cosine
327
+
328
+
329
def test_load_existing_paper_ids_fallback_to_collection(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Run ``_load_existing_paper_ids`` against a store with neither ``col``
    nor ``collection`` and check ``loaded_papers`` is still a set.
    """
    store_double = dummy_vectorstore_components[1]
    # Strip whichever collection attributes the store double currently has.
    present_attrs = [
        name for name in ("col", "collection") if hasattr(store_double, name)
    ]
    for name in present_attrs:
        delattr(store_double, name)

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double
    getattr(vs, "_load_existing_paper_ids")()
    assert isinstance(vs.loaded_papers, set)
344
+
345
+
346
def test_load_existing_papers_collection_empty_logs(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Run ``_load_existing_paper_ids`` against an empty ``col`` collection and
    check that no papers end up recorded as loaded.
    """
    store_double = dummy_vectorstore_components[1]
    empty_collection = MagicMock(num_entities=0)
    empty_collection.flush.return_value = None
    store_double.col = empty_collection

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double
    getattr(vs, "_load_existing_paper_ids")()
    assert len(vs.loaded_papers) == 0
362
+
363
+
364
def test_similarity_search_filter_paths(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Call ``similarity_search`` with a filter mixing string, list, int and
    float values and check that a list comes back.
    """
    store_double = dummy_vectorstore_components[1]
    canned_hits = [Document(page_content="test")]
    store_double.similarity_search.return_value = canned_hits

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double

    # One entry per value type: str, list, int, float.
    mixed_filter = {"field1": "value", "field2": [1, 2], "field3": 99, "field4": 3.14}
    result = vs.similarity_search(query="text", filter=mixed_filter)
    assert isinstance(result, list)
383
+
384
+
385
def test_mmr_search_filter_paths(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Call ``max_marginal_relevance_search`` with a filter mixing string, list,
    int and float values and check that a list comes back.
    """
    store_double = dummy_vectorstore_components[1]
    canned_hits = [Document(page_content="test")]
    store_double.max_marginal_relevance_search.return_value = canned_hits

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double

    # One entry per value type: str, list, int, float.
    mixed_filter = {
        "f": "text",
        "g": ["a", "b"],
        "h": 7,
        "j": 3.3,
    }
    result = vs.max_marginal_relevance_search(query="q", filter=mixed_filter)
    assert isinstance(result, list)
401
+
402
+
403
def test_ensure_collection_loaded_no_col_and_no_collection(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Run ``_ensure_collection_loaded`` against a store with neither ``col``
    nor ``collection``; the test passes if the call does not raise.
    """
    store_double = dummy_vectorstore_components[1]
    # Strip whichever collection attributes the store double currently has.
    present_attrs = [
        name for name in ("col", "collection") if hasattr(store_double, name)
    ]
    for name in present_attrs:
        delattr(store_double, name)

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double
    getattr(vs, "_ensure_collection_loaded")()
    # Reaching this point without an exception is the whole check.
418
+
419
+
420
def test_ensure_collection_loaded_empty_logs(
    dummy_embedding, dummy_config, dummy_vectorstore_components
):
    """
    Run ``_ensure_collection_loaded`` against an empty ``col`` collection;
    the test passes if the call does not raise.
    """
    store_double = dummy_vectorstore_components[1]
    store_double.col = MagicMock(num_entities=0)

    vs = Vectorstore(dummy_embedding, config=dummy_config)
    vs.vector_store = store_double
    getattr(vs, "_ensure_collection_loaded")()
    # Reaching this point without an exception is the whole check.