aiagents4pharma 1.40.1__py3-none-any.whl → 1.42.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +1 -1
  2. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +37 -0
  3. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  4. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  5. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  6. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  7. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +752 -350
  8. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
  9. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
  10. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
  11. aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
  12. aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
  13. aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
  14. aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
  15. aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
  16. aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
  17. aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
  18. aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
  19. aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
  20. aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
  21. aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
  22. aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
  23. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
  24. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
  25. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
  26. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
  27. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
  35. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
  36. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
  37. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  38. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
  39. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
  40. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  41. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
  42. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
  43. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
  44. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
  45. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
  46. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
  47. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
  48. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/METADATA +52 -126
  49. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/RECORD +52 -25
  50. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
  51. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/WHEEL +0 -0
  52. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/licenses/LICENSE +0 -0
  53. {aiagents4pharma-1.40.1.dist-info → aiagents4pharma-1.42.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,200 @@
1
+ """
2
+ Batch processing utilities for adding multiple papers to vector store.
3
+ """
4
+
5
+ import concurrent.futures
6
+ import logging
7
+ import time
8
+ from typing import Any, Dict, List, Set, Tuple
9
+
10
+ from langchain_core.documents import Document
11
+
12
+ from .document_processor import load_and_split_pdf
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
def add_papers_batch(
    papers_to_add: List[Tuple[str, str, Dict[str, Any]]],
    vector_store: Any,
    loaded_papers: Set[str],
    paper_metadata: Dict[str, Dict[str, Any]],
    documents: Dict[str, Document],
    **kwargs: Any,
) -> None:
    """
    Add multiple papers to the document store in parallel with batch embedding.

    Papers whose ID is already in ``loaded_papers`` are skipped. The remaining
    PDFs are loaded and split by ``_parallel_load_and_split`` and the resulting
    chunks are inserted by ``_batch_embed``. ``paper_metadata`` and
    ``loaded_papers`` are updated only after the insert has succeeded, so a
    failed insert does not leave bookkeeping entries for papers that were
    never marked as loaded.

    Args:
        papers_to_add: List of tuples (paper_id, pdf_url, paper_metadata).
        vector_store: The LangChain Milvus vector store instance.
        loaded_papers: Set to track which papers are already loaded.
        paper_metadata: Dict to store paper metadata after load.
        documents: Dict to store document chunks.
        config: (via kwargs, required) Configuration object.
        metadata_fields: (via kwargs, required) List of metadata fields to include.
        has_gpu: (via kwargs) Whether GPU is available (default False).
        max_workers: (via kwargs) Max PDF-loading threads (default 5).
        batch_size: (via kwargs) Embedding batch size (default 100).

    Raises:
        KeyError: If ``config`` or ``metadata_fields`` is missing from kwargs.
        Exception: Whatever the PDF loading or the vector store insert raises;
            insert failures are logged and re-raised unchanged.
    """
    if not papers_to_add:
        logger.info("No papers to add")
        return

    to_process = [
        (pid, url, md) for pid, url, md in papers_to_add if pid not in loaded_papers
    ]

    # Report skipped papers whenever there are any, not only when all are skipped.
    skipped = len(papers_to_add) - len(to_process)
    if skipped:
        logger.info("Skipping %d already-loaded papers", skipped)
    if not to_process:
        logger.info("All %d papers are already loaded", len(papers_to_add))
        return

    # has_gpu only selects a log label and is passed through to the embedder,
    # so a missing value should not abort the whole batch.
    has_gpu = kwargs.get("has_gpu", False)
    max_workers = kwargs.get("max_workers", 5)

    logger.info(
        "Starting PARALLEL batch processing of %d papers with %d workers (%s)",
        len(to_process),
        max_workers,
        "GPU acceleration" if has_gpu else "CPU processing",
    )

    chunks, ids, success = _parallel_load_and_split(
        to_process,
        kwargs["config"],
        kwargs["metadata_fields"],
        documents,
        max_workers,
    )

    if not chunks:
        logger.warning("No chunks to add to vector store")
        return

    try:
        _batch_embed(
            chunks,
            ids,
            vector_store,
            kwargs.get("batch_size", 100),
            has_gpu,
        )
    except Exception:
        logger.exception("Failed to add chunks to Milvus")
        raise

    # Bookkeeping happens only after a successful insert; a set keeps the
    # membership test O(1) per paper.
    succeeded = set(success)
    for pid, _, md in to_process:
        if pid in succeeded:
            paper_metadata[pid] = md
    loaded_papers.update(succeeded)
91
+
92
+
93
def _parallel_load_and_split(
    papers: List[Tuple[str, str, Dict[str, Any]]],
    config: Any,
    metadata_fields: List[str],
    documents: Dict[str, Document],
    max_workers: int,
) -> Tuple[List[Document], List[str], List[str]]:
    """
    Load and split every PDF in ``papers`` using a thread pool.

    Each paper is handed to ``load_and_split_pdf``; results are gathered in
    completion order, so the returned lists are not in input order.

    Returns:
        A tuple ``(chunks, chunk_ids, paper_ids)`` where ``chunk_ids`` has one
        ``"{paper_id}_{index}"`` entry per chunk and ``paper_ids`` lists the
        papers whose task returned.

    Raises:
        Exception: Whatever a loading task raised, re-raised by ``result()``.
    """
    collected_chunks: List[Document] = []
    collected_ids: List[str] = []
    finished_papers: List[str] = []
    total = len(papers)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each submitted task back to the paper it belongs to.
        task_to_paper = {}
        for paper_id, pdf_url, meta in papers:
            task = pool.submit(
                load_and_split_pdf,
                paper_id,
                pdf_url,
                meta,
                config,
                metadata_fields=metadata_fields,
                documents_dict=documents,
            )
            task_to_paper[task] = paper_id
        logger.info("Submitted %d PDF loading tasks", len(task_to_paper))

        completed = 0
        for task in concurrent.futures.as_completed(task_to_paper):
            completed += 1
            paper_id = task_to_paper[task]
            paper_chunks = task.result()
            chunk_ids = [
                f"{paper_id}_{position}" for position in range(len(paper_chunks))
            ]

            collected_chunks.extend(paper_chunks)
            collected_ids.extend(chunk_ids)
            finished_papers.append(paper_id)

            logger.info(
                "Progress: %d/%d - Loaded paper %s (%d chunks)",
                completed,
                total,
                paper_id,
                len(paper_chunks),
            )

    return collected_chunks, collected_ids, finished_papers
138
+
139
+
140
def _batch_embed(
    chunks: List[Document],
    ids: List[str],
    store: Any,
    batch_size: int,
    has_gpu: bool,
) -> None:
    """
    Insert chunks into the vector store in fixed-size batches.

    After every batch the store's collection (``store.col``) is flushed and its
    entity count, plus up to three sample ``paper_id`` values, are logged.

    Args:
        chunks: Document chunks to insert.
        ids: Primary-key IDs, positionally aligned with ``chunks``.
        store: Vector store exposing ``add_documents`` and a ``col`` collection.
        batch_size: Number of chunks per insert call; must be at least 1.
        has_gpu: Only used to label log messages as GPU or CPU.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. A negative value would
            otherwise produce an empty range and silently insert nothing.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()
    n = len(chunks)
    device = "GPU" if has_gpu else "CPU"
    total_batches = (n + batch_size - 1) // batch_size
    logger.info(
        "Starting BATCH EMBEDDING of %d chunks in batches of %d (%s)",
        n,
        batch_size,
        device,
    )

    for batch_num, start_idx in enumerate(range(0, n, batch_size), start=1):
        end_idx = min(start_idx + batch_size, n)
        logger.info(
            "Embedding batch %d/%d (chunks %d-%d of %d) - %s",
            batch_num,
            total_batches,
            start_idx + 1,
            end_idx,
            n,
            device,
        )

        store.add_documents(
            documents=chunks[start_idx:end_idx],
            ids=ids[start_idx:end_idx],
        )

        # Post-insert verification
        col = store.col
        col.flush()
        count = col.num_entities
        logger.info(
            "Post-insert batch %d: collection has %d entities",
            batch_num,
            count,
        )
        if count:
            samples = col.query(expr="", output_fields=["paper_id"], limit=3)
            logger.info(
                "Sample paper IDs: %s",
                [r.get("paper_id", "unknown") for r in samples],
            )

        logger.info("Successfully stored batch %d", batch_num)

    elapsed = time.perf_counter() - start
    logger.info(
        "BATCH EMBEDDING COMPLETE: %d chunks in %.2f seconds (%.2f chunks/sec)",
        n,
        elapsed,
        n / elapsed if elapsed > 0 else 0,
    )
@@ -0,0 +1,172 @@
1
+ """
2
+ Collection Manager for Milvus
3
+ """
4
+
5
+ import logging
6
+ import os
7
+ import threading
8
+ from typing import Any, Dict
9
+
10
+ from pymilvus import (
11
+ Collection,
12
+ CollectionSchema,
13
+ DataType,
14
+ FieldSchema,
15
+ connections,
16
+ utility,
17
+ )
18
+
19
# Set up logging with a level taken from the LOG_LEVEL environment variable.
# The name is upper-cased and validated so that a value such as "info" (which
# would resolve to the function logging.info) or an unknown name falls back to
# INFO instead of breaking the import of this module.
log_level = os.environ.get("LOG_LEVEL", "INFO")
_numeric_level = getattr(logging, log_level.upper(), None)
if not isinstance(_numeric_level, int):
    _numeric_level = logging.INFO
logging.basicConfig(level=_numeric_level)
logger = logging.getLogger(__name__)
logger.setLevel(_numeric_level)

# Global cache for collections to avoid repeated creation checks
_collection_cache = {}
_cache_lock = threading.Lock()
28
+
29
+
30
def ensure_collection_exists(
    collection_name: str, config: Any, index_params: Dict[str, Any], has_gpu: bool
) -> Collection:
    """
    Return the Milvus collection ``collection_name``, creating it if needed.

    A collection already held in the module-level cache is returned directly.
    Otherwise the collection is opened, or created with the PDF-chunk schema
    and an index on the ``embedding`` field, then loaded, logged and cached.

    Args:
        collection_name: Name of the Milvus collection.
        config: Configuration object; ``config.milvus.embedding_dim`` sets the
            vector dimension, with 768 used when ``config`` is falsy.
        index_params: Index parameters passed to ``create_index``; must
            contain an ``"index_type"`` entry.
        has_gpu: Only used to label the final log message.

    Raises:
        Exception: Any failure is logged with its traceback and re-raised.
    """
    # Fast path: a collection prepared earlier in this process.
    with _cache_lock:
        if collection_name in _collection_cache:
            logger.debug("Returning cached collection: %s", collection_name)
            return _collection_cache[collection_name]

    try:
        if collection_name in utility.list_collections():
            logger.info("Collection %s already exists. Loading it.", collection_name)
            collection = Collection(name=collection_name, using="default")
        else:
            logger.info(
                "Collection %s does not exist. Creating schema...",
                collection_name,
            )

            # Schema: primary key, vector, then the per-chunk metadata columns.
            schema = CollectionSchema(
                fields=[
                    FieldSchema(
                        name="id",
                        dtype=DataType.VARCHAR,
                        is_primary=True,
                        auto_id=False,
                        max_length=100,
                    ),
                    FieldSchema(
                        name="embedding",
                        dtype=DataType.FLOAT_VECTOR,
                        dim=config.milvus.embedding_dim if config else 768,
                    ),
                    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535),
                    FieldSchema(name="paper_id", dtype=DataType.VARCHAR, max_length=100),
                    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
                    FieldSchema(name="chunk_id", dtype=DataType.INT64),
                    FieldSchema(name="page", dtype=DataType.INT64),
                    FieldSchema(name="source", dtype=DataType.VARCHAR, max_length=512),
                ],
                description="RAG collection for embedded PDF chunks",
                enable_dynamic_field=True,
            )

            collection = Collection(
                name=collection_name,
                schema=schema,
                using="default",
                shards_num=2,
            )
            logger.info("Created collection: %s", collection_name)

            # Index the embedding field with the caller-supplied parameters.
            logger.info(
                "Creating %s index on 'embedding' field for collection: %s",
                index_params["index_type"],
                collection_name,
            )
            collection.create_index(field_name="embedding", index_params=index_params)
            created_index_type = index_params["index_type"]
            logger.info(
                "Successfully created %s index on 'embedding' field for collection: %s",
                created_index_type,
                collection_name,
            )

        collection.load()

        # Troubleshooting dump of the collection state.
        logger.info("=== DEBUG COLLECTION STATE ===")
        logger.info("Collection name: %s", collection_name)
        logger.info("Collection schema: %s", collection.schema)
        logger.info("Collection num_entities: %d", collection.num_entities)
        indexed_fields = [index.field_name for index in collection.indexes]
        logger.info("Collection indexes: %s", indexed_fields)
        logger.info("Collection statistics: %s", collection.num_entities)
        logger.info("Active connections: %s", connections.list_connections())
        logger.info("=== END DEBUG ===")

        # Summary line including the GPU/CPU label.
        entity_count = collection.num_entities
        if has_gpu:
            device_note = " (GPU accelerated)"
        else:
            device_note = " (CPU only)"
        logger.info(
            "Collection %s is loaded and ready with %d entities%s",
            collection_name,
            entity_count,
            device_note,
        )

        # Remember the collection for later calls.
        with _cache_lock:
            _collection_cache[collection_name] = collection
            logger.debug("Cached collection: %s", collection_name)

        return collection

    except Exception as e:
        logger.error("Failed to ensure collection exists: %s", e, exc_info=True)
        raise
@@ -0,0 +1,76 @@
1
+ """
2
+ Document processing utilities for loading and splitting PDFs.
3
+ """
4
+
5
+ import logging
6
+ from typing import Any, Dict, List
7
+
8
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
9
+ from langchain_community.document_loaders import PyPDFLoader
10
+ from langchain_core.documents import Document
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
def load_and_split_pdf(
    paper_id: str,
    pdf_url: str,
    paper_metadata: Dict[str, Any],
    config: Any,
    **kwargs: Any,
) -> List[Document]:
    """
    Load a PDF and split it into chunks.

    Args:
        paper_id: Unique identifier for the paper.
        pdf_url: URL to the PDF.
        paper_metadata: Metadata about the paper (e.g. Title, Authors, etc.).
        config: Configuration object with `chunk_size` and `chunk_overlap` attributes.
        metadata_fields: List of additional metadata keys to propagate into each
            chunk (passed via kwargs, required).
        documents_dict: Dictionary where split chunks will also be stored under keys
            of the form "{paper_id}_{chunk_index}" (passed via kwargs, required).

    Returns:
        A list of Document chunks, each with updated metadata.

    Raises:
        KeyError: If `metadata_fields` or `documents_dict` is missing from kwargs.
        ValueError: If `config` is None. This is checked before the PDF is
            loaded so that no download or parsing work is wasted.
    """
    metadata_fields: List[str] = kwargs["metadata_fields"]
    documents_dict: Dict[str, Document] = kwargs["documents_dict"]

    # Fail fast: the splitter cannot be built without a config, so there is no
    # point in fetching and parsing the PDF first.
    if config is None:
        raise ValueError("Configuration is required for text splitting in Vectorstore.")

    logger.info("Loading PDF for paper %s from %s", paper_id, pdf_url)

    # Load pages
    documents = PyPDFLoader(pdf_url).load()
    logger.info("Loaded %d pages from paper %s", len(documents), paper_id)

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=config.chunk_size,
        chunk_overlap=config.chunk_overlap,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    # Split into chunks
    chunks = splitter.split_documents(documents)
    logger.info("Split paper %s into %d chunks", paper_id, len(chunks))

    # The title is the same for every chunk of this paper.
    title = paper_metadata.get("Title", "Unknown")

    # Attach metadata & populate documents_dict
    for i, chunk in enumerate(chunks):
        chunk.metadata.update(
            {
                "paper_id": paper_id,
                "title": title,
                "chunk_id": i,
                "page": chunk.metadata.get("page", 0),
                "source": pdf_url,
            }
        )
        # Extra fields never overwrite metadata the chunk already carries.
        for field in metadata_fields:
            if field in paper_metadata and field not in chunk.metadata:
                chunk.metadata[field] = paper_metadata[field]
        documents_dict[f"{paper_id}_{i}"] = chunk

    return chunks
@@ -17,6 +17,20 @@ logger = logging.getLogger(__name__)
17
17
  logger.setLevel(getattr(logging, log_level))
18
18
 
19
19
 
20
def load_hydra_config() -> Any:
    """
    Compose the Hydra configuration and return the Q&A tool section.

    Returns:
        The ``tools.question_and_answer`` node of the composed ``config``.
    """
    selected_overrides = ["tools/question_and_answer=default"]
    with hydra.initialize(version_base=None, config_path="../../../configs"):
        composed = hydra.compose(config_name="config", overrides=selected_overrides)
        qa_config = composed.tools.question_and_answer
        logger.debug("Loaded Question and Answer tool configuration.")
    return qa_config
32
+
33
+
20
34
  def _build_context_and_sources(
21
35
  retrieved_chunks: List[Document],
22
36
  ) -> tuple[str, set[str]]:
@@ -45,20 +59,6 @@ def _build_context_and_sources(
45
59
  return context, sources
46
60
 
47
61
 
48
- def load_hydra_config() -> Any:
49
- """
50
- Load the configuration using Hydra and return the configuration for the Q&A tool.
51
- """
52
- with hydra.initialize(version_base=None, config_path="../../../configs"):
53
- cfg = hydra.compose(
54
- config_name="config",
55
- overrides=["tools/question_and_answer=default"],
56
- )
57
- config = cfg.tools.question_and_answer
58
- logger.debug("Loaded Question and Answer tool configuration.")
59
- return config
60
-
61
-
62
62
  def generate_answer(
63
63
  question: str,
64
64
  retrieved_chunks: List[Document],
@@ -0,0 +1,63 @@
1
+ """
2
+ Create or retrieve a Vectorstore instance for PDF RAG.
3
+ """
4
+
5
+ import logging
6
+ import threading
7
+ from typing import Any
8
+
9
+ from langchain_core.embeddings import Embeddings
10
+
11
+ from .vector_store import Vectorstore
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Global cache for Vectorstore instances
16
+ _vectorstore_cache = {}
17
+ _cache_lock = threading.Lock()
18
+
19
+
20
def get_vectorstore(
    embedding_model: Embeddings, config: Any, force_new: bool = False
) -> "Vectorstore":
    """
    Return the cached Vectorstore for the configured collection, creating it
    on first use.

    Instances are cached per collection name. When a cached instance is
    reused with a different embedding model, the instance and its inner
    vector store are switched to the new model.

    Args:
        embedding_model: The embedding model to use
        config: Configuration object; when falsy the collection name
            "pdf_rag_documents" is used
        force_new: Discard any cached instance and build a fresh one

    Returns:
        Vectorstore instance
    """
    if config:
        collection_name = config.milvus.collection_name
    else:
        collection_name = "pdf_rag_documents"

    with _cache_lock:
        # Drop the cached entry first when a fresh instance is requested.
        if force_new and collection_name in _vectorstore_cache:
            _vectorstore_cache.pop(collection_name)
            logger.info(
                "Forced new Vectorstore instance for collection: %s", collection_name
            )

        if collection_name in _vectorstore_cache:
            logger.info(
                "Reusing existing Vectorstore instance for collection: %s",
                collection_name,
            )
            cached = _vectorstore_cache[collection_name]
            # Keep the reused instance in sync with the requested model.
            if cached.embedding_model != embedding_model:
                logger.warning("Embedding model changed, updating existing instance")
                cached.embedding_model = embedding_model
                cached.vector_store.embedding_function = embedding_model
            return cached

        logger.info(
            "Creating new Vectorstore instance for collection: %s", collection_name
        )
        created = Vectorstore(embedding_model=embedding_model, config=config)
        _vectorstore_cache[collection_name] = created
        return created