aiagents4pharma 1.40.0__py3-none-any.whl → 1.41.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +4 -0
  2. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +44 -4
  3. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker.py +127 -0
  4. aiagents4pharma/talk2scholars/tests/test_pdf_answer_formatter.py +66 -0
  5. aiagents4pharma/talk2scholars/tests/test_pdf_batch_processor.py +101 -0
  6. aiagents4pharma/talk2scholars/tests/test_pdf_collection_manager.py +150 -0
  7. aiagents4pharma/talk2scholars/tests/test_pdf_document_processor.py +69 -0
  8. aiagents4pharma/talk2scholars/tests/test_pdf_generate_answer.py +75 -0
  9. aiagents4pharma/talk2scholars/tests/test_pdf_gpu_detection.py +140 -0
  10. aiagents4pharma/talk2scholars/tests/test_pdf_paper_loader.py +116 -0
  11. aiagents4pharma/talk2scholars/tests/test_pdf_rag_pipeline.py +98 -0
  12. aiagents4pharma/talk2scholars/tests/test_pdf_retrieve_chunks.py +197 -0
  13. aiagents4pharma/talk2scholars/tests/test_pdf_singleton_manager.py +156 -0
  14. aiagents4pharma/talk2scholars/tests/test_pdf_vector_normalization.py +121 -0
  15. aiagents4pharma/talk2scholars/tests/test_pdf_vector_store.py +434 -0
  16. aiagents4pharma/talk2scholars/tests/test_question_and_answer_tool.py +89 -509
  17. aiagents4pharma/talk2scholars/tests/test_tool_helper_utils.py +34 -89
  18. aiagents4pharma/talk2scholars/tools/paper_download/download_biorxiv_input.py +8 -6
  19. aiagents4pharma/talk2scholars/tools/paper_download/download_medrxiv_input.py +6 -4
  20. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +74 -40
  21. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +26 -1
  22. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  23. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +200 -0
  24. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  25. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  26. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +14 -14
  27. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +63 -0
  28. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +154 -0
  29. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +60 -40
  30. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  31. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +122 -0
  32. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +162 -40
  33. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  34. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +40 -78
  35. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +159 -0
  36. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +277 -96
  37. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +12 -9
  38. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +0 -1
  39. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +9 -8
  40. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +5 -5
  41. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/METADATA +27 -115
  42. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/RECORD +45 -23
  43. aiagents4pharma/talk2scholars/tests/test_nvidia_nim_reranker_utils.py +0 -28
  44. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/WHEEL +0 -0
  45. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/licenses/LICENSE +0 -0
  46. {aiagents4pharma-1.40.0.dist-info → aiagents4pharma-1.41.0.dist-info}/top_level.txt +0 -0
--- a/aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py
+++ b/aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py
@@ -1,5 +1,8 @@
 """
-Vectorstore class for managing document embeddings and retrieval.
+Vectorstore class for managing PDF embeddings with Milvus.
+Manages GPU normalization and similarity search and MMR operations.
+With automatic handling of COSINE to IP conversion for GPU compatibility.
+Supports both GPU and CPU configurations.
 """
 
 import logging
@@ -7,13 +10,18 @@ import os
 import time
 from typing import Any, Dict, List, Optional
 
-from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.document_loaders import PyPDFLoader
-from langchain_community.vectorstores import FAISS
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
-from langchain_core.vectorstores import VectorStore
+from langchain_milvus import Milvus
 
+from .collection_manager import ensure_collection_exists
+from .gpu_detection import (
+    detect_nvidia_gpu,
+    get_optimal_index_config,
+    log_index_configuration,
+)
+from .singleton_manager import VectorstoreSingleton
+from .vector_normalization import wrap_embedding_model_if_needed
 
 # Set up logging with configurable level
 log_level = os.environ.get("LOG_LEVEL", "INFO")
@@ -24,8 +32,8 @@ logger.setLevel(getattr(logging, log_level))
 
 class Vectorstore:
     """
-    A class for managing document embeddings and retrieval.
-    Provides unified access to documents across multiple papers.
+    Enhanced Vectorstore class with GPU normalization support.
+    Automatically handles COSINE -> IP conversion for GPU compatibility.
     """
 
     def __init__(
@@ -35,13 +43,13 @@ class Vectorstore:
         config: Any = None,
     ):
         """
-        Initialize the document store.
+        Initialize the document store with Milvus and GPU optimization.
 
         Args:
            embedding_model: The embedding model to use
-           metadata_fields: Fields to include in document metadata for filtering/retrieval
+           metadata_fields: Fields to include in document metadata
+           config: Configuration object containing Milvus connection details
         """
-        self.embedding_model = embedding_model
         self.config = config
         self.metadata_fields = metadata_fields or [
            "title",
@@ -50,113 +58,286 @@ class Vectorstore:
            "chunk_id",
        ]
        self.initialization_time = time.time()
-        logger.info("Vectorstore initialized at: %s", self.initialization_time)
+
+        # GPU detection with config override (SINGLE CALL)
+        self.has_gpu = detect_nvidia_gpu(config)
+
+        # Additional check for force CPU mode
+        if (
+            config
+            and hasattr(config, "gpu_detection")
+            and getattr(config.gpu_detection, "force_cpu_mode", False)
+        ):
+            logger.info("Running in forced CPU mode (config override)")
+            self.has_gpu = False
+
+        # Determine if we want to use COSINE similarity
+        self.use_cosine = True  # Default preference
+        if config and hasattr(config, "similarity_metric"):
+            self.use_cosine = getattr(config.similarity_metric, "use_cosine", True)
+
+        # Wrap embedding model with normalization if needed for GPU
+        self.original_embedding_model = embedding_model
+        self.embedding_model = wrap_embedding_model_if_needed(
+            embedding_model, self.has_gpu, self.use_cosine
+        )
+
+        # Configure index parameters AFTER determining GPU usage and normalization
+        embedding_dim = config.milvus.embedding_dim if config else 768
+        self.index_params, self.search_params = get_optimal_index_config(
+            self.has_gpu, embedding_dim, self.use_cosine
+        )
+
+        # Log the configuration
+        log_index_configuration(self.index_params, self.search_params, self.use_cosine)
 
        # Track loaded papers to prevent duplicate loading
        self.loaded_papers = set()
-        self.vector_store_class = FAISS
-        logger.info("Using FAISS vector store")
 
-        # Store for initialized documents
+        # Initialize Milvus connection parameters with environment variable fallback
+        self.connection_args = {
+            "host": (
+                config.milvus.host if config else os.getenv("MILVUS_HOST", "127.0.0.1")
+            ),
+            "port": (
+                config.milvus.port if config else int(os.getenv("MILVUS_PORT", "19530"))
+            ),
+        }
+        # Log the connection parameters being used
+        logger.info(
+            "Using Milvus connection: %s:%s",
+            self.connection_args["host"],
+            self.connection_args["port"],
+        )
+        self.collection_name = (
+            config.milvus.collection_name if config else "pdf_rag_documents"
+        )
+        self.db_name = config.milvus.db_name if config else "pdf_rag_db"
+
+        # Get singleton instance
+        self._singleton = VectorstoreSingleton()
+
+        # Connect to Milvus (reuses existing connection if available)
+        self._connect_milvus()
+
+        # Create collection with proper metric type
+        self.collection = ensure_collection_exists(
+            self.collection_name, self.config, self.index_params, self.has_gpu
+        )
+
+        # Initialize the LangChain Milvus vector store
+        self.vector_store = self._initialize_vector_store()
+
+        # Load existing papers AFTER vector store is ready
+        self._load_existing_paper_ids()
+
+        # CRITICAL: Load collection into memory/GPU after any existing data is identified
+        logger.info(
+            "Calling _ensure_collection_loaded() for %s processing...",
+            "GPU" if self.has_gpu else "CPU",
+        )
+        self._ensure_collection_loaded()
+
+        # Store for document metadata (keeping for compatibility)
        self.documents: Dict[str, Document] = {}
-        self.vector_store: Optional[VectorStore] = None
        self.paper_metadata: Dict[str, Dict[str, Any]] = {}
-        # Cache for document chunk embeddings to avoid recomputation
-        self.embeddings: Dict[str, Any] = {}
 
-    def add_paper(
-        self,
-        paper_id: str,
-        pdf_url: str,
-        paper_metadata: Dict[str, Any],
-    ) -> None:
-        """
-        Add a paper to the document store.
+        # Log final configuration
+        metric_info = (
+            "IP (normalized for COSINE)"
+            if self.has_gpu and self.use_cosine
+            else self.index_params["metric_type"]
+        )
 
-        Args:
-            paper_id: Unique identifier for the paper
-            pdf_url: URL to the PDF
-            paper_metadata: Metadata about the paper
-        """
-        # Skip if already loaded
-        if paper_id in self.loaded_papers:
-            logger.info("Paper %s already loaded, skipping", paper_id)
-            return
+        logger.info(
+            "Milvus vector store initialized with collection: %s (GPU: %s, Metric: %s)",
+            self.collection_name,
+            "enabled" if self.has_gpu else "disabled",
+            metric_info,
+        )
+
+    def _connect_milvus(self) -> None:
+        """Establish connection to Milvus server using singleton."""
+        self._singleton.get_connection(
+            self.connection_args["host"], self.connection_args["port"], self.db_name
+        )
+
+    def _initialize_vector_store(self) -> Milvus:
+        """Initialize or load the Milvus vector store with proper embedding model."""
+        # Use the wrapped embedding model (with normalization if needed)
+        vector_store = self._singleton.get_vector_store(
+            self.collection_name, self.embedding_model, self.connection_args
+        )
+
+        return vector_store
 
-        logger.info("Loading paper %s from %s", paper_id, pdf_url)
+    def _load_existing_paper_ids(self):
+        """Load already embedded paper IDs using LangChain's collection access."""
+        logger.info("Checking for existing papers via LangChain collection...")
 
-        # Store paper metadata
-        self.paper_metadata[paper_id] = paper_metadata
+        # Access the collection through LangChain's wrapper
+        langchain_collection = getattr(self.vector_store, "col", None)
 
-        # Load the PDF and split into chunks according to Hydra config
-        loader = PyPDFLoader(pdf_url)
-        documents = loader.load()
-        logger.info("Loaded %d pages from %s", len(documents), paper_id)
+        if langchain_collection is None:
+            langchain_collection = getattr(self.vector_store, "collection", None)
 
-        # Create text splitter according to provided configuration
-        if self.config is None:
-            raise ValueError(
-                "Configuration is required for text splitting in Vectorstore."
+        if langchain_collection is None:
+            logger.warning(
+                "No LangChain collection found, proceeding with empty loaded_papers"
            )
-        splitter = RecursiveCharacterTextSplitter(
-            chunk_size=self.config.chunk_size,
-            chunk_overlap=self.config.chunk_overlap,
-            separators=["\n\n", "\n", ". ", " ", ""],
-        )
+            return
+
+        # Force flush and check entity count
+        langchain_collection.flush()
+        num_entities = langchain_collection.num_entities
 
-        # Split documents and add metadata for each chunk
-        chunks = splitter.split_documents(documents)
-        logger.info("Split %s into %d chunks", paper_id, len(chunks))
-        # Embed and cache chunk embeddings
-        chunk_texts = [chunk.page_content for chunk in chunks]
-        chunk_embeddings = self.embedding_model.embed_documents(chunk_texts)
-        logger.info("Embedded %d chunks for paper %s", len(chunk_embeddings), paper_id)
-
-        # Enhance document metadata
-        for i, chunk in enumerate(chunks):
-            # Add paper metadata to each chunk
-            chunk.metadata.update(
-                {
-                    "paper_id": paper_id,
-                    "title": paper_metadata.get("Title", "Unknown"),
-                    "chunk_id": i,
-                    # Keep existing page number if available
-                    "page": chunk.metadata.get("page", 0),
-                }
+        logger.info("LangChain collection entity count: %d", num_entities)
+
+        if num_entities > 0:
+            logger.info("Loading existing paper IDs from LangChain collection...")
+
+            results = langchain_collection.query(
+                expr="",  # No filter - get all
+                output_fields=["paper_id"],
+                limit=16384,  # Max limit
+                consistency_level="Strong",
            )
 
-            # Add any additional metadata fields
-            for field in self.metadata_fields:
-                if field in paper_metadata and field not in chunk.metadata:
-                    chunk.metadata[field] = paper_metadata[field]
+            # Extract unique paper IDs
+            existing_paper_ids = set(result["paper_id"] for result in results)
+            self.loaded_papers.update(existing_paper_ids)
 
-            # Store chunk
-            doc_id = f"{paper_id}_{i}"
-            self.documents[doc_id] = chunk
-            # Cache embedding if available
-            if chunk_embeddings[i] is not None:
-                self.embeddings[doc_id] = chunk_embeddings[i]
+            logger.info("Found %d unique papers in collection", len(existing_paper_ids))
+        else:
+            logger.info("Collection is empty - no existing papers")
 
-        # Mark as loaded to prevent duplicate loading
-        self.loaded_papers.add(paper_id)
-        logger.info("Added %d chunks from paper %s", len(chunks), paper_id)
+    def similarity_search(self, query: str, **kwargs: Any) -> List[Document]:
+        """
+        Perform similarity search on the vector store.
+        Query embedding will be automatically normalized if using GPU with COSINE.
+        Keyword args:
+            k: int = 4
+            filter: Optional[Dict[str, Any]] = None
+            plus any other kwargs to pass through to the underlying vector_store.
+        """
+        # Extract our parameters
+        k: int = kwargs.pop("k", 4)
+        filter_: Optional[Dict[str, Any]] = kwargs.pop("filter", None)
+
+        # Build Milvus expr from filter_, if present
+        expr = None
+        if filter_:
+            conditions = []
+            for key, value in filter_.items():
+                if isinstance(value, str):
+                    conditions.append(f'{key} == "{value}"')
+                elif isinstance(value, list):
+                    vals = ", ".join(
+                        f'"{v}"' if isinstance(v, str) else str(v) for v in value
+                    )
+                    conditions.append(f"{key} in [{vals}]")
+                else:
+                    conditions.append(f"{key} == {value}")
+            expr = " and ".join(conditions)
+
+        # Delegate to the wrapped store
+        return self.vector_store.similarity_search(
+            query=query, k=k, expr=expr, **kwargs
+        )
 
-    def build_vector_store(self) -> None:
+    def max_marginal_relevance_search(
+        self, query: str, **kwargs: Any
+    ) -> List[Document]:
        """
-        Build the vector store from all loaded documents.
-        Should be called after all papers are added.
+        Perform MMR search on the vector store.
+        Query embedding will be automatically normalized if using GPU with COSINE.
+        Keyword args:
+            k: int = 4
+            fetch_k: int = 20
+            lambda_mult: float = 0.5
+            filter: Optional[Dict[str, Any]] = None
+            plus any other kwargs to pass through.
        """
-        if not self.documents:
-            logger.warning("No documents added to build vector store")
-            return
+        # Extract our parameters
+        k: int = kwargs.pop("k", 4)
+        fetch_k: int = kwargs.pop("fetch_k", 20)
+        lambda_mult: float = kwargs.pop("lambda_mult", 0.5)
+        filter_: Optional[Dict[str, Any]] = kwargs.pop("filter", None)
 
-        if self.vector_store is not None:
-            logger.info("Vector store already built, skipping")
-            return
+        # Build Milvus expr from filter_, if present
+        expr = None
+        if filter_:
+            conditions = []
+            for key, value in filter_.items():
+                if isinstance(value, str):
+                    conditions.append(f'{key} == "{value}"')
+                elif isinstance(value, list):
+                    vals = ", ".join(
+                        f'"{v}"' if isinstance(v, str) else str(v) for v in value
+                    )
+                    conditions.append(f"{key} in [{vals}]")
+                else:
+                    conditions.append(f"{key} == {value}")
+            expr = " and ".join(conditions)
 
-        # Create vector store from documents
-        documents_list = list(self.documents.values())
-        self.vector_store = self.vector_store_class.from_documents(
-            documents=documents_list, embedding=self.embedding_model
+        # Delegate to the wrapped store
+        return self.vector_store.max_marginal_relevance_search(
+            query=query,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            expr=expr,
+            **kwargs,
        )
-        logger.info("Built vector store with %d documents", len(documents_list))
+
+    def _ensure_collection_loaded(self):
+        """Ensure collection is loaded into memory/GPU after data insertion."""
+        # Get the collection
+        collection = getattr(self.vector_store, "col", None)
+        if collection is None:
+            collection = getattr(self.vector_store, "collection", None)
+
+        if collection is None:
+            logger.warning("Cannot access collection for loading")
+            return
+
+        # Force flush to ensure we see all data
+        logger.info("Flushing collection to ensure data visibility...")
+        collection.flush()
+
+        # Check entity count after flush
+        num_entities = collection.num_entities
+        logger.info("Collection entity count after flush: %d", num_entities)
+
+        if num_entities > 0:
+            hardware_type = "GPU" if self.has_gpu else "CPU"
+            logger.info(
+                "Loading collection with %d entities into %s memory...",
+                num_entities,
+                hardware_type,
+            )
+
+            # Load collection into memory (CPU or GPU)
+            collection.load()
+
+            # Verify loading was successful
+            final_count = collection.num_entities
+            logger.info(
+                "Collection successfully loaded into %s memory with %d entities",
+                hardware_type,
+                final_count,
+            )
+        else:
+            logger.info("Collection is empty, skipping load operation")
+
+    def get_embedding_info(self) -> Dict[str, Any]:
+        """Get information about the embedding configuration."""
+        return {
+            "has_gpu": self.has_gpu,
+            "use_cosine": self.use_cosine,
+            "metric_type": self.index_params["metric_type"],
+            "index_type": self.index_params["index_type"],
+            "normalization_enabled": hasattr(self.embedding_model, "normalize_for_gpu"),
+            "original_model_type": type(self.original_embedding_model).__name__,
+            "wrapped_model_type": type(self.embedding_model).__name__,
+        }
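
The key mechanism in the rewritten vector_store.py lives in the `wrap_embedding_model_if_needed` helper imported above: Milvus GPU indexes generally accept the IP (inner product) metric but not COSINE, so when a GPU is detected and COSINE is requested, embeddings are L2-normalized and indexed under IP instead. For unit vectors the two metrics agree, which is the whole trick. A minimal sketch of that identity (illustrative code, not part of the package):

    import numpy as np

    # Illustrative only: after L2 normalization, inner product equals
    # cosine similarity, so an IP-metric index preserves COSINE rankings.
    def l2_normalize(v: np.ndarray) -> np.ndarray:
        """Scale a vector to unit length."""
        return v / np.linalg.norm(v)

    rng = np.random.default_rng(0)
    a, b = rng.random(768), rng.random(768)

    cosine = (a @ b) / (np.linalg.norm(a) * np.linalg.norm(b))
    ip_normalized = l2_normalize(a) @ l2_normalize(b)
    assert np.isclose(cosine, ip_normalized)

The `filter` dict accepted by `similarity_search` and `max_marginal_relevance_search` is translated into a Milvus boolean `expr` the same way in both methods; for example, `{"paper_id": ["p1", "p2"], "page": 3}` becomes the expression `paper_id in ["p1", "p2"] and page == 3`.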
--- a/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py
@@ -65,25 +65,28 @@ def get_multi_paper_recommendations(
     year: Optional[str] = None,
 ) -> Command[Any]:
     """
-    Return recommended papers based on multiple Semantic Scholar paper IDs.
+    Recommend related research papers using the Semantic Scholar API.
 
-    This tool accepts a list of Semantic Scholar paper IDs and returns a set of
-    recommended papers by aggregating related works (citations and references)
-    from each input paper.
+    This tool is designed to suggest relevant papers based on a list of
+    input Semantic Scholar paper IDs.
+
+    It fetches citations and references for each input paper and aggregates
+    them to generate a set of
+    recommended papers.
 
     Args:
         paper_ids (List[str]): List of 40-character Semantic Scholar paper IDs.
-            Provide at least two IDs.
+            Provide at least two IDs to improve the relevance of recommendations.
         tool_call_id (str): Internal tool call identifier injected by the system.
-        limit (int, optional): Maximum total number of recommendations to return. Defaults to 10.
-        year (str, optional): Publication year filter; supports formats: 'YYYY',
-            'YYYY-', '-YYYY', 'YYYY:YYYY'. Defaults to None.
+        limit (int, optional): Maximum number of recommendations to return. Defaults to 10.
+        year (str, optional): Filter recommendations by publication year.
+            Supports formats: 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'. Defaults to None.
 
     Returns:
         Command: A Command object containing:
            - multi_papers: List of recommended papers.
            - last_displayed_papers: Same list for display purposes.
-           - messages: List containing a ToolMessage with recommendations details.
+           - messages: List containing a ToolMessage with recommendation details.
     """
     # Create recommendation data object to organize variables
     rec_data = MultiPaperRecData(paper_ids, limit, year, tool_call_id)
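
The `year` filter formats documented here ('YYYY', 'YYYY-', '-YYYY', 'YYYY:YYYY') are shared with the single-paper tool below. A hypothetical validator sketch (not part of the package) that accepts exactly these four forms:

    import re

    # Hypothetical helper: matches 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'.
    YEAR_FILTER = re.compile(r"^(\d{4}|\d{4}-|-\d{4}|\d{4}:\d{4})$")

    def is_valid_year_filter(year: str) -> bool:
        return bool(YEAR_FILTER.match(year))

    assert is_valid_year_filter("2020")       # exactly 2020
    assert is_valid_year_filter("2020-")      # 2020 or later
    assert is_valid_year_filter("-2020")      # up to 2020
    assert is_valid_year_filter("2018:2022")  # inclusive range
    assert not is_valid_year_filter("20")     # too short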
--- a/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py
@@ -87,7 +87,6 @@ class QueryDataFrameInput(BaseModel):
     "query_dataframe",
     args_schema=QueryDataFrameInput,
     parse_docstring=True,
-    return_direct=True,
 )
 def query_dataframe(
     question: str,
--- a/aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py
@@ -50,22 +50,23 @@ def retrieve_semantic_scholar_paper_id(
     tool_call_id: str,
 ) -> Command[Any]:
     """
-    Search for a paper by title on Semantic Scholar and return its unique paper ID.
+    Retrieve a Semantic Scholar paper ID using a paper title.
 
-    This tool issues a GET request to the Semantic Scholar API to find the best match
-    for the given paper title, then returns the paper's Semantic Scholar ID.
+    This tool searches Semantic Scholar for the best match to the provided paper title
+    and returns the corresponding unique paper ID. It is intended to support downstream
+    tasks such as recommendations, metadata lookups, or citation graph queries.
 
-    Use when you have a known title (full or partial) and need the Semantic Scholar ID
-    to fetch additional metadata or perform downstream lookups. Do not use this tool
-    for broad literature searches; for general search use the `search` tool.
+    Use this tool when you know the full or partial title of a paper and need its
+    Semantic Scholar ID.
+    For broad literature searches or topic-based queries, use a general `search` tool instead.
 
     Args:
-        paper_title (str): The title of the paper to look up.
+        paper_title (str): The full or partial title of the paper to look up.
         tool_call_id (str): LangGraph-injected identifier for this tool call.
 
     Returns:
         Command: A structured response containing a ToolMessage whose content is
-        the Semantic Scholar paper ID string (e.g., 'abc123xyz').
+            the Semantic Scholar paper ID string (e.g., 'abc123xyz').
 
     Raises:
         ValueError: If no matching paper is found for the given title.
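
The behavior the docstring describes (a best-match title search returning a paper ID) maps onto the public Semantic Scholar Graph API. A minimal standalone sketch of such a lookup (illustrative; the package's own request code is not part of this diff):

    import requests

    def lookup_paper_id(paper_title: str) -> str:
        # Query the public Graph API for the single best title match.
        response = requests.get(
            "https://api.semanticscholar.org/graph/v1/paper/search",
            params={"query": paper_title, "fields": "paperId,title", "limit": 1},
            timeout=10,
        )
        response.raise_for_status()
        data = response.json().get("data", [])
        if not data:
            raise ValueError(f"No matching paper found for: {paper_title}")
        return data[0]["paperId"]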
--- a/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
+++ b/aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py
@@ -60,17 +60,17 @@ def get_single_paper_recommendations(
     year: Optional[str] = None,
 ) -> Command[Any]:
     """
-    Return recommended papers for a single Semantic Scholar paper ID.
+    Recommend related research papers using the Semantic Scholar API for a single paper ID.
 
-    This tool accepts a single Semantic Scholar paper ID and returns related works
-    by aggregating citations and references.
+    This tool is designed to suggest relevant papers based on one input Semantic Scholar paper ID.
+    It fetches citations and references for the given paper and returns a set of recommended works.
 
     Args:
         paper_id (str): 40-character Semantic Scholar paper ID.
         tool_call_id (str): Internal tool call identifier injected by the system.
         limit (int, optional): Maximum number of recommendations to return. Defaults to 5.
-        year (str, optional): Publication year filter; supports 'YYYY', 'YYYY-',
-            '-YYYY', 'YYYY:YYYY'. Defaults to None.
+        year (str, optional): Filter recommendations by publication year.
+            Supports formats: 'YYYY', 'YYYY-', '-YYYY', or 'YYYY:YYYY'. Defaults to None.
 
     Returns:
         Command: A Command object containing: