aiagents4pharma-0.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (336)
  1. aiagents4pharma/__init__.py +11 -0
  2. aiagents4pharma/talk2aiagents4pharma/.dockerignore +13 -0
  3. aiagents4pharma/talk2aiagents4pharma/Dockerfile +133 -0
  4. aiagents4pharma/talk2aiagents4pharma/README.md +1 -0
  5. aiagents4pharma/talk2aiagents4pharma/__init__.py +5 -0
  6. aiagents4pharma/talk2aiagents4pharma/agents/__init__.py +6 -0
  7. aiagents4pharma/talk2aiagents4pharma/agents/main_agent.py +70 -0
  8. aiagents4pharma/talk2aiagents4pharma/configs/__init__.py +5 -0
  9. aiagents4pharma/talk2aiagents4pharma/configs/agents/__init__.py +5 -0
  10. aiagents4pharma/talk2aiagents4pharma/configs/agents/main_agent/default.yaml +29 -0
  11. aiagents4pharma/talk2aiagents4pharma/configs/app/__init__.py +0 -0
  12. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/__init__.py +0 -0
  13. aiagents4pharma/talk2aiagents4pharma/configs/app/frontend/default.yaml +102 -0
  14. aiagents4pharma/talk2aiagents4pharma/configs/config.yaml +4 -0
  15. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/.env.example +23 -0
  16. aiagents4pharma/talk2aiagents4pharma/docker-compose/cpu/docker-compose.yml +93 -0
  17. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/.env.example +23 -0
  18. aiagents4pharma/talk2aiagents4pharma/docker-compose/gpu/docker-compose.yml +108 -0
  19. aiagents4pharma/talk2aiagents4pharma/install.md +154 -0
  20. aiagents4pharma/talk2aiagents4pharma/states/__init__.py +5 -0
  21. aiagents4pharma/talk2aiagents4pharma/states/state_talk2aiagents4pharma.py +18 -0
  22. aiagents4pharma/talk2aiagents4pharma/tests/__init__.py +3 -0
  23. aiagents4pharma/talk2aiagents4pharma/tests/test_main_agent.py +312 -0
  24. aiagents4pharma/talk2biomodels/.dockerignore +13 -0
  25. aiagents4pharma/talk2biomodels/Dockerfile +104 -0
  26. aiagents4pharma/talk2biomodels/README.md +1 -0
  27. aiagents4pharma/talk2biomodels/__init__.py +5 -0
  28. aiagents4pharma/talk2biomodels/agents/__init__.py +6 -0
  29. aiagents4pharma/talk2biomodels/agents/t2b_agent.py +104 -0
  30. aiagents4pharma/talk2biomodels/api/__init__.py +5 -0
  31. aiagents4pharma/talk2biomodels/api/ols.py +75 -0
  32. aiagents4pharma/talk2biomodels/api/uniprot.py +36 -0
  33. aiagents4pharma/talk2biomodels/configs/__init__.py +5 -0
  34. aiagents4pharma/talk2biomodels/configs/agents/__init__.py +5 -0
  35. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/__init__.py +3 -0
  36. aiagents4pharma/talk2biomodels/configs/agents/t2b_agent/default.yaml +14 -0
  37. aiagents4pharma/talk2biomodels/configs/app/__init__.py +0 -0
  38. aiagents4pharma/talk2biomodels/configs/app/frontend/__init__.py +0 -0
  39. aiagents4pharma/talk2biomodels/configs/app/frontend/default.yaml +72 -0
  40. aiagents4pharma/talk2biomodels/configs/config.yaml +7 -0
  41. aiagents4pharma/talk2biomodels/configs/tools/__init__.py +5 -0
  42. aiagents4pharma/talk2biomodels/configs/tools/ask_question/__init__.py +3 -0
  43. aiagents4pharma/talk2biomodels/configs/tools/ask_question/default.yaml +30 -0
  44. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/__init__.py +3 -0
  45. aiagents4pharma/talk2biomodels/configs/tools/custom_plotter/default.yaml +8 -0
  46. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/__init__.py +3 -0
  47. aiagents4pharma/talk2biomodels/configs/tools/get_annotation/default.yaml +8 -0
  48. aiagents4pharma/talk2biomodels/install.md +63 -0
  49. aiagents4pharma/talk2biomodels/models/__init__.py +5 -0
  50. aiagents4pharma/talk2biomodels/models/basico_model.py +125 -0
  51. aiagents4pharma/talk2biomodels/models/sys_bio_model.py +60 -0
  52. aiagents4pharma/talk2biomodels/states/__init__.py +6 -0
  53. aiagents4pharma/talk2biomodels/states/state_talk2biomodels.py +49 -0
  54. aiagents4pharma/talk2biomodels/tests/BIOMD0000000449_url.xml +1585 -0
  55. aiagents4pharma/talk2biomodels/tests/__init__.py +3 -0
  56. aiagents4pharma/talk2biomodels/tests/article_on_model_537.pdf +0 -0
  57. aiagents4pharma/talk2biomodels/tests/test_api.py +31 -0
  58. aiagents4pharma/talk2biomodels/tests/test_ask_question.py +42 -0
  59. aiagents4pharma/talk2biomodels/tests/test_basico_model.py +67 -0
  60. aiagents4pharma/talk2biomodels/tests/test_get_annotation.py +190 -0
  61. aiagents4pharma/talk2biomodels/tests/test_getmodelinfo.py +92 -0
  62. aiagents4pharma/talk2biomodels/tests/test_integration.py +116 -0
  63. aiagents4pharma/talk2biomodels/tests/test_load_biomodel.py +35 -0
  64. aiagents4pharma/talk2biomodels/tests/test_param_scan.py +71 -0
  65. aiagents4pharma/talk2biomodels/tests/test_query_article.py +184 -0
  66. aiagents4pharma/talk2biomodels/tests/test_save_model.py +47 -0
  67. aiagents4pharma/talk2biomodels/tests/test_search_models.py +35 -0
  68. aiagents4pharma/talk2biomodels/tests/test_simulate_model.py +44 -0
  69. aiagents4pharma/talk2biomodels/tests/test_steady_state.py +86 -0
  70. aiagents4pharma/talk2biomodels/tests/test_sys_bio_model.py +67 -0
  71. aiagents4pharma/talk2biomodels/tools/__init__.py +17 -0
  72. aiagents4pharma/talk2biomodels/tools/ask_question.py +125 -0
  73. aiagents4pharma/talk2biomodels/tools/custom_plotter.py +165 -0
  74. aiagents4pharma/talk2biomodels/tools/get_annotation.py +342 -0
  75. aiagents4pharma/talk2biomodels/tools/get_modelinfo.py +159 -0
  76. aiagents4pharma/talk2biomodels/tools/load_arguments.py +134 -0
  77. aiagents4pharma/talk2biomodels/tools/load_biomodel.py +44 -0
  78. aiagents4pharma/talk2biomodels/tools/parameter_scan.py +310 -0
  79. aiagents4pharma/talk2biomodels/tools/query_article.py +64 -0
  80. aiagents4pharma/talk2biomodels/tools/save_model.py +98 -0
  81. aiagents4pharma/talk2biomodels/tools/search_models.py +96 -0
  82. aiagents4pharma/talk2biomodels/tools/simulate_model.py +137 -0
  83. aiagents4pharma/talk2biomodels/tools/steady_state.py +187 -0
  84. aiagents4pharma/talk2biomodels/tools/utils.py +23 -0
  85. aiagents4pharma/talk2cells/README.md +1 -0
  86. aiagents4pharma/talk2cells/__init__.py +5 -0
  87. aiagents4pharma/talk2cells/agents/__init__.py +6 -0
  88. aiagents4pharma/talk2cells/agents/scp_agent.py +87 -0
  89. aiagents4pharma/talk2cells/states/__init__.py +6 -0
  90. aiagents4pharma/talk2cells/states/state_talk2cells.py +15 -0
  91. aiagents4pharma/talk2cells/tests/scp_agent/test_scp_agent.py +22 -0
  92. aiagents4pharma/talk2cells/tools/__init__.py +6 -0
  93. aiagents4pharma/talk2cells/tools/scp_agent/__init__.py +6 -0
  94. aiagents4pharma/talk2cells/tools/scp_agent/display_studies.py +27 -0
  95. aiagents4pharma/talk2cells/tools/scp_agent/search_studies.py +78 -0
  96. aiagents4pharma/talk2knowledgegraphs/.dockerignore +13 -0
  97. aiagents4pharma/talk2knowledgegraphs/Dockerfile +131 -0
  98. aiagents4pharma/talk2knowledgegraphs/README.md +1 -0
  99. aiagents4pharma/talk2knowledgegraphs/__init__.py +5 -0
  100. aiagents4pharma/talk2knowledgegraphs/agents/__init__.py +5 -0
  101. aiagents4pharma/talk2knowledgegraphs/agents/t2kg_agent.py +99 -0
  102. aiagents4pharma/talk2knowledgegraphs/configs/__init__.py +5 -0
  103. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/__init__.py +3 -0
  104. aiagents4pharma/talk2knowledgegraphs/configs/agents/t2kg_agent/default.yaml +62 -0
  105. aiagents4pharma/talk2knowledgegraphs/configs/app/__init__.py +5 -0
  106. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/__init__.py +3 -0
  107. aiagents4pharma/talk2knowledgegraphs/configs/app/frontend/default.yaml +79 -0
  108. aiagents4pharma/talk2knowledgegraphs/configs/config.yaml +13 -0
  109. aiagents4pharma/talk2knowledgegraphs/configs/tools/__init__.py +5 -0
  110. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/__init__.py +3 -0
  111. aiagents4pharma/talk2knowledgegraphs/configs/tools/graphrag_reasoning/default.yaml +24 -0
  112. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/__init__.py +0 -0
  113. aiagents4pharma/talk2knowledgegraphs/configs/tools/multimodal_subgraph_extraction/default.yaml +33 -0
  114. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/__init__.py +3 -0
  115. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_extraction/default.yaml +43 -0
  116. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/__init__.py +3 -0
  117. aiagents4pharma/talk2knowledgegraphs/configs/tools/subgraph_summarization/default.yaml +9 -0
  118. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/__init__.py +3 -0
  119. aiagents4pharma/talk2knowledgegraphs/configs/utils/database/milvus/default.yaml +61 -0
  120. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/ols_terms/default.yaml +3 -0
  121. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/reactome_pathways/default.yaml +3 -0
  122. aiagents4pharma/talk2knowledgegraphs/configs/utils/enrichments/uniprot_proteins/default.yaml +6 -0
  123. aiagents4pharma/talk2knowledgegraphs/configs/utils/pubchem_utils/default.yaml +5 -0
  124. aiagents4pharma/talk2knowledgegraphs/datasets/__init__.py +5 -0
  125. aiagents4pharma/talk2knowledgegraphs/datasets/biobridge_primekg.py +607 -0
  126. aiagents4pharma/talk2knowledgegraphs/datasets/dataset.py +25 -0
  127. aiagents4pharma/talk2knowledgegraphs/datasets/primekg.py +212 -0
  128. aiagents4pharma/talk2knowledgegraphs/datasets/starkqa_primekg.py +210 -0
  129. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/.env.example +23 -0
  130. aiagents4pharma/talk2knowledgegraphs/docker-compose/cpu/docker-compose.yml +93 -0
  131. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/.env.example +23 -0
  132. aiagents4pharma/talk2knowledgegraphs/docker-compose/gpu/docker-compose.yml +108 -0
  133. aiagents4pharma/talk2knowledgegraphs/entrypoint.sh +180 -0
  134. aiagents4pharma/talk2knowledgegraphs/install.md +165 -0
  135. aiagents4pharma/talk2knowledgegraphs/milvus_data_dump.py +886 -0
  136. aiagents4pharma/talk2knowledgegraphs/states/__init__.py +5 -0
  137. aiagents4pharma/talk2knowledgegraphs/states/state_talk2knowledgegraphs.py +40 -0
  138. aiagents4pharma/talk2knowledgegraphs/tests/__init__.py +0 -0
  139. aiagents4pharma/talk2knowledgegraphs/tests/test_agents_t2kg_agent.py +318 -0
  140. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_biobridge_primekg.py +248 -0
  141. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_dataset.py +33 -0
  142. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_primekg.py +86 -0
  143. aiagents4pharma/talk2knowledgegraphs/tests/test_datasets_starkqa_primekg.py +125 -0
  144. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_graphrag_reasoning.py +257 -0
  145. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_milvus_multimodal_subgraph_extraction.py +1444 -0
  146. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_multimodal_subgraph_extraction.py +159 -0
  147. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_extraction.py +152 -0
  148. aiagents4pharma/talk2knowledgegraphs/tests/test_tools_subgraph_summarization.py +201 -0
  149. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_database_milvus_connection_manager.py +812 -0
  150. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_embeddings.py +51 -0
  151. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_huggingface.py +49 -0
  152. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_nim_molmim.py +59 -0
  153. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_ollama.py +63 -0
  154. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_embeddings_sentencetransformer.py +47 -0
  155. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_enrichments.py +40 -0
  156. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ollama.py +94 -0
  157. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_ols.py +70 -0
  158. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_pubchem.py +45 -0
  159. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_reactome.py +44 -0
  160. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_enrichments_uniprot.py +48 -0
  161. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_extractions_milvus_multimodal_pcst.py +759 -0
  162. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_kg_utils.py +78 -0
  163. aiagents4pharma/talk2knowledgegraphs/tests/test_utils_pubchem_utils.py +123 -0
  164. aiagents4pharma/talk2knowledgegraphs/tools/__init__.py +11 -0
  165. aiagents4pharma/talk2knowledgegraphs/tools/graphrag_reasoning.py +138 -0
  166. aiagents4pharma/talk2knowledgegraphs/tools/load_arguments.py +22 -0
  167. aiagents4pharma/talk2knowledgegraphs/tools/milvus_multimodal_subgraph_extraction.py +965 -0
  168. aiagents4pharma/talk2knowledgegraphs/tools/multimodal_subgraph_extraction.py +374 -0
  169. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_extraction.py +291 -0
  170. aiagents4pharma/talk2knowledgegraphs/tools/subgraph_summarization.py +123 -0
  171. aiagents4pharma/talk2knowledgegraphs/utils/__init__.py +5 -0
  172. aiagents4pharma/talk2knowledgegraphs/utils/database/__init__.py +5 -0
  173. aiagents4pharma/talk2knowledgegraphs/utils/database/milvus_connection_manager.py +586 -0
  174. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/__init__.py +5 -0
  175. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/embeddings.py +81 -0
  176. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/huggingface.py +111 -0
  177. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/nim_molmim.py +54 -0
  178. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/ollama.py +87 -0
  179. aiagents4pharma/talk2knowledgegraphs/utils/embeddings/sentence_transformer.py +73 -0
  180. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/__init__.py +12 -0
  181. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/enrichments.py +37 -0
  182. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ollama.py +129 -0
  183. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/ols_terms.py +89 -0
  184. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/pubchem_strings.py +78 -0
  185. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/reactome_pathways.py +71 -0
  186. aiagents4pharma/talk2knowledgegraphs/utils/enrichments/uniprot_proteins.py +98 -0
  187. aiagents4pharma/talk2knowledgegraphs/utils/extractions/__init__.py +5 -0
  188. aiagents4pharma/talk2knowledgegraphs/utils/extractions/milvus_multimodal_pcst.py +762 -0
  189. aiagents4pharma/talk2knowledgegraphs/utils/extractions/multimodal_pcst.py +298 -0
  190. aiagents4pharma/talk2knowledgegraphs/utils/extractions/pcst.py +229 -0
  191. aiagents4pharma/talk2knowledgegraphs/utils/kg_utils.py +67 -0
  192. aiagents4pharma/talk2knowledgegraphs/utils/pubchem_utils.py +104 -0
  193. aiagents4pharma/talk2scholars/.dockerignore +13 -0
  194. aiagents4pharma/talk2scholars/Dockerfile +104 -0
  195. aiagents4pharma/talk2scholars/README.md +1 -0
  196. aiagents4pharma/talk2scholars/__init__.py +7 -0
  197. aiagents4pharma/talk2scholars/agents/__init__.py +13 -0
  198. aiagents4pharma/talk2scholars/agents/main_agent.py +89 -0
  199. aiagents4pharma/talk2scholars/agents/paper_download_agent.py +96 -0
  200. aiagents4pharma/talk2scholars/agents/pdf_agent.py +101 -0
  201. aiagents4pharma/talk2scholars/agents/s2_agent.py +135 -0
  202. aiagents4pharma/talk2scholars/agents/zotero_agent.py +127 -0
  203. aiagents4pharma/talk2scholars/configs/__init__.py +7 -0
  204. aiagents4pharma/talk2scholars/configs/agents/__init__.py +7 -0
  205. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/__init__.py +7 -0
  206. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/__init__.py +3 -0
  207. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/main_agent/default.yaml +52 -0
  208. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/__init__.py +3 -0
  209. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/paper_download_agent/default.yaml +19 -0
  210. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/__init__.py +3 -0
  211. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/pdf_agent/default.yaml +19 -0
  212. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/__init__.py +3 -0
  213. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/s2_agent/default.yaml +44 -0
  214. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/__init__.py +3 -0
  215. aiagents4pharma/talk2scholars/configs/agents/talk2scholars/zotero_agent/default.yaml +19 -0
  216. aiagents4pharma/talk2scholars/configs/app/__init__.py +7 -0
  217. aiagents4pharma/talk2scholars/configs/app/frontend/__init__.py +3 -0
  218. aiagents4pharma/talk2scholars/configs/app/frontend/default.yaml +72 -0
  219. aiagents4pharma/talk2scholars/configs/config.yaml +16 -0
  220. aiagents4pharma/talk2scholars/configs/tools/__init__.py +21 -0
  221. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/__init__.py +3 -0
  222. aiagents4pharma/talk2scholars/configs/tools/multi_paper_recommendation/default.yaml +26 -0
  223. aiagents4pharma/talk2scholars/configs/tools/paper_download/__init__.py +3 -0
  224. aiagents4pharma/talk2scholars/configs/tools/paper_download/default.yaml +124 -0
  225. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/__init__.py +3 -0
  226. aiagents4pharma/talk2scholars/configs/tools/question_and_answer/default.yaml +62 -0
  227. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/__init__.py +3 -0
  228. aiagents4pharma/talk2scholars/configs/tools/retrieve_semantic_scholar_paper_id/default.yaml +12 -0
  229. aiagents4pharma/talk2scholars/configs/tools/search/__init__.py +3 -0
  230. aiagents4pharma/talk2scholars/configs/tools/search/default.yaml +26 -0
  231. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/__init__.py +3 -0
  232. aiagents4pharma/talk2scholars/configs/tools/single_paper_recommendation/default.yaml +26 -0
  233. aiagents4pharma/talk2scholars/configs/tools/zotero_read/__init__.py +3 -0
  234. aiagents4pharma/talk2scholars/configs/tools/zotero_read/default.yaml +57 -0
  235. aiagents4pharma/talk2scholars/configs/tools/zotero_write/__inti__.py +3 -0
  236. aiagents4pharma/talk2scholars/configs/tools/zotero_write/default.yaml +55 -0
  237. aiagents4pharma/talk2scholars/docker-compose/cpu/.env.example +21 -0
  238. aiagents4pharma/talk2scholars/docker-compose/cpu/docker-compose.yml +90 -0
  239. aiagents4pharma/talk2scholars/docker-compose/gpu/.env.example +21 -0
  240. aiagents4pharma/talk2scholars/docker-compose/gpu/docker-compose.yml +105 -0
  241. aiagents4pharma/talk2scholars/install.md +122 -0
  242. aiagents4pharma/talk2scholars/state/__init__.py +7 -0
  243. aiagents4pharma/talk2scholars/state/state_talk2scholars.py +98 -0
  244. aiagents4pharma/talk2scholars/tests/__init__.py +3 -0
  245. aiagents4pharma/talk2scholars/tests/test_agents_main_agent.py +256 -0
  246. aiagents4pharma/talk2scholars/tests/test_agents_paper_agents_download_agent.py +139 -0
  247. aiagents4pharma/talk2scholars/tests/test_agents_pdf_agent.py +114 -0
  248. aiagents4pharma/talk2scholars/tests/test_agents_s2_agent.py +198 -0
  249. aiagents4pharma/talk2scholars/tests/test_agents_zotero_agent.py +160 -0
  250. aiagents4pharma/talk2scholars/tests/test_s2_tools_display_dataframe.py +91 -0
  251. aiagents4pharma/talk2scholars/tests/test_s2_tools_query_dataframe.py +191 -0
  252. aiagents4pharma/talk2scholars/tests/test_states_state.py +38 -0
  253. aiagents4pharma/talk2scholars/tests/test_tools_paper_downloader.py +507 -0
  254. aiagents4pharma/talk2scholars/tests/test_tools_question_and_answer_tool.py +105 -0
  255. aiagents4pharma/talk2scholars/tests/test_tools_s2_multi.py +307 -0
  256. aiagents4pharma/talk2scholars/tests/test_tools_s2_retrieve.py +67 -0
  257. aiagents4pharma/talk2scholars/tests/test_tools_s2_search.py +286 -0
  258. aiagents4pharma/talk2scholars/tests/test_tools_s2_single.py +298 -0
  259. aiagents4pharma/talk2scholars/tests/test_utils_arxiv_downloader.py +469 -0
  260. aiagents4pharma/talk2scholars/tests/test_utils_base_paper_downloader.py +598 -0
  261. aiagents4pharma/talk2scholars/tests/test_utils_biorxiv_downloader.py +669 -0
  262. aiagents4pharma/talk2scholars/tests/test_utils_medrxiv_downloader.py +500 -0
  263. aiagents4pharma/talk2scholars/tests/test_utils_nvidia_nim_reranker.py +117 -0
  264. aiagents4pharma/talk2scholars/tests/test_utils_pdf_answer_formatter.py +67 -0
  265. aiagents4pharma/talk2scholars/tests/test_utils_pdf_batch_processor.py +92 -0
  266. aiagents4pharma/talk2scholars/tests/test_utils_pdf_collection_manager.py +173 -0
  267. aiagents4pharma/talk2scholars/tests/test_utils_pdf_document_processor.py +68 -0
  268. aiagents4pharma/talk2scholars/tests/test_utils_pdf_generate_answer.py +72 -0
  269. aiagents4pharma/talk2scholars/tests/test_utils_pdf_gpu_detection.py +129 -0
  270. aiagents4pharma/talk2scholars/tests/test_utils_pdf_paper_loader.py +116 -0
  271. aiagents4pharma/talk2scholars/tests/test_utils_pdf_rag_pipeline.py +88 -0
  272. aiagents4pharma/talk2scholars/tests/test_utils_pdf_retrieve_chunks.py +190 -0
  273. aiagents4pharma/talk2scholars/tests/test_utils_pdf_singleton_manager.py +159 -0
  274. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_normalization.py +121 -0
  275. aiagents4pharma/talk2scholars/tests/test_utils_pdf_vector_store.py +406 -0
  276. aiagents4pharma/talk2scholars/tests/test_utils_pubmed_downloader.py +1007 -0
  277. aiagents4pharma/talk2scholars/tests/test_utils_read_helper_utils.py +106 -0
  278. aiagents4pharma/talk2scholars/tests/test_utils_s2_utils_ext_ids.py +403 -0
  279. aiagents4pharma/talk2scholars/tests/test_utils_tool_helper_utils.py +85 -0
  280. aiagents4pharma/talk2scholars/tests/test_utils_zotero_human_in_the_loop.py +266 -0
  281. aiagents4pharma/talk2scholars/tests/test_utils_zotero_path.py +496 -0
  282. aiagents4pharma/talk2scholars/tests/test_utils_zotero_pdf_downloader_utils.py +46 -0
  283. aiagents4pharma/talk2scholars/tests/test_utils_zotero_read.py +743 -0
  284. aiagents4pharma/talk2scholars/tests/test_utils_zotero_write.py +151 -0
  285. aiagents4pharma/talk2scholars/tools/__init__.py +9 -0
  286. aiagents4pharma/talk2scholars/tools/paper_download/__init__.py +12 -0
  287. aiagents4pharma/talk2scholars/tools/paper_download/paper_downloader.py +442 -0
  288. aiagents4pharma/talk2scholars/tools/paper_download/utils/__init__.py +22 -0
  289. aiagents4pharma/talk2scholars/tools/paper_download/utils/arxiv_downloader.py +207 -0
  290. aiagents4pharma/talk2scholars/tools/paper_download/utils/base_paper_downloader.py +336 -0
  291. aiagents4pharma/talk2scholars/tools/paper_download/utils/biorxiv_downloader.py +313 -0
  292. aiagents4pharma/talk2scholars/tools/paper_download/utils/medrxiv_downloader.py +196 -0
  293. aiagents4pharma/talk2scholars/tools/paper_download/utils/pubmed_downloader.py +323 -0
  294. aiagents4pharma/talk2scholars/tools/pdf/__init__.py +7 -0
  295. aiagents4pharma/talk2scholars/tools/pdf/question_and_answer.py +170 -0
  296. aiagents4pharma/talk2scholars/tools/pdf/utils/__init__.py +37 -0
  297. aiagents4pharma/talk2scholars/tools/pdf/utils/answer_formatter.py +62 -0
  298. aiagents4pharma/talk2scholars/tools/pdf/utils/batch_processor.py +198 -0
  299. aiagents4pharma/talk2scholars/tools/pdf/utils/collection_manager.py +172 -0
  300. aiagents4pharma/talk2scholars/tools/pdf/utils/document_processor.py +76 -0
  301. aiagents4pharma/talk2scholars/tools/pdf/utils/generate_answer.py +97 -0
  302. aiagents4pharma/talk2scholars/tools/pdf/utils/get_vectorstore.py +59 -0
  303. aiagents4pharma/talk2scholars/tools/pdf/utils/gpu_detection.py +150 -0
  304. aiagents4pharma/talk2scholars/tools/pdf/utils/nvidia_nim_reranker.py +97 -0
  305. aiagents4pharma/talk2scholars/tools/pdf/utils/paper_loader.py +123 -0
  306. aiagents4pharma/talk2scholars/tools/pdf/utils/rag_pipeline.py +113 -0
  307. aiagents4pharma/talk2scholars/tools/pdf/utils/retrieve_chunks.py +197 -0
  308. aiagents4pharma/talk2scholars/tools/pdf/utils/singleton_manager.py +140 -0
  309. aiagents4pharma/talk2scholars/tools/pdf/utils/tool_helper.py +86 -0
  310. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_normalization.py +150 -0
  311. aiagents4pharma/talk2scholars/tools/pdf/utils/vector_store.py +327 -0
  312. aiagents4pharma/talk2scholars/tools/s2/__init__.py +21 -0
  313. aiagents4pharma/talk2scholars/tools/s2/display_dataframe.py +110 -0
  314. aiagents4pharma/talk2scholars/tools/s2/multi_paper_rec.py +111 -0
  315. aiagents4pharma/talk2scholars/tools/s2/query_dataframe.py +233 -0
  316. aiagents4pharma/talk2scholars/tools/s2/retrieve_semantic_scholar_paper_id.py +128 -0
  317. aiagents4pharma/talk2scholars/tools/s2/search.py +101 -0
  318. aiagents4pharma/talk2scholars/tools/s2/single_paper_rec.py +102 -0
  319. aiagents4pharma/talk2scholars/tools/s2/utils/__init__.py +5 -0
  320. aiagents4pharma/talk2scholars/tools/s2/utils/multi_helper.py +223 -0
  321. aiagents4pharma/talk2scholars/tools/s2/utils/search_helper.py +205 -0
  322. aiagents4pharma/talk2scholars/tools/s2/utils/single_helper.py +216 -0
  323. aiagents4pharma/talk2scholars/tools/zotero/__init__.py +7 -0
  324. aiagents4pharma/talk2scholars/tools/zotero/utils/__init__.py +7 -0
  325. aiagents4pharma/talk2scholars/tools/zotero/utils/read_helper.py +270 -0
  326. aiagents4pharma/talk2scholars/tools/zotero/utils/review_helper.py +74 -0
  327. aiagents4pharma/talk2scholars/tools/zotero/utils/write_helper.py +194 -0
  328. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_path.py +180 -0
  329. aiagents4pharma/talk2scholars/tools/zotero/utils/zotero_pdf_downloader.py +133 -0
  330. aiagents4pharma/talk2scholars/tools/zotero/zotero_read.py +105 -0
  331. aiagents4pharma/talk2scholars/tools/zotero/zotero_review.py +162 -0
  332. aiagents4pharma/talk2scholars/tools/zotero/zotero_write.py +91 -0
  333. aiagents4pharma-0.0.0.dist-info/METADATA +335 -0
  334. aiagents4pharma-0.0.0.dist-info/RECORD +336 -0
  335. aiagents4pharma-0.0.0.dist-info/WHEEL +4 -0
  336. aiagents4pharma-0.0.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,198 @@
+ """
+ Batch processing utilities for adding multiple papers to vector store.
+ """
+
+ import concurrent.futures
+ import logging
+ import time
+ from typing import Any
+
+ from langchain_core.documents import Document
+
+ from .document_processor import load_and_split_pdf
+
+ logger = logging.getLogger(__name__)
+
+
+ def add_papers_batch(
+     papers_to_add: list[tuple[str, str, dict[str, Any]]],
+     vector_store: Any,
+     loaded_papers: set[str],
+     paper_metadata: dict[str, dict[str, Any]],
+     documents: dict[str, Document],
+     **kwargs: Any,
+ ) -> None:
+     """
+     Add multiple papers to the document store in parallel with batch embedding.
+
+     Args:
+         papers_to_add: List of tuples (paper_id, pdf_url, paper_metadata).
+         vector_store: The LangChain Milvus vector store instance.
+         loaded_papers: Set to track which papers are already loaded.
+         paper_metadata: Dict to store paper metadata after load.
+         documents: Dict to store document chunks.
+         config: (via kwargs) Configuration object.
+         metadata_fields: (via kwargs) List of metadata fields to include.
+         has_gpu: (via kwargs) Whether GPU is available.
+         max_workers: (via kwargs) Max PDF-loading threads (default 5).
+         batch_size: (via kwargs) Embedding batch size (default 100).
+     """
+     cfg = kwargs
+
+     if not papers_to_add:
+         logger.info("No papers to add")
+         return
+
+     to_process = [(pid, url, md) for pid, url, md in papers_to_add if pid not in loaded_papers]
+     if not to_process:
+         logger.info("Skipping %d already-loaded papers", len(papers_to_add))
+         logger.info("All %d papers are already loaded", len(papers_to_add))
+         return
+
+     logger.info(
+         "Starting PARALLEL batch processing of %d papers with %d workers (%s)",
+         len(to_process),
+         cfg.get("max_workers", 5),
+         "GPU acceleration" if cfg["has_gpu"] else "CPU processing",
+     )
+
+     chunks, ids, success = _parallel_load_and_split(
+         to_process,
+         cfg["config"],
+         cfg["metadata_fields"],
+         documents,
+         cfg.get("max_workers", 5),
+     )
+
+     if not chunks:
+         logger.warning("No chunks to add to vector store")
+         return
+
+     for pid, _, md in to_process:
+         if pid in success:
+             paper_metadata[pid] = md
+
+     try:
+         _batch_embed(
+             chunks,
+             ids,
+             vector_store,
+             cfg.get("batch_size", 100),
+             cfg["has_gpu"],
+         )
+     except Exception:
+         logger.error("Failed to add chunks to Milvus", exc_info=True)
+         raise
+
+     # finally mark papers as loaded
+     loaded_papers.update(success)
+
+
+ def _parallel_load_and_split(
+     papers: list[tuple[str, str, dict[str, Any]]],
+     config: Any,
+     metadata_fields: list[str],
+     documents: dict[str, Document],
+     max_workers: int,
+ ) -> tuple[list[Document], list[str], list[str]]:
+     """Load & split PDFs in parallel, preserving original logic."""
+     all_chunks: list[Document] = []
+     all_ids: list[str] = []
+     success: list[str] = []
+
+     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+         futures = {
+             executor.submit(
+                 load_and_split_pdf,
+                 pid,
+                 url,
+                 md,
+                 config,
+                 metadata_fields=metadata_fields,
+                 documents_dict=documents,
+             ): pid
+             for pid, url, md in papers
+         }
+         logger.info("Submitted %d PDF loading tasks", len(futures))
+
+         for idx, fut in enumerate(concurrent.futures.as_completed(futures), start=1):
+             pid = futures[fut]
+             chunks = fut.result()
+             ids = [f"{pid}_{i}" for i in range(len(chunks))]
+
+             all_chunks.extend(chunks)
+             all_ids.extend(ids)
+             success.append(pid)
+
+             logger.info(
+                 "Progress: %d/%d - Loaded paper %s (%d chunks)",
+                 idx,
+                 len(papers),
+                 pid,
+                 len(chunks),
+             )
+
+     return all_chunks, all_ids, success
+
+
+ def _batch_embed(
+     chunks: list[Document],
+     ids: list[str],
+     store: Any,
+     batch_size: int,
+     has_gpu: bool,
+ ) -> None:
+     """Embed chunks in batches and verify insertion exactly as before."""
+     start = time.time()
+     n = len(chunks)
+     logger.info(
+         "Starting BATCH EMBEDDING of %d chunks in batches of %d (%s)",
+         n,
+         batch_size,
+         "GPU" if has_gpu else "CPU",
+     )
+
+     for batch_num, start_idx in enumerate(range(0, n, batch_size), start=1):
+         end_idx = min(start_idx + batch_size, n)
+         logger.info(
+             "Embedding batch %d/%d (chunks %d-%d of %d) - %s",
+             batch_num,
+             (n + batch_size - 1) // batch_size,
+             start_idx + 1,
+             end_idx,
+             n,
+             "GPU" if has_gpu else "CPU",
+         )
+
+         store.add_documents(
+             documents=chunks[start_idx:end_idx],
+             ids=ids[start_idx:end_idx],
+         )
+
+         # Post-insert verification
+         col = store.col
+         col.flush()
+         count = col.num_entities
+         logger.info(
+             "Post-insert batch %d: collection has %d entities",
+             batch_num,
+             count,
+         )
+         if count:
+             logger.info(
+                 "Sample paper IDs: %s",
+                 [
+                     r.get("paper_id", "unknown")
+                     for r in col.query(expr="", output_fields=["paper_id"], limit=3)
+                 ],
+             )
+
+         logger.info("Successfully stored batch %d", batch_num)
+
+     elapsed = time.time() - start
+     logger.info(
+         "BATCH EMBEDDING COMPLETE: %d chunks in %.2f seconds (%.2f chunks/sec)",
+         n,
+         elapsed,
+         n / elapsed if elapsed > 0 else 0,
+     )
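
For orientation, a hedged usage sketch for add_papers_batch follows; it is not part of the package. It assumes a Milvus server at http://localhost:19530, an OpenAI API key for embeddings, and network access to a public arXiv PDF. The chunk_size/chunk_overlap values, the "Authors" metadata field, and the collection name are illustrative assumptions rather than values taken from the package configs; the docstring above only says the store is a LangChain Milvus instance, so the sketch builds one with langchain_milvus.

from types import SimpleNamespace

from langchain_milvus import Milvus
from langchain_openai import OpenAIEmbeddings

from aiagents4pharma.talk2scholars.tools.pdf.utils.batch_processor import add_papers_batch

# LangChain Milvus store; _batch_embed later uses its .col handle for flush/verification.
vector_store = Milvus(
    embedding_function=OpenAIEmbeddings(),
    collection_name="pdf_rag_documents",          # assumed collection name
    connection_args={"uri": "http://localhost:19530"},
)

loaded_papers: set[str] = set()        # updated in place with successfully loaded paper IDs
paper_metadata: dict[str, dict] = {}   # filled with per-paper metadata after loading
documents: dict = {}                   # filled with chunk Documents keyed "{paper_id}_{i}"

add_papers_batch(
    papers_to_add=[
        (
            "arxiv:1706.03762",                     # example paper ID
            "https://arxiv.org/pdf/1706.03762",
            {"Title": "Attention Is All You Need"},
        ),
    ],
    vector_store=vector_store,
    loaded_papers=loaded_papers,
    paper_metadata=paper_metadata,
    documents=documents,
    config=SimpleNamespace(chunk_size=1200, chunk_overlap=200),  # assumed splitter settings
    metadata_fields=["Authors"],   # hypothetical extra field to copy into chunk metadata
    has_gpu=False,
    max_workers=2,
    batch_size=50,
)

The three mutable containers mirror the per-session state kept by the PDF Q&A vector store: the function updates them in place rather than returning results.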
@@ -0,0 +1,172 @@
+ """
+ Collection Manager for Milvus
+ """
+
+ import logging
+ import os
+ import threading
+ from typing import Any
+
+ from pymilvus import (
+     Collection,
+     CollectionSchema,
+     DataType,
+     FieldSchema,
+     connections,
+     utility,
+ )
+
+ # Set up logging with configurable level
+ log_level = os.environ.get("LOG_LEVEL", "INFO")
+ logging.basicConfig(level=getattr(logging, log_level))
+ logger = logging.getLogger(__name__)
+ logger.setLevel(getattr(logging, log_level))
+
+ # Global cache for collections to avoid repeated creation checks
+ _collection_cache = {}
+ _cache_lock = threading.Lock()
+
+
+ def ensure_collection_exists(
+     collection_name: str, config: Any, index_params: dict[str, Any], has_gpu: bool
+ ) -> Collection:
+     """Ensure the Milvus collection exists before trying to sync or add documents."""
+
+     # Check cache first
+     with _cache_lock:
+         if collection_name in _collection_cache:
+             logger.debug("Returning cached collection: %s", collection_name)
+             return _collection_cache[collection_name]
+
+     try:
+         existing_collections = utility.list_collections()
+         if collection_name not in existing_collections:
+             logger.info(
+                 "Collection %s does not exist. Creating schema...",
+                 collection_name,
+             )
+
+             # Define schema
+             fields = [
+                 FieldSchema(
+                     name="id",
+                     dtype=DataType.VARCHAR,
+                     is_primary=True,
+                     auto_id=False,
+                     max_length=100,
+                 ),
+                 FieldSchema(
+                     name="embedding",
+                     dtype=DataType.FLOAT_VECTOR,
+                     dim=config.milvus.embedding_dim if config else 768,
+                 ),
+                 FieldSchema(
+                     name="text",
+                     dtype=DataType.VARCHAR,
+                     max_length=65535,
+                 ),
+                 FieldSchema(
+                     name="paper_id",
+                     dtype=DataType.VARCHAR,
+                     max_length=100,
+                 ),
+                 FieldSchema(
+                     name="title",
+                     dtype=DataType.VARCHAR,
+                     max_length=512,
+                 ),
+                 FieldSchema(
+                     name="chunk_id",
+                     dtype=DataType.INT64,
+                 ),
+                 FieldSchema(
+                     name="page",
+                     dtype=DataType.INT64,
+                 ),
+                 FieldSchema(
+                     name="source",
+                     dtype=DataType.VARCHAR,
+                     max_length=512,
+                 ),
+             ]
+
+             schema = CollectionSchema(
+                 fields=fields,
+                 description="RAG collection for embedded PDF chunks",
+                 enable_dynamic_field=True,
+             )
+
+             # Create collection
+             collection = Collection(
+                 name=collection_name,
+                 schema=schema,
+                 using="default",
+                 shards_num=2,
+             )
+             logger.info("Created collection: %s", collection_name)
+
+             # Create index on the embedding field with GPU/CPU optimization
+             logger.info(
+                 "Creating %s index on 'embedding' field for collection: %s",
+                 index_params["index_type"],
+                 collection_name,
+             )
+
+             collection.create_index(field_name="embedding", index_params=index_params)
+
+             index_type = index_params["index_type"]
+             logger.info(
+                 "Successfully created %s index on 'embedding' field for collection: %s",
+                 index_type,
+                 collection_name,
+             )
+
+         else:
+             logger.info("Collection %s already exists. Loading it.", collection_name)
+             collection = Collection(name=collection_name, using="default")
+
+         collection.load()
+
+         def debug_collection_state(collection, collection_name):
+             """Debug collection state for troubleshooting."""
+             logger.info("=== DEBUG COLLECTION STATE ===")
+             logger.info("Collection name: %s", collection_name)
+             logger.info("Collection schema: %s", collection.schema)
+             logger.info("Collection num_entities: %d", collection.num_entities)
+
+             # Check if collection is actually loaded
+             # logger.info("Is collection loaded: %s", collection.load)
+
+             # Check available indexes
+             indexes = collection.indexes
+             logger.info("Collection indexes: %s", [idx.field_name for idx in indexes])
+
+             # Try to get collection stats
+             logger.info("Collection statistics: %s", collection.num_entities)
+
+             logger.info("Active connections: %s", connections.list_connections())
+
+             logger.info("=== END DEBUG ===")
+
+         debug_collection_state(collection, collection_name)
+
+         # Log collection statistics with GPU/CPU info
+         num_entities = collection.num_entities
+         gpu_info = " (GPU accelerated)" if has_gpu else " (CPU only)"
+         logger.info(
+             "Collection %s is loaded and ready with %d entities%s",
+             collection_name,
+             num_entities,
+             gpu_info,
+         )
+
+         # Cache the collection
+         with _cache_lock:
+             _collection_cache[collection_name] = collection
+             logger.debug("Cached collection: %s", collection_name)
+
+         return collection  # Return the collection object
+
+     except Exception as e:
+         logger.error("Failed to ensure collection exists: %s", e, exc_info=True)
+         raise
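
A minimal sketch (not from the package) of driving ensure_collection_exists: it assumes a Milvus server reachable at localhost:19530 and uses ordinary HNSW index parameters chosen for illustration rather than values from the package configs. With config=None the embedding field falls back to 768 dimensions, as in the schema above.

from pymilvus import connections

from aiagents4pharma.talk2scholars.tools.pdf.utils.collection_manager import (
    ensure_collection_exists,
)

# Register the "default" connection alias that the Collection objects above rely on.
connections.connect(alias="default", host="localhost", port="19530")

index_params = {
    "index_type": "HNSW",      # a CPU index type; a GPU build might use e.g. GPU_IVF_FLAT
    "metric_type": "L2",
    "params": {"M": 16, "efConstruction": 200},
}

collection = ensure_collection_exists(
    collection_name="pdf_rag_documents",   # assumed collection name
    config=None,                           # falls back to dim=768 for the embedding field
    index_params=index_params,
    has_gpu=False,
)
print(collection.num_entities)

# A second call with the same name returns the module-level cached Collection.
assert ensure_collection_exists("pdf_rag_documents", None, index_params, False) is collection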
@@ -0,0 +1,76 @@
+ """
+ Document processing utilities for loading and splitting PDFs.
+ """
+
+ import logging
+ from typing import Any
+
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.document_loaders import PyPDFLoader
+ from langchain_core.documents import Document
+
+ logger = logging.getLogger(__name__)
+
+
+ def load_and_split_pdf(
+     paper_id: str,
+     pdf_url: str,
+     paper_metadata: dict[str, Any],
+     config: Any,
+     **kwargs: Any,
+ ) -> list[Document]:
+     """
+     Load a PDF and split it into chunks.
+
+     Args:
+         paper_id: Unique identifier for the paper.
+         pdf_url: URL to the PDF.
+         paper_metadata: Metadata about the paper (e.g. Title, Authors, etc.).
+         config: Configuration object with `chunk_size` and `chunk_overlap` attributes.
+         metadata_fields: List of additional metadata keys to propagate into each
+             chunk (passed via kwargs).
+         documents_dict: Dictionary where split chunks will also be stored under keys
+             of the form "{paper_id}_{chunk_index}" (passed via kwargs).
+
+     Returns:
+         A list of Document chunks, each with updated metadata.
+     """
+     metadata_fields: list[str] = kwargs["metadata_fields"]
+     documents_dict: dict[str, Document] = kwargs["documents_dict"]
+
+     logger.info("Loading PDF for paper %s from %s", paper_id, pdf_url)
+
+     # Load pages
+     documents = PyPDFLoader(pdf_url).load()
+     logger.info("Loaded %d pages from paper %s", len(documents), paper_id)
+
+     if config is None:
+         raise ValueError("Configuration is required for text splitting in Vectorstore.")
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=config.chunk_size,
+         chunk_overlap=config.chunk_overlap,
+         separators=["\n\n", "\n", ". ", " ", ""],
+     )
+
+     # Split into chunks
+     chunks = splitter.split_documents(documents)
+     logger.info("Split paper %s into %d chunks", paper_id, len(chunks))
+
+     # Attach metadata & populate documents_dict
+     for i, chunk in enumerate(chunks):
+         chunk_id = f"{paper_id}_{i}"
+         chunk.metadata.update(
+             {
+                 "paper_id": paper_id,
+                 "title": paper_metadata.get("Title", "Unknown"),
+                 "chunk_id": i,
+                 "page": chunk.metadata.get("page", 0),
+                 "source": pdf_url,
+             }
+         )
+         for field in metadata_fields:
+             if field in paper_metadata and field not in chunk.metadata:
+                 chunk.metadata[field] = paper_metadata[field]
+         documents_dict[chunk_id] = chunk
+
+     return chunks
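
A hedged example of calling load_and_split_pdf directly (it is normally invoked through add_papers_batch). The arXiv URL, the metadata values, and the chunking settings are illustrative assumptions, and fetching the PDF requires network access.

from types import SimpleNamespace

from aiagents4pharma.talk2scholars.tools.pdf.utils.document_processor import load_and_split_pdf

documents_dict = {}  # receives each chunk under the key "{paper_id}_{chunk_index}"

chunks = load_and_split_pdf(
    paper_id="arxiv:1810.04805",
    pdf_url="https://arxiv.org/pdf/1810.04805",
    paper_metadata={
        "Title": "BERT: Pre-training of Deep Bidirectional Transformers",
        "Authors": "Devlin et al.",
    },
    config=SimpleNamespace(chunk_size=1200, chunk_overlap=200),  # assumed splitter settings
    metadata_fields=["Authors"],  # copied into chunk metadata because it is in paper_metadata
    documents_dict=documents_dict,
)
print(len(chunks), chunks[0].metadata["title"], chunks[0].metadata["Authors"])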
@@ -0,0 +1,97 @@
+ """
+ Generate an answer for a question using retrieved chunks of documents.
+ """
+
+ import logging
+ import os
+ from typing import Any
+
+ import hydra
+ from langchain_core.documents import Document
+ from langchain_core.language_models.chat_models import BaseChatModel
+
+ # Set up logging with configurable level
+ log_level = os.environ.get("LOG_LEVEL", "INFO")
+ logging.basicConfig(level=getattr(logging, log_level))
+ logger = logging.getLogger(__name__)
+ logger.setLevel(getattr(logging, log_level))
+
+
+ def load_hydra_config() -> Any:
+     """
+     Load the configuration using Hydra and return the configuration for the Q&A tool.
+     """
+     with hydra.initialize(version_base=None, config_path="../../../configs"):
+         cfg = hydra.compose(
+             config_name="config",
+             overrides=["tools/question_and_answer=default"],
+         )
+         config = cfg.tools.question_and_answer
+         logger.debug("Loaded Question and Answer tool configuration.")
+         return config
+
+
+ def _build_context_and_sources(
+     retrieved_chunks: list[Document],
+ ) -> tuple[str, set[str]]:
+     """
+     Build the combined context string and set of paper_ids from retrieved chunks.
+     """
+     papers = {}
+     for doc in retrieved_chunks:
+         pid = doc.metadata.get("paper_id", "unknown")
+         papers.setdefault(pid, []).append(doc)
+     formatted = []
+     idx = 1
+     for pid, chunks in papers.items():
+         title = chunks[0].metadata.get("title", "Unknown")
+         formatted.append(f"[Document {idx}] From: '{title}' (ID: {pid})")
+         for chunk in chunks:
+             page = chunk.metadata.get("page", "unknown")
+             formatted.append(f"Page {page}: {chunk.page_content}")
+         idx += 1
+     context = "\n\n".join(formatted)
+     sources: set[str] = set()
+     for doc in retrieved_chunks:
+         pid = doc.metadata.get("paper_id")
+         if isinstance(pid, str):
+             sources.add(pid)
+     return context, sources
+
+
+ def generate_answer(
+     question: str,
+     retrieved_chunks: list[Document],
+     llm_model: BaseChatModel,
+     config: Any,
+ ) -> dict[str, Any]:
+     """
+     Generate an answer for a question using retrieved chunks.
+
+     Args:
+         question (str): The question to answer
+         retrieved_chunks (List[Document]): List of relevant document chunks
+         llm_model (BaseChatModel): Language model for generating answers
+         config (Any): Configuration for answer generation
+
+     Returns:
+         Dict[str, Any]: Dictionary with the answer and metadata
+     """
+     # Ensure the configuration is provided and has the prompt_template.
+     if config is None:
+         raise ValueError("Configuration for generate_answer is required.")
+     if "prompt_template" not in config:
+         raise ValueError("The prompt_template is missing from the configuration.")
+
+     # Build context and sources, then invoke LLM
+     context, paper_sources = _build_context_and_sources(retrieved_chunks)
+     prompt = config["prompt_template"].format(context=context, question=question)
+     response = llm_model.invoke(prompt)
+
+     # Return the response with metadata
+     return {
+         "output_text": response.content,
+         "sources": [doc.metadata for doc in retrieved_chunks],
+         "num_sources": len(retrieved_chunks),
+         "papers_used": list(paper_sources),
+     }
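
Because generate_answer only needs documents, a chat model, and a config carrying a prompt_template, it can be exercised offline. The sketch below is not from the package: it substitutes langchain-core's FakeListChatModel and a hand-written template for the Hydra config returned by load_hydra_config, and the document contents are invented for illustration.

from langchain_core.documents import Document
from langchain_core.language_models import FakeListChatModel

from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import generate_answer

chunks = [
    Document(
        page_content="The inhibitor reduced tumor volume by 40% in the treated cohort.",
        metadata={"paper_id": "paper_1", "title": "Example Oncology Paper", "page": 3},
    ),
    Document(
        page_content="No dose-limiting toxicity was observed at 10 mg/kg.",
        metadata={"paper_id": "paper_1", "title": "Example Oncology Paper", "page": 5},
    ),
]
config = {
    "prompt_template": "Answer from the context only.\n\nContext:\n{context}\n\nQuestion: {question}"
}
# Fake model that simply returns the canned string, standing in for a real BaseChatModel.
llm = FakeListChatModel(responses=["Tumor volume dropped by 40% with no dose-limiting toxicity."])

result = generate_answer("What did the inhibitor do?", chunks, llm, config)
print(result["output_text"])
print(result["papers_used"])   # ["paper_1"]
print(result["num_sources"])   # 2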
@@ -0,0 +1,59 @@
+ """
+ Create or retrieve a Vectorstore instance for PDF RAG.
+ """
+
+ import logging
+ import threading
+ from typing import Any
+
+ from langchain_core.embeddings import Embeddings
+
+ from .vector_store import Vectorstore
+
+ logger = logging.getLogger(__name__)
+
+ # Global cache for Vectorstore instances
+ _vectorstore_cache = {}
+ _cache_lock = threading.Lock()
+
+
+ def get_vectorstore(
+     embedding_model: Embeddings, config: Any, force_new: bool = False
+ ) -> "Vectorstore":
+     """
+     Factory function to get or create a Vectorstore instance.
+     Ensures the same instance is reused across the application.
+
+     Args:
+         embedding_model: The embedding model to use
+         config: Configuration object
+         force_new: Force creation of a new instance
+
+     Returns:
+         Vectorstore instance
+     """
+     collection_name = config.milvus.collection_name if config else "pdf_rag_documents"
+
+     with _cache_lock:
+         if force_new and collection_name in _vectorstore_cache:
+             del _vectorstore_cache[collection_name]
+             logger.info("Forced new Vectorstore instance for collection: %s", collection_name)
+
+         if collection_name not in _vectorstore_cache:
+             logger.info("Creating new Vectorstore instance for collection: %s", collection_name)
+             _vectorstore_cache[collection_name] = Vectorstore(
+                 embedding_model=embedding_model, config=config
+             )
+         else:
+             logger.info(
+                 "Reusing existing Vectorstore instance for collection: %s",
+                 collection_name,
+             )
+             # Update embedding model if different
+             existing = _vectorstore_cache[collection_name]
+             if existing.embedding_model != embedding_model:
+                 logger.warning("Embedding model changed, updating existing instance")
+                 existing.embedding_model = embedding_model
+                 existing.vector_store.embedding_function = embedding_model
+
+         return _vectorstore_cache[collection_name]
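
To illustrate the caching contract (a sketch, not package code): repeated calls keyed on the same config.milvus.collection_name return the same cached Vectorstore, while force_new=True rebuilds it. The sketch assumes the Q&A tool's Hydra config loaded by load_hydra_config (see generate_answer.py above) carries the milvus settings that Vectorstore and get_vectorstore read, and that a Milvus server plus embedding credentials are available.

from langchain_openai import OpenAIEmbeddings

from aiagents4pharma.talk2scholars.tools.pdf.utils.generate_answer import load_hydra_config
from aiagents4pharma.talk2scholars.tools.pdf.utils.get_vectorstore import get_vectorstore

cfg = load_hydra_config()   # assumed to expose cfg.milvus.collection_name and related settings
emb = OpenAIEmbeddings()    # any langchain_core Embeddings implementation works

store_a = get_vectorstore(emb, cfg)   # creates the Vectorstore and caches it
store_b = get_vectorstore(emb, cfg)   # cache hit: the same object is returned
assert store_a is store_b

fresh = get_vectorstore(emb, cfg, force_new=True)   # drops the cache entry and rebuilds
assert fresh is not store_a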