langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff compares two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (97)
  1. langroid/__init__.py +70 -0
  2. langroid/agent/__init__.py +22 -0
  3. langroid/agent/base.py +120 -33
  4. langroid/agent/batch.py +134 -35
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +608 -0
  7. langroid/agent/chat_agent.py +164 -100
  8. langroid/agent/chat_document.py +19 -2
  9. langroid/agent/openai_assistant.py +20 -10
  10. langroid/agent/special/__init__.py +33 -10
  11. langroid/agent/special/doc_chat_agent.py +521 -108
  12. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  13. langroid/agent/special/lance_rag/__init__.py +9 -0
  14. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  15. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  16. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  17. langroid/agent/special/lance_tools.py +44 -0
  18. langroid/agent/special/neo4j/__init__.py +0 -0
  19. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  20. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  21. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  22. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  23. langroid/agent/special/relevance_extractor_agent.py +23 -7
  24. langroid/agent/special/retriever_agent.py +29 -174
  25. langroid/agent/special/sql/__init__.py +7 -0
  26. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  27. langroid/agent/special/sql/utils/__init__.py +11 -0
  28. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  29. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  30. langroid/agent/special/table_chat_agent.py +43 -9
  31. langroid/agent/task.py +423 -114
  32. langroid/agent/tool_message.py +67 -10
  33. langroid/agent/tools/__init__.py +8 -0
  34. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  35. langroid/agent/tools/google_search_tool.py +11 -0
  36. langroid/agent/tools/metaphor_search_tool.py +67 -0
  37. langroid/agent/tools/recipient_tool.py +6 -24
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/cachedb/__init__.py +6 -0
  40. langroid/embedding_models/__init__.py +24 -0
  41. langroid/embedding_models/base.py +9 -1
  42. langroid/embedding_models/models.py +117 -17
  43. langroid/embedding_models/protoc/embeddings.proto +19 -0
  44. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  45. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  46. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  47. langroid/embedding_models/remote_embeds.py +153 -0
  48. langroid/language_models/__init__.py +22 -0
  49. langroid/language_models/azure_openai.py +47 -4
  50. langroid/language_models/base.py +26 -10
  51. langroid/language_models/config.py +5 -0
  52. langroid/language_models/openai_gpt.py +407 -121
  53. langroid/language_models/prompt_formatter/__init__.py +9 -0
  54. langroid/language_models/prompt_formatter/base.py +4 -6
  55. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  56. langroid/language_models/utils.py +10 -9
  57. langroid/mytypes.py +10 -4
  58. langroid/parsing/__init__.py +33 -1
  59. langroid/parsing/document_parser.py +259 -63
  60. langroid/parsing/image_text.py +32 -0
  61. langroid/parsing/parse_json.py +143 -0
  62. langroid/parsing/parser.py +20 -7
  63. langroid/parsing/repo_loader.py +108 -46
  64. langroid/parsing/search.py +8 -0
  65. langroid/parsing/table_loader.py +44 -0
  66. langroid/parsing/url_loader.py +59 -13
  67. langroid/parsing/urls.py +18 -9
  68. langroid/parsing/utils.py +130 -9
  69. langroid/parsing/web_search.py +73 -0
  70. langroid/prompts/__init__.py +7 -0
  71. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  72. langroid/prompts/prompts_config.py +1 -1
  73. langroid/utils/__init__.py +10 -0
  74. langroid/utils/algorithms/__init__.py +3 -0
  75. langroid/utils/configuration.py +0 -1
  76. langroid/utils/constants.py +4 -0
  77. langroid/utils/logging.py +2 -5
  78. langroid/utils/output/__init__.py +15 -2
  79. langroid/utils/output/status.py +33 -0
  80. langroid/utils/pandas_utils.py +30 -0
  81. langroid/utils/pydantic_utils.py +446 -4
  82. langroid/utils/system.py +36 -1
  83. langroid/vector_store/__init__.py +34 -2
  84. langroid/vector_store/base.py +33 -2
  85. langroid/vector_store/chromadb.py +42 -13
  86. langroid/vector_store/lancedb.py +226 -60
  87. langroid/vector_store/meilisearch.py +7 -6
  88. langroid/vector_store/momento.py +3 -2
  89. langroid/vector_store/qdrantdb.py +82 -11
  90. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
  91. langroid-0.1.219.dist-info/RECORD +127 -0
  92. langroid/agent/special/recipient_validator_agent.py +0 -157
  93. langroid/parsing/json.py +0 -64
  94. langroid/utils/web/selenium_login.py +0 -36
  95. langroid-0.1.139.dist-info/RECORD +0 -103
  96. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
  97. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
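The hunks below are from langroid/agent/special/doc_chat_agent.py (+521 -108):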
@@ -12,16 +12,17 @@ langroid with the [hf-embeddings] extra, e.g.:
 pip install "langroid[hf-embeddings]"
 
 """
+
 import logging
 from contextlib import ExitStack
-from typing import List, Optional, Tuple, no_type_check
+from functools import cache
+from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
 
+import nest_asyncio
 import numpy as np
-from rich import print
-from rich.console import Console
+import pandas as pd
 from rich.prompt import Prompt
 
-from langroid.agent.base import Agent
 from langroid.agent.batch import run_batch_tasks
 from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
 from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
@@ -34,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -41,20 +43,26 @@ from langroid.parsing.search import (
     find_fuzzy_matches_in_docs,
     preprocess_text,
 )
+from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user, get_urls_and_paths
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.configuration import settings
 from langroid.utils.constants import NO_ANSWER
-from langroid.utils.output.printing import show_if_debug
-from langroid.vector_store.base import VectorStoreConfig
+from langroid.utils.output import show_if_debug, status
+from langroid.utils.pydantic_utils import dataframe_to_documents, extract_fields
+from langroid.vector_store.base import VectorStore, VectorStoreConfig
 from langroid.vector_store.lancedb import LanceDBConfig
 
-logger = logging.getLogger(__name__)
 
-console = Console()
+@cache
+def apply_nest_asyncio() -> None:
+    nest_asyncio.apply()
+
+
+logger = logging.getLogger(__name__)
 
 DEFAULT_DOC_CHAT_INSTRUCTIONS = """
 Your task is to answer questions about various documents.
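Note on the `apply_nest_asyncio` helper introduced above: wrapping `nest_asyncio.apply()` in `functools.cache` makes the event-loop patch idempotent, since a cached zero-argument function executes its body only on the first call. A minimal self-contained sketch of the same pattern (independent of langroid):

    from functools import cache

    import nest_asyncio


    @cache  # zero-arg cached function: the body runs only on the first call
    def apply_nest_asyncio() -> None:
        # allow nested event loops, e.g. asyncio.run() inside Jupyter
        nest_asyncio.apply()


    apply_nest_asyncio()  # patches the running loop
    apply_nest_asyncio()  # cached: no-op on repeat calls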
@@ -66,25 +74,29 @@ DEFAULT_DOC_CHAT_SYSTEM_MESSAGE = """
 You are a helpful assistant, helping me understand a collection of documents.
 """
 
+has_sentence_transformers = False
+try:
+    from sentence_transformer import SentenceTransformer  # noqa: F401
 
-class DocChatAgentConfig(ChatAgentConfig):
-    """
-    Attributes:
-        max_context_tokens (int): threshold to use for various steps, e.g.
-            if we are able to fit the current stage of doc processing into
-            this many tokens, we skip additional compression steps, and
-            use the current docs as-is in the context
-        conversation_mode (bool): if True, we will accumulate message history,
-            and pass entire history to LLM at each round.
-            If False, each request to LLM will consist only of the
-            initial task messages plus the current query.
-    """
+    has_sentence_transformers = True
+except ImportError:
+    pass
 
+
+class DocChatAgentConfig(ChatAgentConfig):
     system_message: str = DEFAULT_DOC_CHAT_SYSTEM_MESSAGE
     user_message: str = DEFAULT_DOC_CHAT_INSTRUCTIONS
     summarize_prompt: str = SUMMARY_ANSWER_PROMPT_GPT4
-    max_context_tokens: int = 1000
-    conversation_mode: bool = True
+    # extra fields to include in content as key=value pairs
+    # (helps retrieval for table-like data)
+    add_fields_to_content: List[str] = []
+    filter_fields: List[str] = []  # fields usable in filter
+    retrieve_only: bool = False  # only retr relevant extracts, don't gen summary answer
+    extraction_granularity: int = 1  # granularity (in sentences) for relev extraction
+    filter: str | None = (
+        None  # filter condition for various lexical/semantic search fns
+    )
+    conversation_mode: bool = True  # accumulate message history?
     # In assistant mode, DocChatAgent receives questions from another Agent,
     # and those will already be in stand-alone form, so in this mode
     # there is no need to convert them to stand-alone form.
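The `has_sentence_transformers` probe above is the usual optional-dependency pattern: attempt the import once at module load, record success in a flag, and let defaults degrade gracefully when the extra is missing (the next hunk uses the flag to blank out `cross_encoder_reranking_model`). A generic sketch, with `some_extra_pkg` as a hypothetical package name:

    has_some_extra = False
    try:
        import some_extra_pkg  # noqa: F401  # hypothetical optional extra

        has_some_extra = True
    except ImportError:
        pass  # extra not installed; dependent features fall back below

    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_some_extra else ""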
@@ -100,17 +112,22 @@ class DocChatAgentConfig(ChatAgentConfig):
     n_fuzzy_neighbor_words: int = 100  # num neighbor words to retrieve for fuzzy match
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
-    cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    cross_encoder_reranking_model: str = (
+        "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
+    )
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
-    relevance_extractor_config: RelevanceExtractorAgentConfig = (
-        RelevanceExtractorAgentConfig()
+    split: bool = True  # use chunking
+    relevance_extractor_config: None | RelevanceExtractorAgentConfig = (
+        RelevanceExtractorAgentConfig(
+            llm=None  # use the parent's llm unless explicitly set here
+        )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -135,7 +152,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
             # Try one that works for your use case.
-            # or "haystack", "unstructured", "pdfplumber", "fitz", "pypdf"
+            # or "unstructured", "pdfplumber", "fitz", "pypdf"
             library="pdfplumber",
         ),
     )
@@ -156,7 +173,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         collection_name="doc-chat-lancedb",
         replace_collection=True,
         storage_path=".lancedb/data/",
-        embedding=hf_embed_config,
+        embedding=hf_embed_config if has_sentence_transformers else oai_embed_config,
     )
     llm: OpenAIGPTConfig = OpenAIGPTConfig(
         type="openai",
@@ -180,14 +197,40 @@ class DocChatAgent(ChatAgent):
     ):
         super().__init__(config)
         self.config: DocChatAgentConfig = config
-        self.original_docs: None | List[Document] = None
+        self.original_docs: List[Document] = []
         self.original_docs_length = 0
-        self.chunked_docs: None | List[Document] = None
-        self.chunked_docs_clean: None | List[Document] = None
+        self.from_dataframe = False
+        self.df_description = ""
+        self.chunked_docs: List[Document] = []
+        self.chunked_docs_clean: List[Document] = []
         self.response: None | Document = None
         if len(config.doc_paths) > 0:
             self.ingest()
 
+    def clear(self) -> None:
+        """Clear the document collection and the specific collection in vecdb"""
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        self.original_docs = []
+        self.original_docs_length = 0
+        self.chunked_docs = []
+        self.chunked_docs_clean = []
+        collection_name = self.vecdb.config.collection_name
+        if collection_name is None:
+            return
+        try:
+            # Note we may have used a vecdb with a config.collection_name
+            # different from the agent's config.vecdb.collection_name!!
+            self.vecdb.delete_collection(collection_name)
+            self.vecdb = VectorStore.create(self.vecdb.config)
+        except Exception as e:
+            logger.warning(
+                f"""
                Error while deleting collection {collection_name}:
                {e}
                """
+            )
+
     def ingest(self) -> None:
         """
         Chunk + embed + store docs specified by self.config.doc_paths
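The new `clear()` method resets the agent's in-memory docs and drops the backing vector-store collection, recreating an empty store via `VectorStore.create` so the same agent can be reused; deletion failures are only logged, not raised. A sketch (the URL is hypothetical):

    agent.clear()  # wipe ingested docs and the vecdb collection
    agent.ingest_doc_paths(["https://example.com/fresh.pdf"])  # start over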
@@ -204,63 +247,316 @@ class DocChatAgent(ChatAgent):
             # do keyword and other non-vector searches
             if self.vecdb is None:
                 raise ValueError("VecDB not set")
-            self.chunked_docs = self.vecdb.get_all_documents()
-            # used for lexical similarity e.g. keyword search (bm25 etc)
-            self.chunked_docs_clean = [
-                Document(content=preprocess_text(d.content), metadata=d.metadata)
-                for d in self.chunked_docs
-            ]
+            self.setup_documents(filter=self.config.filter)
             return
-        urls, paths = get_urls_and_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
+
+    def ingest_doc_paths(
+        self,
+        paths: str | bytes | List[str | bytes],
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+        doc_type: str | DocumentType | None = None,
+    ) -> List[Document]:
+        """Split, ingest docs from specified paths,
+        do not add these to config.doc_paths.
+
+        Args:
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
+            metadata: List of metadata dicts, one for each path.
+                If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
+        Returns:
+            List of Document objects
+        """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
+        all_paths = paths
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
+        if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
+            metadata, list
+        ):
+            if isinstance(metadata, list):
+                idx2meta = {
+                    p: (
+                        m
+                        if isinstance(m, dict)
+                        else (isinstance(m, DocMetaData) and m.dict())
+                    )  # appease mypy
+                    for p, m in zip(idxs, metadata)
+                }
+            elif isinstance(metadata, dict):
+                idx2meta = {p: metadata for p in idxs}
+            else:
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            loader = URLLoader(urls=urls, parser=parser)
-            docs = loader.load()
-        if len(paths) > 0:
-            for p in paths:
-                path_docs = RepoLoader.get_documents(p, parser=parser)
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
+                url_docs = loader.load()
+                # update metadata of each doc with meta
+                for d in url_docs:
+                    d.metadata = d.metadata.copy(update=meta)
+                docs.extend(url_docs)
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
+                # update metadata of each doc with meta
+                for d in path_docs:
+                    d.metadata = d.metadata.copy(update=meta)
                 docs.extend(path_docs)
         n_docs = len(docs)
-        n_splits = self.ingest_docs(docs)
+        n_splits = self.ingest_docs(docs, split=self.config.split)
         if n_docs == 0:
-            return
+            return []
         n_urls = len(urls)
         n_paths = len(paths)
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths} paths into {n_splits} parts:
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-        print("\n".join(urls))
-        print("\n".join(paths))
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
+        return docs
 
-    def ingest_docs(self, docs: List[Document]) -> int:
+    def ingest_docs(
+        self,
+        docs: List[Document],
+        split: bool = True,
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+    ) -> int:
         """
         Chunk docs into pieces, map each chunk to vec-embedding, store in vec-db
+
+        Args:
+            docs: List of Document objects
+            split: Whether to split docs into chunks. Default is True.
+                If False, docs are treated as "chunks" and are not split.
+            metadata: List of metadata dicts, one for each doc, to augment
+                whatever metadata is already in the doc.
+                [ASSUME no conflicting keys between the two metadata dicts.]
+                If a single dict is passed in, it is used for all docs.
         """
-        self.original_docs = docs
+        if isinstance(metadata, list) and len(metadata) > 0:
+            for d, m in zip(docs, metadata):
+                d.metadata = d.metadata.copy(
+                    update=m if isinstance(m, dict) else m.dict()  # type: ignore
+                )
+        elif isinstance(metadata, dict):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata)
+        elif isinstance(metadata, DocMetaData):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata.dict())
+
+        self.original_docs.extend(docs)
         if self.parser is None:
             raise ValueError("Parser not set")
         for d in docs:
             if d.metadata.id in [None, ""]:
                 d.metadata.id = d._unique_hash_id()
-        docs = self.parser.split(docs)
-        self.chunked_docs = docs
-        self.chunked_docs_clean = [
-            Document(content=preprocess_text(d.content), metadata=d.metadata)
-            for d in self.chunked_docs
-        ]
+        if split:
+            docs = self.parser.split(docs)
+        else:
+            # treat each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")
+
+        # If any additional fields need to be added to content,
+        # add them as key=value pairs for all docs, before batching.
+        # This helps retrieval for table-like data.
+        # Note we need to do this at stage so that the embeddings
+        # are computed on the full content with these additional fields.
+        if len(self.config.add_fields_to_content) > 0:
+            fields = [
+                f for f in extract_fields(docs[0], self.config.add_fields_to_content)
+            ]
+            if len(fields) > 0:
+                for d in docs:
+                    key_vals = extract_fields(d, fields)
+                    d.content = (
+                        ",".join(f"{k}={v}" for k, v in key_vals.items())
+                        + ",content="
+                        + d.content
+                    )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
             self.vecdb.add_documents(batch)
         self.original_docs_length = self.doc_length(docs)
+        self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
 
+    @staticmethod
+    def document_compatible_dataframe(
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> Tuple[pd.DataFrame, List[str]]:
+        """
+        Convert dataframe so it is compatible with Document class:
+        - has "content" column
+        - has an "id" column to be used as Document.metadata.id
+
+        Args:
+            df: dataframe to convert
+            content: name of content column
+            metadata: list of metadata column names
+
+        Returns:
+            Tuple[pd.DataFrame, List[str]]: dataframe, metadata
+                - dataframe: dataframe with "content" column and "id" column
+                - metadata: list of metadata column names, including "id"
+        """
+        if content not in df.columns:
+            raise ValueError(
+                f"""
                Content column {content} not in dataframe,
                so we cannot ingest into the DocChatAgent.
                Please specify the `content` parameter as a suitable
                text-based column in the dataframe.
                """
+            )
+        if content != "content":
+            # rename content column to "content", leave existing column intact
+            df = df.rename(columns={content: "content"}, inplace=False)
+
+        actual_metadata = metadata.copy()
+        if "id" not in df.columns:
+            docs = dataframe_to_documents(df, content="content", metadata=metadata)
+            ids = [str(d.id()) for d in docs]
+            df["id"] = ids
+
+        if "id" not in actual_metadata:
+            actual_metadata += ["id"]
+
+        return df, actual_metadata
+
+    def ingest_dataframe(
+        self,
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> int:
+        """
+        Ingest a dataframe into vecdb.
+        """
+        self.from_dataframe = True
+        self.df_description = describe_dataframe(
+            df, filter_fields=self.config.filter_fields, n_vals=5
+        )
+        df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
+        docs = dataframe_to_documents(df, content="content", metadata=metadata)
+        # When ingesting a dataframe we will no longer do any chunking,
+        # so we mark each doc as a chunk.
+        # TODO - revisit this since we may still want to chunk large text columns
+        for d in docs:
+            d.metadata.is_chunk = True
+        return self.ingest_docs(docs)
+
+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
+    def setup_documents(
+        self,
+        docs: List[Document] = [],
+        filter: str | None = None,
+    ) -> None:
+        """
+        Setup `self.chunked_docs` and `self.chunked_docs_clean`
+        based on possible filter.
+        These will be used in various non-vector-based search functions,
+        e.g. self.get_similar_chunks_bm25(), self.get_fuzzy_matches(), etc.
+
+        Args:
+            docs: List of Document objects. This is empty when we are calling this
+                method after initial doc ingestion.
+            filter: Filter condition for various lexical/semantic search fns.
+        """
+        if filter is None and len(docs) > 0:
+            # no filter, so just use the docs passed in
+            self.chunked_docs.extend(docs)
+        else:
+            if self.vecdb is None:
+                raise ValueError("VecDB not set")
+            self.chunked_docs = self.vecdb.get_all_documents(where=filter or "")
+
+        self.chunked_docs_clean = [
+            Document(content=preprocess_text(d.content), metadata=d.metadata)
+            for d in self.chunked_docs
+        ]
+
+    def get_field_values(self, fields: list[str]) -> Dict[str, str]:
+        """Get string-listing of possible values of each filterable field,
+        e.g.
+        {
+            "genre": "crime, drama, mystery, ... (10 more)",
+            "certificate": "R, PG-13, PG, R",
+        }
+        """
+        field_values: Dict[str, Set[str]] = {}
+        # make empty set for each field
+        for f in fields:
+            field_values[f] = set()
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # get all documents and accumulate possible values of each field until 10
+        docs = self.vecdb.get_all_documents()  # only works for vecdbs that support this
+        for d in docs:
+            # extract fields from d
+            doc_field_vals = extract_fields(d, fields)
+            for field, val in doc_field_vals.items():
+                field_values[field].add(val)
+        # For each field make a string showing list of possible values,
+        # truncate to 20 values, and if there are more, indicate how many
+        # more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
+        field_values_list = {}
+        for f in fields:
+            vals = list(field_values[f])
+            n = len(vals)
+            remaining = n - 20
+            vals = vals[:20]
+            if n > 20:
+                vals.append(f"(...{remaining} more)")
+            # make a string of the values, ensure they are strings
+            field_values_list[f] = ", ".join(str(v) for v in vals)
+        return field_values_list
+
     def doc_length(self, docs: List[Document]) -> int:
         """
         Calc token-length of a list of docs
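Taken together, the new ingestion surface accepts URLs, file paths, raw bytes, and pandas DataFrames, with optional per-document metadata. A sketch of how these might be called (file names, columns, and `pdf_bytes` are illustrative):

    import pandas as pd

    # mix file paths, URLs and raw bytes in one call;
    # a single metadata dict applies to every resulting Document
    pdf_bytes = b"..."  # assume: a document already read in as bytes
    agent.ingest_doc_paths(
        ["notes.txt", "https://example.com/report.pdf", pdf_bytes],
        metadata={"project": "demo"},
    )

    # DataFrame rows become Documents: `content` names the text column,
    # `metadata` lists the columns carried into Document.metadata
    df = pd.DataFrame(
        {
            "description": ["a noir thriller", "a screwball comedy"],
            "genre": ["crime", "comedy"],
        }
    )
    agent.ingest_dataframe(df, content="description", metadata=["genre"])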
@@ -342,10 +638,9 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()
 
-    @no_type_check
     def llm_response(
         self,
         query: None | str | ChatDocument = None,
@@ -362,10 +657,55 @@ class DocChatAgent(ChatAgent):
             query_str = query_str[1:] if query_str is not None else None
             if self.llm is None:
                 raise ValueError("LLM not set")
-            with StreamingIfAllowed(self.llm):
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
                 response = super().llm_response(query_str)
             if query_str is not None:
-                self.update_dialog(query_str, response.content)
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
+            return response
+        if query_str == "":
+            return None
+        elif query_str == "?" and self.response is not None:
+            return self.justify_response()
+        elif (query_str.startswith(("summar", "?")) and self.response is None) or (
+            query_str == "??"
+        ):
+            return self.summarize_docs()
+        else:
+            self.callbacks.show_start_response(entity="llm")
+            response = self.answer_from_docs(query_str)
+            return ChatDocument(
+                content=response.content,
+                metadata=ChatDocMetaData(
+                    source=response.metadata.source,
+                    sender=Entity.LLM,
+                ),
+            )
+
+    async def llm_response_async(
+        self,
+        query: None | str | ChatDocument = None,
+    ) -> Optional[ChatDocument]:
+        apply_nest_asyncio()
+        if not self.llm_can_respond(query):
+            return None
+        query_str: str | None
+        if isinstance(query, ChatDocument):
+            query_str = query.content
+        else:
+            query_str = query
+        if query_str is None or query_str.startswith("!"):
+            # direct query to LLM
+            query_str = query_str[1:] if query_str is not None else None
+            if self.llm is None:
+                raise ValueError("LLM not set")
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
+                response = await super().llm_response_async(query_str)
+            if query_str is not None:
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
             return response
         if query_str == "":
             return None
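`llm_response_async` mirrors the synchronous path and calls `apply_nest_asyncio()` up front, so it also works where an event loop is already running (e.g. in a notebook). A usage sketch:

    import asyncio

    async def main() -> None:
        answer = await agent.llm_response_async("What do the docs say about pricing?")
        if answer is not None:
            print(answer.content)

    asyncio.run(main())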
@@ -376,6 +716,7 @@ class DocChatAgent(ChatAgent):
         ):
             return self.summarize_docs()
         else:
+            self.callbacks.show_start_response(entity="llm")
             response = self.answer_from_docs(query_str)
             return ChatDocument(
                 content=response.content,
@@ -407,7 +748,9 @@ class DocChatAgent(ChatAgent):
             ]
         )
 
-    def get_summary_answer(self, question: str, passages: List[Document]) -> Document:
+    def get_summary_answer(
+        self, question: str, passages: List[Document]
+    ) -> ChatDocument:
         """
         Given a question and a list of (possibly) doc snippets,
         generate an answer if possible
@@ -435,9 +778,6 @@ class DocChatAgent(ChatAgent):
         # 2 new LLMMessage objects:
         # one for `final_prompt`, and one for the LLM response
 
-        # TODO need to "forget" last two messages in message_history
-        # if we are not in conversation mode
-
         if self.config.conversation_mode:
             # respond with temporary context
             answer_doc = super()._llm_response_temp_context(question, final_prompt)
@@ -446,16 +786,23 @@ class DocChatAgent(ChatAgent):
 
         final_answer = answer_doc.content.strip()
         show_if_debug(final_answer, "SUMMARIZE_RESPONSE= ")
-        parts = final_answer.split("SOURCE:", maxsplit=1)
-        if len(parts) > 1:
-            content = parts[0].strip()
-            sources = parts[1].strip()
-        else:
+
+        if final_answer.startswith("SOURCE"):
+            # sometimes SOURCE may be shown first,
+            # in this case just use final_answer as-is for both content and source
             content = final_answer
-            sources = ""
-        return Document(
+            sources = final_answer
+        else:
+            parts = final_answer.split("SOURCE:", maxsplit=1)
+            if len(parts) > 1:
+                content = parts[0].strip()
+                sources = parts[1].strip()
+            else:
+                content = final_answer
+                sources = ""
+        return ChatDocument(
             content=content,
-            metadata=DocMetaData(
+            metadata=ChatDocMetaData(
                 source="SOURCE: " + sources,
                 sender=Entity.LLM,
                 cached=getattr(answer_doc.metadata, "cached", False),
@@ -465,7 +812,7 @@ class DocChatAgent(ChatAgent):
     def llm_hypothetical_answer(self, query: str) -> str:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with console.status("[cyan]LLM generating hypothetical answer..."):
+        with status("[cyan]LLM generating hypothetical answer..."):
             with StreamingIfAllowed(self.llm, False):
                 # TODO: provide an easy way to
                 # Adjust this prompt depending on context.
@@ -485,7 +832,7 @@ class DocChatAgent(ChatAgent):
     def llm_rephrase_query(self, query: str) -> List[str]:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with console.status("[cyan]LLM generating rephrases of query..."):
+        with status("[cyan]LLM generating rephrases of query..."):
             with StreamingIfAllowed(self.llm, False):
                 rephrases = self.llm_response_forget(
                     f"""
@@ -501,11 +848,11 @@ class DocChatAgent(ChatAgent):
     ) -> List[Tuple[Document, float]]:
         # find similar docs using bm25 similarity:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with console.status("[cyan]Searching for similar chunks using bm25..."):
-            if self.chunked_docs is None:
+        with status("[cyan]Searching for similar chunks using bm25..."):
+            if self.chunked_docs is None or len(self.chunked_docs) == 0:
                 logger.warning("No chunked docs; cannot use bm25-similarity")
                 return []
-            if self.chunked_docs_clean is None:
+            if self.chunked_docs_clean is None or len(self.chunked_docs_clean) == 0:
                 logger.warning("No cleaned chunked docs; cannot use bm25-similarity")
                 return []
             docs_scores = find_closest_matches_with_bm25(
@@ -519,7 +866,7 @@ class DocChatAgent(ChatAgent):
     def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with console.status("[cyan]Finding fuzzy matches in chunks..."):
+        with status("[cyan]Finding fuzzy matches in chunks..."):
             if self.chunked_docs is None:
                 logger.warning("No chunked docs; cannot use fuzzy matching")
                 return []
@@ -539,7 +886,7 @@ class DocChatAgent(ChatAgent):
     def rerank_with_cross_encoder(
         self, query: str, passages: List[Document]
     ) -> List[Document]:
-        with console.status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
+        with status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
             try:
                 from sentence_transformers import CrossEncoder
             except ImportError:
@@ -657,8 +1004,45 @@ class DocChatAgent(ChatAgent):
         """
         if self.vecdb is None or self.config.n_neighbor_chunks == 0:
             return docs_scores
+        if len(docs_scores) == 0:
+            return []
+        if set(docs_scores[0][0].__fields__) != {"content", "metadata"}:
+            # Do not add context window when there are other fields besides just
+            # content and metadata, since we do not know how to set those other fields
+            # for newly created docs with combined content.
+            return docs_scores
         return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
 
+    def get_semantic_search_results(
+        self,
+        query: str,
+        k: int = 10,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Get semantic search results from vecdb.
+        Args:
+            query (str): query to search for
+            k (int): number of results to return
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # Note: for dynamic filtering based on a query, users can
+        # use the `temp_update` context-manager to pass in a `filter` to self.config,
+        # e.g.:
+        # with temp_update(self.config, {"filter": "metadata.source=='source1'"}):
+        #     docs_scores = self.get_semantic_search_results(query, k=k)
+        # This avoids having pass the `filter` argument to every function call
+        # upstream of this one.
+        # The `temp_update` context manager is defined in
+        # `langroid/utils/pydantic_utils.py`
+        return self.vecdb.similar_texts_with_scores(
+            query,
+            k=k,
+            where=self.config.filter,
+        )
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
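Following the comment inside `get_semantic_search_results`, a per-query metadata filter can be applied by temporarily patching the config with `temp_update` (defined in `langroid/utils/pydantic_utils.py`), rather than threading a `filter` argument through every caller. A sketch based on that comment:

    from langroid.utils.pydantic_utils import temp_update

    # scope one retrieval call to a single source; the config is restored on exit
    with temp_update(agent.config, {"filter": "metadata.source=='source1'"}):
        docs_scores = agent.get_semantic_search_results("pricing details", k=5)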
@@ -695,21 +1079,21 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
 
-        with console.status("[cyan]Searching VecDB for relevant doc passages..."):
+        with status("[cyan]Searching VecDB for relevant doc passages..."):
             docs_and_scores: List[Tuple[Document, float]] = []
             for q in [query] + query_proxies:
-                docs_and_scores += self.vecdb.similar_texts_with_scores(
+                docs_and_scores += self.get_semantic_search_results(
                     q,
                     k=self.config.parsing.n_similar_docs * retrieval_multiple,
                 )
             # keep only docs with unique d.id()
             id2doc_score = {d.id(): (d, s) for d, s in docs_and_scores}
             docs_and_scores = list(id2doc_score.values())
-
-        passages = [
-            Document(content=d.content, metadata=d.metadata)
-            for (d, _) in docs_and_scores
-        ]
+        passages = [d for (d, _) in docs_and_scores]
+        # passages = [
+        #     Document(content=d.content, metadata=d.metadata)
+        #     for (d, _) in docs_and_scores
+        # ]
 
         if self.config.use_bm25_search:
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
@@ -771,7 +1155,7 @@ class DocChatAgent(ChatAgent):
         # Regardless of whether we are in conversation mode or not,
         # for relevant doc/chunk extraction, we must convert the query
         # to a standalone query to get more relevant results.
-        with console.status("[cyan]Converting to stand-alone query...[/cyan]"):
+        with status("[cyan]Converting to stand-alone query...[/cyan]"):
             with StreamingIfAllowed(self.llm, False):
                 query = self.llm.followup_to_standalone(self.dialog, query)
         print(f"[orange2]New query: {query}")
@@ -790,7 +1174,7 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return query, []
 
-        with console.status("[cyan]LLM Extracting verbatim passages..."):
+        with status("[cyan]LLM Extracting verbatim passages..."):
             with StreamingIfAllowed(self.llm, False):
                 # these are async calls, one per passage; turn off streaming
                 extracts = self.get_verbatim_extracts(query, passages)
@@ -814,8 +1198,15 @@ class DocChatAgent(ChatAgent):
             List[Document]: list of Documents containing extracts and metadata.
         """
         agent_cfg = self.config.relevance_extractor_config
+        if agent_cfg is None:
+            # no relevance extraction: simply return passages
+            return passages
+        if agent_cfg.llm is None:
+            # Use main DocChatAgent's LLM if not provided explicitly:
+            # this reduces setup burden on the user
+            agent_cfg.llm = self.config.llm
         agent_cfg.query = query
-        agent_cfg.segment_length = 1
+        agent_cfg.segment_length = self.config.extraction_granularity
         agent_cfg.llm.stream = False  # disable streaming for concurrent calls
 
         agent = RelevanceExtractorAgent(agent_cfg)
@@ -831,16 +1222,21 @@ class DocChatAgent(ChatAgent):
             input_map=lambda msg: msg.content,
             output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
         )
-        metadatas = [P.metadata for P in passages]
-        # return with metadata so we can use it downstream, e.g. to cite sources
-        return [
-            Document(content=e, metadata=m)
-            for e, m in zip(extracts, metadatas)
-            if (e != NO_ANSWER and len(e) > 0)
-        ]
 
-    @no_type_check
-    def answer_from_docs(self, query: str) -> Document:
+        # Caution: Retain ALL other fields in the Documents (which could be
+        # other than just `content` and `metadata`), while simply replacing
+        # `content` with the extracted portions
+        passage_extracts = []
+        for p, e in zip(passages, extracts):
+            if e == NO_ANSWER or len(e) == 0:
+                continue
+            p_copy = p.copy()
+            p_copy.content = e
+            passage_extracts.append(p_copy)
+
+        return passage_extracts
+
+    def answer_from_docs(self, query: str) -> ChatDocument:
         """
         Answer query based on relevant docs from the VecDB
 
@@ -850,24 +1246,38 @@ class DocChatAgent(ChatAgent):
         Returns:
             Document: answer
         """
-        response = Document(
+        response = ChatDocument(
             content=NO_ANSWER,
-            metadata=DocMetaData(
+            metadata=ChatDocMetaData(
                 source="None",
+                sender=Entity.LLM,
             ),
         )
         # query may be updated to a stand-alone version
         query, extracts = self.get_relevant_extracts(query)
         if len(extracts) == 0:
             return response
+        if self.llm is None:
+            raise ValueError("LLM not set")
+        if self.config.retrieve_only:
+            # only return extracts, skip LLM-based summary answer
+            meta = dict(
+                sender=Entity.LLM,
+            )
+            # copy metadata from first doc, unclear what to do here.
+            meta.update(extracts[0].metadata)
+            return ChatDocument(
+                content="\n\n".join([e.content for e in extracts]),
+                metadata=ChatDocMetaData(**meta),
+            )
         with ExitStack() as stack:
             # conditionally use Streaming or rich console context
             cm = (
                 StreamingIfAllowed(self.llm)
                 if settings.stream
-                else (console.status("LLM Generating final answer..."))
+                else (status("LLM Generating final answer..."))
             )
-            stack.enter_context(cm)
+            stack.enter_context(cm)  # type: ignore
             response = self.get_summary_answer(query, extracts)
 
         self.update_dialog(query, response.content)
@@ -881,7 +1291,7 @@ class DocChatAgent(ChatAgent):
         """Summarize all docs"""
         if self.llm is None:
             raise ValueError("LLM not set")
-        if self.original_docs is None:
+        if len(self.original_docs) == 0:
             logger.warning(
                 """
                 No docs to summarize! Perhaps you are re-using a previously
@@ -910,19 +1320,22 @@ class DocChatAgent(ChatAgent):
         )
         prompt = f"""
         {instruction}
+
+        FULL TEXT:
         {full_text}
         """.strip()
         with StreamingIfAllowed(self.llm):
-            summary = Agent.llm_response(self, prompt)
-        return summary  # type: ignore
+            summary = ChatAgent.llm_response(self, prompt)
+        return summary
 
-    def justify_response(self) -> None:
+    def justify_response(self) -> ChatDocument | None:
         """Show evidence for last response"""
         if self.response is None:
             print("[magenta]No response yet")
-            return
+            return None
         source = self.response.metadata.source
         if len(source) > 0:
             print("[magenta]" + source)
         else:
             print("[magenta]No source found")
+        return None