langroid 0.1.101__py3-none-any.whl → 0.1.102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/batch.py +2 -2
- langroid/agent/special/doc_chat_agent.py +48 -3
- langroid/agent/special/retriever_agent.py +1 -1
- langroid/mytypes.py +10 -4
- langroid/parsing/document_parser.py +1 -0
- langroid/parsing/parser.py +62 -31
- langroid/parsing/search.py +54 -49
- langroid/parsing/utils.py +26 -0
- langroid/utils/algorithms/graph.py +49 -0
- langroid/utils/configuration.py +13 -0
- langroid/utils/pydantic_utils.py +3 -1
- langroid/vector_store/base.py +157 -1
- langroid/vector_store/chromadb.py +12 -19
- langroid/vector_store/meilisearch.py +1 -0
- langroid/vector_store/momento.py +1 -0
- langroid/vector_store/qdrantdb.py +10 -4
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/METADATA +1 -1
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/RECORD +20 -19
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/LICENSE +0 -0
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/WHEEL +0 -0
langroid/agent/batch.py
CHANGED
@@ -9,7 +9,7 @@ from rich.console import Console
 from langroid.agent.base import Agent
 from langroid.agent.chat_document import ChatDocument
 from langroid.agent.task import Task
-from langroid.utils.configuration import
+from langroid.utils.configuration import quiet_mode, settings
 from langroid.utils.logging import setup_colored_logging

 console = Console(quiet=settings.quiet)
@@ -53,7 +53,7 @@ def run_batch_tasks(
         return output_map(result)

     async def _do_all() -> List[Any]:
-        with
+        with quiet_mode():
             return await asyncio.gather(  # type: ignore
                 *(_do_task(input, i) for i, input in enumerate(inputs))
             )
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -66,6 +66,10 @@ You are a helpful assistant, helping me understand a collection of documents.
 """


+class DocChunkMetqdata(DocMetaData):
+    id: str
+
+
 class DocChatAgentConfig(ChatAgentConfig):
     """
     Attributes:
@@ -95,6 +99,7 @@ class DocChatAgentConfig(ChatAgentConfig):
     # It is False by default; its benefits depends on the context.
     hypothetical_answer: bool = False
     n_query_rephrases: int = 0
+    n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
     cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -122,6 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         min_chunk_chars=200,
         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
         n_similar_docs=3,
+        n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
         pdf=PdfParsingConfig(
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
@@ -195,6 +201,7 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
         self.chunked_docs = self.vecdb.get_all_documents()
+        # used for lexical similarity e.g. keyword search (bm25 etc)
         self.chunked_docs_clean = [
             Document(content=preprocess_text(d.content), metadata=d.metadata)
             for d in self.chunked_docs
@@ -509,9 +516,13 @@ class DocChatAgent(ChatAgent):
         if self.chunked_docs is None:
             logger.warning("No chunked docs; cannot use fuzzy matching")
             return []
+        if self.chunked_docs_clean is None:
+            logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
+            return []
         fuzzy_match_docs = find_fuzzy_matches_in_docs(
             query,
             self.chunked_docs,
+            self.chunked_docs_clean,
             k=self.config.parsing.n_similar_docs * multiple,
             words_before=1000,
             words_after=1000,
@@ -546,6 +557,36 @@ class DocChatAgent(ChatAgent):
         ]
         return passages

+    def add_context_window(
+        self,
+        docs_scores: List[Tuple[Document, float]],
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a window_ids field indicating
+        the ids of the chunks around the current chunk.
+        These window_ids may overlap, so we
+        - gather connected-components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata,
+
+        We may have stored a longer set of window_ids than we need.
+        We just want `neighbors` on each side of the center of window_ids.
+
+        Args:
+            docs (List[Document]): List of documents to add context window to.
+            scores (List[float]): List of match scores for each document.
+            neighbors (int, optional): Number of neighbors on "each side" of match to
+                retrieve. Defaults to 0.
+                "Each side" here means before and after the match,
+                in the original text.
+
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        if self.vecdb is None or self.config.n_neighbor_chunks == 0:
+            return docs_scores
+        return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
@@ -560,10 +601,11 @@ class DocChatAgent(ChatAgent):
         dynamically retrieved based on a window around a lexical match.

         These are the steps (some optional based on config):
-        - vector-embedding distance, from vecdb
-        - bm25-ranking (keyword similarity)
+        - semantic search based on vector-embedding distance, from vecdb
+        - lexical search using bm25-ranking (keyword similarity)
         - fuzzy matching (keyword similarity)
-        - re-ranking of doc-chunks using cross-encoder,
+        - re-ranking of doc-chunks by relevance to query, using cross-encoder,
+          and pick top k

         Args:
             query: original query (assumed to be in stand-alone form)
@@ -612,6 +654,9 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return []

+        passages_scores = [(p, 0.0) for p in passages]
+        passages_scores = self.add_context_window(passages_scores)
+        passages = [p for p, _ in passages_scores]
         # now passages can potentially have a lot of doc chunks,
         # so we re-rank them using a cross-encoder scoring model
         # https://www.sbert.net/examples/applications/retrieve_rerank
langroid/mytypes.py
CHANGED
@@ -26,6 +26,8 @@ class DocMetaData(BaseModel):

     source: str = "context"
     is_chunk: bool = False  # if it is a chunk, don't split
+    id: str | None = None  # unique id for the document
+    window_ids: List[str] = []  # for RAG: ids of chunks around this one

     def dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
@@ -51,9 +53,10 @@ class Document(BaseModel):
     content: str
     metadata: DocMetaData

-
+    @staticmethod
+    def hash_id(doc: str) -> str:
         # Encode the document as UTF-8
-        doc_utf8 = str(
+        doc_utf8 = str(doc).encode("utf-8")

         # Create a SHA256 hash object
         sha256_hash = hashlib.sha256()
@@ -69,8 +72,11 @@

         return str(hash_uuid)

-    def
-
+    def _unique_hash_id(self) -> str:
+        return self.hash_id(str(self))
+
+    def id(self) -> str:
+        if hasattr(self.metadata, "id") and self.metadata.id is not None:
             return self.metadata.id
         else:
             return self._unique_hash_id()
langroid/parsing/parser.py
CHANGED
@@ -1,6 +1,5 @@
 import logging
 from enum import Enum
-from functools import reduce
 from typing import List

 import tiktoken
@@ -36,6 +35,7 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 0  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
@@ -51,17 +51,42 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)

+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks are consecutive parts of a single original document.
+        Add window_ids in metadata"""
+
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        ids = [Document.hash_id(str(c)) for c in chunks]
+
+        k = self.config.n_neighbor_ids
+        n = len(ids)
+        window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+        for i, c in enumerate(chunks):
+            if c.content.strip() == "":
+                continue
+            c.metadata.window_ids = window_ids[i]
+            c.metadata.id = ids[i]
+            c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-
-
-
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
@@ -95,28 +120,37 @@
         return final_chunks + chunks

     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
                 )
-
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-
+            self.add_window_ids(chunk_docs)
+            final_chunks += chunk_docs
+
+        return final_chunks

     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-            return
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def chunk_tokens(
         self,
@@ -198,11 +232,8 @@
             # Increment the number of chunks
             num_chunks += 1

-        #
-        remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-        if len(remaining_text) > self.config.discard_chunk_chars:
-            chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks

         return chunks

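Illustration (plain Python, not from the package) of the windows `add_window_ids` stores: with `n_neighbor_ids = k`, each chunk keeps the ids of up to k neighbors on each side of itself.

    ids = ["id0", "id1", "id2", "id3"]
    k = 1
    n = len(ids)
    window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
    # [['id0', 'id1'], ['id0', 'id1', 'id2'], ['id1', 'id2', 'id3'], ['id2', 'id3']]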
langroid/parsing/search.py
CHANGED
@@ -7,7 +7,6 @@ See tests for examples: tests/main/test_string_search.py
 """

 import difflib
-import re
 from typing import List, Tuple

 from nltk.corpus import stopwords
@@ -24,6 +23,7 @@ from .utils import download_nltk_resource
 def find_fuzzy_matches_in_docs(
     query: str,
     docs: List[Document],
+    docs_clean: List[Document],
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
@@ -49,45 +49,45 @@ def find_fuzzy_matches_in_docs(
         return []
     best_matches = process.extract(
         query,
-        [d.content for d in
+        [d.content for d in docs_clean],
         limit=k,
         scorer=fuzz.partial_ratio,
     )

     real_matches = [m for m, score in best_matches if score > 50]
-
-
-    for
-
-
-
-        words_in_text = doc.content.split()
-        first_word_idx = next(
-            (
-                i
-                for i, word in enumerate(words_in_text)
-                if word.startswith(words[0])
-            ),
-            -1,
-        )
-        if words_before is None:
-            words_before = len(words_in_text)
-        if words_after is None:
-            words_after = len(words_in_text)
-        if first_word_idx != -1:
-            start_idx = max(0, first_word_idx - words_before)
-            end_idx = min(
-                len(words_in_text),
-                first_word_idx + len(words) + words_after,
-            )
-            doc_match = Document(
-                content=" ".join(words_in_text[start_idx:end_idx]),
-                metadata=doc.metadata,
-            )
-            results.append(doc_match)
+    # find the original docs that corresponding to the matches
+    orig_doc_matches = []
+    for i, m in enumerate(real_matches):
+        for j, doc_clean in enumerate(docs_clean):
+            if m in doc_clean.content:
+                orig_doc_matches.append(docs[j])
                 break
+    if words_after is None and words_before is None:
+        return orig_doc_matches
+
+    contextual_matches = []
+    for match in orig_doc_matches:
+        choice_text = match.content
+        contexts = []
+        while choice_text != "":
+            context, start_pos, end_pos = get_context(
+                query, choice_text, words_before, words_after
+            )
+            if context == "" or end_pos == 0:
+                break
+            contexts.append(context)
+            words = choice_text.split()
+            end_pos = min(end_pos, len(words))
+            choice_text = " ".join(words[end_pos:])
+        if len(contexts) > 0:
+            contextual_matches.append(
+                Document(
+                    content=" ... ".join(contexts),
+                    metadata=match.metadata,
+                )
+            )

-    return
+    return contextual_matches


 def preprocess_text(text: str) -> str:
@@ -171,7 +171,7 @@ def get_context(
     text: str,
     words_before: int | None = 100,
     words_after: int | None = 100,
-) -> str:
+) -> Tuple[str, int, int]:
     """
     Returns a portion of text containing the best approximate match of the query,
     including b words before and a words after the match.
@@ -185,7 +185,9 @@
     Returns:
         str: A string containing b words before, the match, and a words after
             the best approximate match position of the query in the text. If no
-            match is found, returns
+            match is found, returns empty string.
+        int: The start position of the match in the text.
+        int: The end position of the match in the text.

     Example:
     >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
@@ -193,26 +195,29 @@
     """
     if words_after is None and words_before is None:
         # return entire text since we're not asked to return a bounded context
-        return text
+        return text, 0, 0
+
+    # make sure there is a good enough fu
+    if fuzz.partial_ratio(query, text) < 70:
+        return "", 0, 0

     sequence_matcher = difflib.SequenceMatcher(None, text, query)
     match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))

     if match.size == 0:
-        return "
-
-
-
-
-
-
-
-
-
-
-        )
+        return "", 0, 0
+
+    segments = text.split()
+    n_segs = len(segments)
+
+    start_segment_pos = len(text[: match.a].split())
+
+    words_before = words_before or n_segs
+    words_after = words_after or n_segs
+    start_pos = max(0, start_segment_pos - words_before)
+    end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))

-    return " ".join(
+    return " ".join(segments[start_pos:end_pos]), start_pos, end_pos


 def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
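Usage sketch of the new 3-tuple return of `get_context` (assuming the version shown above): the context string plus the start/end word positions of the returned slice.

    from langroid.parsing.search import get_context

    context, start, end = get_context(
        "apple",
        "The quick brown fox jumps over the apple.",
        words_before=3,
        words_after=2,
    )
    # context is roughly "jumps over the apple." with (start, end) bounding that
    # word range; if no good-enough fuzzy match is found, ("", 0, 0) is returned.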
langroid/parsing/utils.py
CHANGED
@@ -165,6 +165,32 @@ def parse_number_range_list(specs: str) -> List[int]:
     return sorted(list(spec_indices))


+def strip_k(s: str, k: int = 2) -> str:
+    """
+    Strip any leading and trailing whitespaces from the input text beyond length k.
+    This is useful for removing leading/trailing whitespaces from a text while
+    preserving paragraph structure.
+
+    Args:
+        s (str): The input text.
+        k (int): The number of leading and trailing whitespaces to retain.
+
+    Returns:
+        str: The text with leading and trailing whitespaces removed beyond length k.
+    """
+
+    # Count leading and trailing whitespaces
+    leading_count = len(s) - len(s.lstrip())
+    trailing_count = len(s) - len(s.rstrip())
+
+    # Determine how many whitespaces to retain
+    leading_keep = min(leading_count, k)
+    trailing_keep = min(trailing_count, k)
+
+    # Use slicing to get the desired output
+    return s[leading_count - leading_keep : len(s) - (trailing_count - trailing_keep)]
+
+
 def clean_whitespace(text: str) -> str:
     """Remove extra whitespace from the input text, while preserving
     paragraph structure.
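Usage sketch for `strip_k` (assuming the definition above): at most `k` whitespace characters are kept on each end.

    from langroid.parsing.utils import strip_k

    s = "\n\n\n  hello  \n\n\n"           # 5 leading and 5 trailing whitespace chars
    assert strip_k(s, k=2) == "  hello  "  # only 2 kept on each side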
langroid/utils/algorithms/graph.py
ADDED
@@ -0,0 +1,49 @@
+"""
+Graph algos.
+"""
+
+from typing import List, no_type_check
+
+import numpy as np
+
+
+@no_type_check
+def topological_sort(order: np.array) -> List[int]:
+    """
+    Given a directed adjacency matrix, return a topological sort of the nodes.
+    order[i,j] = -1 means there is an edge from i to j.
+    order[i,j] = 0 means there is no edge from i to j.
+    order[i,j] = 1 means there is an edge from j to i.
+
+    Args:
+        order (np.array): The adjacency matrix.
+
+    Returns:
+        List[int]: The topological sort of the nodes.
+
+    """
+    n = order.shape[0]
+
+    # Calculate the in-degrees
+    in_degree = [0] * n
+    for i in range(n):
+        for j in range(n):
+            if order[i, j] == -1:
+                in_degree[j] += 1
+
+    # Initialize the queue with nodes of in-degree 0
+    queue = [i for i in range(n) if in_degree[i] == 0]
+    result = []
+
+    while queue:
+        node = queue.pop(0)
+        result.append(node)
+
+        for i in range(n):
+            if order[node, i] == -1:
+                in_degree[i] -= 1
+                if in_degree[i] == 0:
+                    queue.append(i)
+
+    assert len(result) == n, "Cycle detected"
+    return result
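Usage sketch for the new `topological_sort` helper, with a hypothetical 3-node chain encoded in the convention documented above (order[i, j] = -1 means an edge from i to j):

    import numpy as np

    from langroid.utils.algorithms.graph import topological_sort

    order = np.zeros((3, 3), dtype=np.int8)
    order[0, 1], order[1, 0] = -1, 1  # edge 0 -> 1
    order[1, 2], order[2, 1] = -1, 1  # edge 1 -> 2
    print(topological_sort(order))    # [0, 1, 2]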
langroid/utils/configuration.py
CHANGED
@@ -71,6 +71,19 @@ def temporary_settings(temp_settings: Settings) -> Iterator[None]:
     settings.__dict__.update(original_settings.__dict__)


+@contextmanager
+def quiet_mode() -> Iterator[None]:
+    """Temporarily set quiet=True in global settings and restore afterward."""
+    original_quiet = settings.quiet
+
+    set_global(Settings(quiet=True))
+
+    try:
+        yield
+    finally:
+        settings.quiet = original_quiet
+
+
 def set_env(settings: BaseSettings) -> None:
     """
     Set environment variables from a BaseSettings instance
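Usage sketch for `quiet_mode` (this is how the batch.py change above silences per-task output): global quiet is forced on inside the block and the previous value is restored on exit.

    from langroid.utils.configuration import quiet_mode

    with quiet_mode():
        ...  # noisy agent/task calls go here; console output is suppressed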
langroid/utils/pydantic_utils.py
CHANGED
@@ -79,7 +79,9 @@ def flatten_pydantic_model(
         current_model, current_prefix = models_to_process.pop()

         for name, field in current_model.__fields__.items():
-            if
+            if isinstance(field.outer_type_, type) and issubclass(
+                field.outer_type_, BaseModel
+            ):
                 new_prefix = (
                     f"{current_prefix}{name}__" if current_prefix else f"{name}__"
                 )
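Sketch of why the added `isinstance` guard matters: `issubclass()` raises `TypeError` for non-class field types such as `List[str]`, so the code must first confirm the outer type is a real class before testing it against `BaseModel`.

    from typing import List

    outer = List[str]
    print(isinstance(outer, type))  # False -> the BaseModel check is safely skipped
    # issubclass(outer, BaseModel)  # would raise TypeError without the guard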
langroid/vector_store/base.py
CHANGED
@@ -1,12 +1,16 @@
+import copy
 import logging
 from abc import ABC, abstractmethod
-from
+from math import ceil
+from typing import Dict, List, Optional, Sequence, Tuple

+import numpy as np
 from pydantic import BaseSettings

 from langroid.embedding_models.base import EmbeddingModelsConfig
 from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.mytypes import Document
+from langroid.utils.algorithms.graph import topological_sort
 from langroid.utils.configuration import settings
 from langroid.utils.output.printing import print_long_text

@@ -130,8 +134,160 @@ class VectorStore(ABC):
         k: int = 1,
         where: Optional[str] = None,
     ) -> List[Tuple[Document, float]]:
+        """
+        Find k most similar texts to the given text, in terms of vector distance metric
+        (e.g., cosine similarity).
+
+        Args:
+            text (str): The text to find similar texts for.
+            k (int, optional): Number of similar texts to retrieve. Defaults to 1.
+            where (Optional[str], optional): Where clause to filter the search.
+
+        Returns:
+            List[Tuple[Document,float]]: List of (Document, score) tuples.
+
+        """
         pass

+    def add_context_window(
+        self, docs_scores: List[Tuple[Document, float]], neighbors: int = 0
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a window_ids field indicating
+        the ids of the chunks around the current chunk.
+        These window_ids may overlap, so we
+        - gather connected-components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata,
+
+        We may have stored a longer set of window_ids than we need.
+        We just want `neighbors` on each side of the center of window_ids.
+
+        Args:
+            docs (List[Document]): List of documents to add context window to.
+            scores (List[float]): List of match scores for each document.
+            neighbors (int, optional): Number of neighbors on "each side" of match to
+                retrieve. Defaults to 0.
+                "Each side" here means before and after the match,
+                in the original text.
+
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        # We return a larger context around each match, i.e.
+        # a window of `neighbors` on each side of the match.
+        docs = [d for d, s in docs_scores]
+        scores = [s for d, s in docs_scores]
+        if neighbors == 0:
+            return docs_scores
+        doc_chunks = [d for d in docs if d.metadata.is_chunk]
+        if len(doc_chunks) == 0:
+            return docs_scores
+        window_ids_list = []
+        id2metadata = {}
+        # id -> highest score of a doc it appears in
+        id2max_score: Dict[int | str, float] = {}
+        for i, d in enumerate(docs):
+            window_ids = d.metadata.window_ids
+            id2metadata.update({id: d.metadata for id in window_ids})
+
+            id2max_score.update(
+                {id: max(id2max_score.get(id, 0), scores[i]) for id in window_ids}
+            )
+            n = len(window_ids)
+            chunk_idx = window_ids.index(d.id())
+            neighbor_ids = window_ids[
+                max(0, chunk_idx - neighbors) : min(n, chunk_idx + neighbors + 1)
+            ]
+            window_ids_list += [neighbor_ids]
+
+        # window_ids could be from different docs,
+        # and they may overlap, so we first remove overlaps
+        window_ids_list = self.remove_overlaps(window_ids_list)
+        final_docs = []
+        final_scores = []
+        for w in window_ids_list:
+            metadata = copy.deepcopy(id2metadata[w[0]])
+            metadata.window_ids = w
+            document = Document(
+                content=" ".join([d.content for d in self.get_documents_by_ids(w)]),
+                metadata=metadata,
+            )
+            # make a fresh id since content is in general different
+            document.metadata.id = document.hash_id(document.content)
+            final_docs += [document]
+            final_scores += [max(id2max_score[id] for id in w)]
+        return list(zip(final_docs, final_scores))
+
+    @staticmethod
+    def remove_overlaps(windows: List[List[str]]) -> List[List[str]]:
+        """
+        Given a collection of windows, where each window is a sequence of ids,
+        identify groups of overlapping windows, and for each overlapping k-group,
+        split the ids into k roughly equal sequences.
+
+        Args:
+            windows (List[int|str]): List of windows, where each window is a
+                sequence of ids.
+
+        Returns:
+            List[int|str]: List of windows, where each window is a sequence of ids,
+                and no two windows overlap.
+        """
+        ids = set(id for w in windows for id in w)
+        # id -> {win -> # pos}
+        id2win2pos: Dict[str, Dict[int, int]] = {id: {} for id in ids}
+
+        for i, w in enumerate(windows):
+            for j, id in enumerate(w):
+                id2win2pos[id][i] = j
+
+        n = len(windows)
+        # relation between windows:
+        order = np.zeros((n, n), dtype=np.int8)
+        for i, w in enumerate(windows):
+            for j, x in enumerate(windows):
+                if i == j:
+                    continue
+                if len(set(w).intersection(x)) == 0:
+                    continue
+                id = list(set(w).intersection(x))[0]  # any common id
+                if id2win2pos[id][i] > id2win2pos[id][j]:
+                    order[i, j] = -1  # win i is before win j
+                else:
+                    order[i, j] = 1  # win i is after win j
+
+        # find groups of windows that overlap, like connected components in a graph
+        groups = [[0]]
+        for i in range(1, n):
+            found = False
+            for g in groups:
+                if any(order[i, j] != 0 for j in g):
+                    g.append(i)
+                    found = True
+                    break
+            if not found:
+                groups.append([i])
+
+        # split each group into roughly equal parts
+        new_windows = []
+        max_window_len = max(len(w) for w in windows)
+        for g in groups:
+            # find total ordering among windows in group based on order matrix
+            # (this is a topological sort)
+            _g = np.array(g)
+            order_matrix = order[_g][:, _g]
+            ordered_window_indices = topological_sort(order_matrix)
+            ordered_window_ids = [windows[i] for i in _g[ordered_window_indices]]
+            flattened = [id for w in ordered_window_ids for id in w]
+            flattened_deduped = list(dict.fromkeys(flattened))
+            # split into k parts where k is the smallest integer such that
+            # each part has length <= max_window_len
+            k = max(1, int(ceil(len(flattened_deduped) / max_window_len)))
+            new_windows += np.array_split(flattened_deduped, k)
+
+        return [w.tolist() for w in new_windows]
+
     @abstractmethod
     def get_all_documents(self) -> List[Document]:
         """
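Illustration (plain Python/NumPy, not from the package) of the de-overlapping step in `remove_overlaps`: overlapping windows are concatenated in order, de-duplicated, then split back into roughly equal parts no longer than the original window size.

    import numpy as np

    w1, w2 = ["a", "b", "c"], ["b", "c", "d"]                 # overlapping windows
    flattened = [i for w in (w1, w2) for i in w]              # a b c b c d
    deduped = list(dict.fromkeys(flattened))                  # a b c d
    k = int(np.ceil(len(deduped) / max(len(w1), len(w2))))    # 2 parts
    print([p.tolist() for p in np.array_split(deduped, k)])   # [['a', 'b'], ['c', 'd']]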
langroid/vector_store/chromadb.py
CHANGED
@@ -109,14 +109,17 @@ class ChromaDB(VectorStore):
         if documents is None:
             return
         contents: List[str] = [document.content for document in documents]
-        metadatas
-
-
+        # convert metadatas to dicts so chroma can handle them
+        metadata_dicts: List[dict[str, Any]] = [d.metadata.dict() for d in documents]
+        for m in metadata_dicts:
+            # chroma does not handle non-atomic types in metadata
+            m["window_ids"] = ",".join(m["window_ids"])
+
         ids = [str(d.id()) for d in documents]
         self.collection.add(
             # embedding_models=embedding_models,
             documents=contents,
-            metadatas=
+            metadatas=metadata_dicts,
             ids=ids,
         )

@@ -145,7 +148,8 @@ class ChromaDB(VectorStore):
             include=["documents", "distances", "metadatas"],
         )
         docs = self._docs_from_results(results)
-
+        # chroma distances are 1 - cosine.
+        scores = [1 - s for s in results["distances"][0]]
         return list(zip(docs, scores))

     def _docs_from_results(self, results: Dict[str, Any]) -> List[Document]:
@@ -164,22 +168,11 @@ class ChromaDB(VectorStore):
             for i, c in enumerate(contents):
                 print_long_text("red", "italic red", f"MATCH-{i}", c)
         metadatas = results["metadatas"][0]
+        for m in metadatas:
+            # restore the stringified list of window_ids into the original List[str]
+            m["window_ids"] = m["window_ids"].split(",")
         docs = [
             Document(content=d, metadata=DocMetaData(**m))
             for d, m in zip(contents, metadatas)
         ]
         return docs
-
-
-# Example usage and testing
-# chroma_db = ChromaDB.from_documents(
-#     collection_name="all-my-documents",
-#     documents=["doc1000101", "doc288822"],
-#     metadatas=[{"style": "style1"}, {"style": "style2"}],
-#     ids=["uri9", "uri10"]
-# )
-# results = chroma_db.query(
-#     query_texts=["This is a query document"],
-#     n_results=2
-# )
-# print(results)
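Round-trip sketch of the window_ids handling above: Chroma metadata values must be atomic, so the List[str] is stored as a comma-joined string and split back on retrieval.

    window_ids = ["id1", "id2", "id3"]
    stored = ",".join(window_ids)        # "id1,id2,id3"
    assert stored.split(",") == window_ids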
langroid/vector_store/meilisearch.py
CHANGED
@@ -263,6 +263,7 @@ class MeiliSearch(VectorStore):
         text: str,
         k: int = 20,
         where: Optional[str] = None,
+        neighbors: int = 0,  # ignored
     ) -> List[Tuple[Document, float]]:
         filter = [] if where is None else where
         if self.config.collection_name is None:
langroid/vector_store/momento.py
CHANGED
@@ -222,6 +222,7 @@ class MomentoVI(VectorStore):
         text: str,
         k: int = 1,
         where: Optional[str] = None,
+        neighbors: int = 0,  # ignored
     ) -> List[Tuple[Document, float]]:
         if self.config.collection_name is None:
             raise ValueError("No collection name set, cannot search")
langroid/vector_store/qdrantdb.py
CHANGED
@@ -244,7 +244,11 @@ class QdrantDB(VectorStore):
             with_vectors=False,
             with_payload=True,
         )
-
+        # Note the records may NOT be in the order of the ids,
+        # so we re-order them here.
+        id2payload = {record.id: record.payload for record in records}
+        ordered_payloads = [id2payload[id] for id in _ids]
+        docs = [Document(**payload) for payload in ordered_payloads]  # type: ignore
         return docs

     def similar_texts_with_scores(
@@ -252,6 +256,7 @@ class QdrantDB(VectorStore):
         text: str,
         k: int = 1,
         where: Optional[str] = None,
+        neighbors: int = 0,
     ) -> List[Tuple[Document, float]]:
         embedding = self.embedding_fn([text])[0]
         # TODO filter may not work yet
@@ -268,7 +273,7 @@ class QdrantDB(VectorStore):
                 exact=False,  # use Apx NN, not exact NN
             ),
         )
-        scores = [match.score for match in search_result]
+        scores = [match.score for match in search_result if match is not None]
         docs = [
             Document(**(match.payload))  # type: ignore
             for match in search_result
@@ -277,8 +282,9 @@ class QdrantDB(VectorStore):
         if len(docs) == 0:
             logger.warning(f"No matches found for {text}")
             return []
-        if settings.debug:
-            logger.info(f"Found {len(docs)} matches, max score: {max(scores)}")
         doc_score_pairs = list(zip(docs, scores))
+        max_score = max(ds[1] for ds in doc_score_pairs)
+        if settings.debug:
+            logger.info(f"Found {len(doc_score_pairs)} matches, max score: {max_score}")
         self.show_if_debug(doc_score_pairs)
         return doc_score_pairs
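Sketch of the re-ordering fix above (hypothetical ids/payloads): retrieved records are keyed by id so the returned documents follow the order of the requested ids rather than Qdrant's return order.

    ids = ["b", "a", "c"]
    records = [("a", {"n": 1}), ("c", {"n": 3}), ("b", {"n": 2})]  # unordered
    id2payload = {rid: payload for rid, payload in records}
    print([id2payload[i] for i in ids])  # [{'n': 2}, {'n': 1}, {'n': 3}]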
{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 langroid/__init__.py,sha256=-AWkFhhW0b0paHQ11SORyIVPnXv0nyT2X_0_xh3zLjw,408
 langroid/agent/__init__.py,sha256=ZqDw3Ktw7XGDl6mC8DN61F71V4ckf0rBoEOydH9l6C4,428
 langroid/agent/base.py,sha256=NjRf_y5ymZqpFlXh2sK94lcJRJbrBhw5pd1_Qe1Da_E,30151
-langroid/agent/batch.py,sha256=
+langroid/agent/batch.py,sha256=p5UPdvRn6QOpb3V4B517nPCF7nZemKk7_9YlJ7jR7w4,5450
 langroid/agent/chat_agent.py,sha256=qjCwvR7i9DtonTmm1d1mbBHN4aW0LzxABAL-2JfGcF8,33548
 langroid/agent/chat_document.py,sha256=k7Klav3FIBTf2w95bQtxgqBrf2fMo1ydSlklQvv4RCg,6252
 langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/special/__init__.py,sha256=ciNhdoIIjFxNk-5xcy8H76A3d-TldbIYaFexlgfN-2A,575
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=ko6_oYJuv70zQ0mvLXcQMj_ZB1UVE8bw2V2Ng-FoVoI,33425
 langroid/agent/special/recipient_validator_agent.py,sha256=R3Rit93BNWQar_9stuDBGzmLr2W-IYOQ7oq-tlNNlps,6035
 langroid/agent/special/relevance_extractor_agent.py,sha256=JU52PbY5FO72kfnA902-UKzVgxExndlwEC7Lb-XqDNI,4348
-langroid/agent/special/retriever_agent.py,sha256=
+langroid/agent/special/retriever_agent.py,sha256=ze8jXJW9A_twsrRXVECAQCYicfjm8-a6qv1vDk41AAc,6573
 langroid/agent/special/sql/__init__.py,sha256=3kR5nC0wnYIzmMrr9L8RJa7JAJpbwBLx7KKygiwz0v0,111
 langroid/agent/special/sql/sql_chat_agent.py,sha256=Ua_gfK_1k5ct59Zkbe78bzs-2jabtFkEVx76a0pGs9Y,12867
 langroid/agent/special/sql/utils/__init__.py,sha256=_IBHt3iNXvPqxvDrs5_T86qdj0gPugVGnGNi6Cx7F-I,238
@@ -45,24 +45,24 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=wj2e6j7R9d3m63HCbSD
 langroid/language_models/prompt_formatter/base.py,sha256=2y_GcwhstvB5ih3haS7l5Fv79jVnFJ_vEw1jqWJzB9k,1247
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/language_models/utils.py,sha256=rmnSn-sJ3aKl_wBdeLPkck0Li4Ed6zkCxZYYl7n1V34,4668
-langroid/mytypes.py,sha256=
+langroid/mytypes.py,sha256=XmEUL_xAZfeWuJLEvQe_4g-W9P7rpY6zOIAHhtYikwk,2363
 langroid/parsing/__init__.py,sha256=_EZ8iuixxU39zuaydtfjyap8g9C_c1dnrCQ0QR81U2E,340
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=YC3IXQ9ErpBGBZh6Be9gfJWHcTwGTSMfNQMT5ARrj5g,14615
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=3EVPkOfXehZwUvdM-tn7LN951722_2c7umGtwzwdxts,9297
 langroid/parsing/repo_loader.py,sha256=nmtvorVip4VQbUMDxoxpVyAlbLt8R8eJjxpAX0vVlfs,27695
-langroid/parsing/search.py,sha256=
+langroid/parsing/search.py,sha256=h-C0Ij111cI7lcddr_vdABjfNKXDqBkJVG48WyJCovA,8424
 langroid/parsing/spider.py,sha256=aX0ucHQ9SVgpieNjtEn_G1bhq5DH_03VpBXoxcdZPl8,3008
 langroid/parsing/table_loader.py,sha256=uqbupGr4y_7os18RtaY5GpD0hWcgzROoNy8dQIHB4kc,1767
 langroid/parsing/url_loader.py,sha256=dhmUTysS_YZyIXVAekxCGPiCbFsOsHXj_eHMow0xoGQ,2153
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
 langroid/parsing/urls.py,sha256=vJ-ZJROtmLwykoE690w5y0BxWN2QOpbxR4hy03knx6Q,7520
-langroid/parsing/utils.py,sha256=
+langroid/parsing/utils.py,sha256=nuCW_sRe5js0d-K6EtDEIbFQpMicS1ntr3FXxtYtGzw,7639
 langroid/parsing/web_search.py,sha256=hGUVoSJNdpoT5rsm-ikAteMiUropHrzKaxN8EVVqO2U,2496
 langroid/prompts/__init__.py,sha256=aTW86CbDZM7tntqiTVeNLYJv7pbRDcKOI3qHVXCEHUY,99
 langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
@@ -70,7 +70,8 @@ langroid/prompts/prompts_config.py,sha256=EMK1Fm7EmS8y3CV4AkrVgn5K4NipiM4m7J8819
 langroid/prompts/templates.py,sha256=4X-07tnmUQ8Z_zaWRQAUUyKiErGztp3tERujqnG8sGA,6369
 langroid/prompts/transforms.py,sha256=GsQo1klGxUy0fACh6j0lTblk6XEl2erRnhRWlN2M4-c,2706
 langroid/utils/__init__.py,sha256=3aMfdwFizpl3W2H5Q-TMqUFqMoYgec1NiX-caSnClmQ,167
-langroid/utils/
+langroid/utils/algorithms/graph.py,sha256=5D7scuxeofllU6xh8_tIcc2WiHVn0MjVQ7lSPrOgKr4,1173
+langroid/utils/configuration.py,sha256=p_MlevqGdS3681u2IiDgrMXBCytg9xZwQH5OK9PUNno,3044
 langroid/utils/constants.py,sha256=edJ5J-sC9CeUwwNey_uLQbGbHgjX-T8XLf_J53h3Tys,484
 langroid/utils/docker.py,sha256=kJQOLTgM0x9j9pgIIqp0dZNZCTvoUDhp6i8tYBq1Jr0,1105
 langroid/utils/globals.py,sha256=UubMelOGkLy3BxByl1vprITU4dbysZmCtYBvZWL8dto,1337
@@ -79,20 +80,20 @@ langroid/utils/llms/strings.py,sha256=CSAX9Z6FQOLXOzbLMe_Opqtc3ruDAKTTk7cPqc6Blh
 langroid/utils/logging.py,sha256=xXpohbvK74_reomdkIWTeyDjGG8GT1fuU7zcLL3Ngt8,3951
 langroid/utils/output/__init__.py,sha256=IpfqnCkfXa4HaOx39EMUhXuA7GPZFd7N_QMm1n43C_I,174
 langroid/utils/output/printing.py,sha256=RZoY8S-8UljiVURe5o5SljpzwF77LTCO7-68nf_uvA4,2277
-langroid/utils/pydantic_utils.py,sha256=
+langroid/utils/pydantic_utils.py,sha256=00ajeBTvxJEOyqd7M7FveRz7oa9wdQ0QFKvOjy_ZfRE,6296
 langroid/utils/system.py,sha256=LyFrSPfvAnhA8GSRjT-2HOkLzxmziZ8wfpDYMqSv01M,1518
 langroid/utils/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/utils/web/login.py,sha256=1iz9eUAHa87vpKIkzwkmFa00avwFWivDSAr7QUhK7U0,2528
 langroid/utils/web/selenium_login.py,sha256=mYI6EvVmne34N9RajlsxxRqJQJvV-WG4LGp6sEECHPw,1156
 langroid/vector_store/__init__.py,sha256=NhAXOCKX_x2whfghOn44e0O3-vV0nJRz6ZLsCBqYFyQ,242
-langroid/vector_store/base.py,sha256=
-langroid/vector_store/chromadb.py,sha256=
+langroid/vector_store/base.py,sha256=VQb_7EIJ1r3g-fzyP4b-WSfZg73rtdYsILIcHZLl4NM,11989
+langroid/vector_store/chromadb.py,sha256=EJONjIa77Bkr8ych5JLykYV9n-DP_9jqFechmmZHHwI,6803
 langroid/vector_store/lancedb.py,sha256=_d7Mz7O8j4keYgHzFSpEOBFq6L13kDJ3eQOZAIrIaOc,11262
-langroid/vector_store/meilisearch.py,sha256=
-langroid/vector_store/momento.py,sha256=
+langroid/vector_store/meilisearch.py,sha256=aQ5Bo-Rk-BnMxbcCTpR7yVm4aNNZHy4hlJBJxn-UpYw,11207
+langroid/vector_store/momento.py,sha256=krw1KwyVRE-ekq1KUAktsMxrJfeolsAC5BmK-1zdxsg,9930
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
-langroid/vector_store/qdrantdb.py,sha256=
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid/vector_store/qdrantdb.py,sha256=YfH0t5nzBBMmwyH0_QndQNnrSfv_3_LFpjlVzcEhbso,11409
+langroid-0.1.102.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.102.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
+langroid-0.1.102.dist-info/METADATA,sha256=zcrmh544o2NXQXuOhUt4YTSf9P6McMB3WQQUxVYkp_g,38599
+langroid-0.1.102.dist-info/RECORD,,
{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/LICENSE
File without changes

{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/WHEEL
File without changes