langroid 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@ pip install "langroid[hf-embeddings]"
  """
 
  import logging
+ from collections import OrderedDict
  from functools import cache
  from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
 
@@ -49,7 +50,6 @@ from langroid.parsing.search import (
  from langroid.parsing.table_loader import describe_dataframe
  from langroid.parsing.url_loader import URLLoader
  from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
- from langroid.parsing.utils import batched
  from langroid.prompts.prompts_config import PromptsConfig
  from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
  from langroid.utils.constants import NO_ANSWER
@@ -131,13 +131,16 @@ class DocChatAgentConfig(ChatAgentConfig):
  n_fuzzy_neighbor_words: int = 100 # num neighbor words to retrieve for fuzzy match
  use_fuzzy_match: bool = True
  use_bm25_search: bool = True
+ use_reciprocal_rank_fusion: bool = True # ignored if using cross-encoder reranking
  cross_encoder_reranking_model: str = (
  "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
  )
  rerank_diversity: bool = True # rerank to maximize diversity?
  rerank_periphery: bool = True # rerank to avoid Lost In the Middle effect?
  rerank_after_adding_context: bool = True # rerank after adding context window?
- embed_batch_size: int = 500 # get embedding of at most this many at a time
+ # RRF (Reciprocal Rank Fusion) score = 1/(rank + reciprocal_rank_fusion_constant)
+ # see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works
+ reciprocal_rank_fusion_constant: float = 60.0
  cache: bool = True # cache results
  debug: bool = False
  stream: bool = True # allow streaming where needed
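(Editor's note, not part of the diff: the new reciprocal_rank_fusion_constant feeds the standard RRF formula quoted in the comment above. A minimal sketch with made-up ranks shows how the score behaves with the default constant of 60.)

    # Illustration only: RRF score for a chunk ranked 0 by semantic retrieval
    # and 2 by BM25, using the default constant of 60.
    c = 60.0
    rank_semantic, rank_bm25 = 0, 2
    rrf_score = 1 / (rank_semantic + c) + 1 / (rank_bm25 + c)
    print(round(rrf_score, 4))  # 0.0328 -- larger fused scores rank earlier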
@@ -400,7 +403,11 @@ class DocChatAgent(ChatAgent):
  if split:
  docs = self.parser.split(docs)
  else:
- self.parser.add_window_ids(docs)
+ if self.config.n_neighbor_chunks > 0:
+ self.parser.add_window_ids(docs)
+ # we're not splitting, so we mark each doc as a chunk
+ for d in docs:
+ d.metadata.is_chunk = True
  if self.vecdb is None:
  raise ValueError("VecDB not set")
 
@@ -422,10 +429,9 @@ class DocChatAgent(ChatAgent):
  + d.content
  )
  docs = docs[: self.config.parsing.max_chunks]
- # add embeddings in batches, to stay under limit of embeddings API
- batches = list(batched(docs, self.config.embed_batch_size))
- for batch in batches:
- self.vecdb.add_documents(batch)
+ # vecdb should take care of adding docs in batches;
+ # batching can be controlled via vecdb.config.batch_size
+ self.vecdb.add_documents(docs)
  self.original_docs_length = self.doc_length(docs)
  self.setup_documents(docs, filter=self.config.filter)
  return len(docs)
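(Editor's note, not part of the diff: with embed_batch_size removed from DocChatAgentConfig, batching is the vector store's concern per the new comment. A hypothetical sketch of where the knob now lives; the class and field names below are assumptions, so verify them against your vector store's config.)

    # Hypothetical sketch: batch size is configured on the vector-store config,
    # not on DocChatAgentConfig (which no longer has embed_batch_size).
    from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
    from langroid.vector_store.qdrantdb import QdrantDBConfig

    config = DocChatAgentConfig(
        vecdb=QdrantDBConfig(
            collection_name="my-docs",
            batch_size=500,  # roughly the role embed_batch_size used to play
        ),
    )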
@@ -894,7 +900,9 @@ class DocChatAgent(ChatAgent):
  )
  return docs_scores
 
- def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
+ def get_fuzzy_matches(
+ self, query: str, multiple: int
+ ) -> List[Tuple[Document, float]]:
  # find similar docs using fuzzy matching:
  # these may sometimes be more likely to contain a relevant verbatim extract
  with status("[cyan]Finding fuzzy matches in chunks..."):
@@ -909,8 +917,8 @@ class DocChatAgent(ChatAgent):
  self.chunked_docs,
  self.chunked_docs_clean,
  k=self.config.parsing.n_similar_docs * multiple,
- words_before=self.config.n_fuzzy_neighbor_words,
- words_after=self.config.n_fuzzy_neighbor_words,
+ words_before=self.config.n_fuzzy_neighbor_words or None,
+ words_after=self.config.n_fuzzy_neighbor_words or None,
  )
  return fuzzy_match_docs
 
@@ -1102,10 +1110,17 @@ class DocChatAgent(ChatAgent):
  Returns:
 
  """
- # if we are using cross-encoder reranking, we can retrieve more docs
- # during retrieval, and leave it to the cross-encoder re-ranking
- # to whittle down to self.config.parsing.n_similar_docs
- retrieval_multiple = 1 if self.config.cross_encoder_reranking_model == "" else 3
+ # if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
+ # we can retrieve more docs during retrieval, and leave it to the cross-encoder
+ # or RRF reranking to whittle down to self.config.parsing.n_similar_docs
+ retrieval_multiple = (
+ 1
+ if (
+ self.config.cross_encoder_reranking_model == ""
+ and not self.config.use_reciprocal_rank_fusion
+ )
+ else 3
+ )
 
  if self.vecdb is None:
  raise ValueError("VecDB not set")
@@ -1117,26 +1132,98 @@ class DocChatAgent(ChatAgent):
  q,
  k=self.config.parsing.n_similar_docs * retrieval_multiple,
  )
+ # sort by score descending
+ docs_and_scores = sorted(
+ docs_and_scores, key=lambda x: x[1], reverse=True
+ )
+
  # keep only docs with unique d.id()
- id2doc_score = {d.id(): (d, s) for d, s in docs_and_scores}
- docs_and_scores = list(id2doc_score.values())
- passages = [d for (d, _) in docs_and_scores]
- # passages = [
- # Document(content=d.content, metadata=d.metadata)
- # for (d, _) in docs_and_scores
- # ]
+ id2_rank_semantic = {d.id(): i for i, (d, _) in enumerate(docs_and_scores)}
+ id2doc = {d.id(): d for d, _ in docs_and_scores}
+ # make sure we get unique docs
+ passages = [id2doc[id] for id, _ in id2_rank_semantic.items()]
 
+ id2_rank_bm25 = {}
  if self.config.use_bm25_search:
+ # TODO: Add score threshold in config
  docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
- passages += [d for (d, _) in docs_scores]
+ if self.config.cross_encoder_reranking_model == "":
+ # only if we're not re-ranking with a cross-encoder,
+ # we collect these ranks for Reciprocal Rank Fusion down below.
+ docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
+ id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
+ id2doc.update({d.id(): d for d, _ in docs_scores})
+ else:
+ passages += [d for (d, _) in docs_scores]
 
+ id2_rank_fuzzy = {}
  if self.config.use_fuzzy_match:
- fuzzy_match_docs = self.get_fuzzy_matches(query, retrieval_multiple)
- passages += fuzzy_match_docs
+ # TODO: Add score threshold in config
+ fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
+ if self.config.cross_encoder_reranking_model == "":
+ # only if we're not re-ranking with a cross-encoder,
+ # we collect these ranks for Reciprocal Rank Fusion down below.
+ fuzzy_match_doc_scores = sorted(
+ fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
+ )
+ id2_rank_fuzzy = {
+ d.id(): i for i, (d, _) in enumerate(fuzzy_match_doc_scores)
+ }
+ id2doc.update({d.id(): d for d, _ in fuzzy_match_doc_scores})
+ else:
+ passages += [d for (d, _) in fuzzy_match_doc_scores]
 
- # keep unique passages
- id2passage = {p.id(): p for p in passages}
- passages = list(id2passage.values())
+ if (
+ self.config.cross_encoder_reranking_model == ""
+ and self.config.use_reciprocal_rank_fusion
+ and (self.config.use_bm25_search or self.config.use_fuzzy_match)
+ ):
+ # Since we're not using cross-enocder re-ranking,
+ # we need to re-order the retrieved chunks from potentially three
+ # different retrieval methods (semantic, bm25, fuzzy), where the
+ # similarity scores are on different scales.
+ # We order the retrieved chunks using Reciprocal Rank Fusion (RRF) score.
+ # Combine the ranks from each id2doc_rank_* dict into a single dict,
+ # where the reciprocal rank score is the sum of
+ # 1/(rank + self.config.reciprocal_rank_fusion_constant).
+ # See https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
+ #
+ # Note: diversity/periphery-reranking below may modify the final ranking.
+ id2_reciprocal_score = {}
+ for id_ in (
+ set(id2_rank_semantic.keys())
+ | set(id2_rank_bm25.keys())
+ | set(id2_rank_fuzzy.keys())
+ ):
+ rank_semantic = id2_rank_semantic.get(id_, float("inf"))
+ rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
+ rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
+ c = self.config.reciprocal_rank_fusion_constant
+ reciprocal_fusion_score = (
+ 1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
+ )
+ id2_reciprocal_score[id_] = reciprocal_fusion_score
+
+ # sort the docs by the reciprocal score, in descending order
+ id2_reciprocal_score = OrderedDict(
+ sorted(
+ id2_reciprocal_score.items(),
+ key=lambda x: x[1],
+ reverse=True,
+ )
+ )
+ # each method retrieved up to retrieval_multiple * n_similar_docs,
+ # so we need to take the top n_similar_docs from the combined list
+ passages = [
+ id2doc[id]
+ for i, (id, _) in enumerate(id2_reciprocal_score.items())
+ if i < self.config.parsing.n_similar_docs
+ ]
+ # passages must have distinct ids
+ assert len(passages) == len(set([d.id() for d in passages])), (
+ f"Duplicate passages in retrieved docs: {len(passages)} != "
+ f"{len(set([d.id() for d in passages]))}"
+ )
 
  if len(passages) == 0:
  return []
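(Editor's note, not part of the diff: the fusion step above boils down to summing reciprocal ranks per document id across the retrievers that returned it. A condensed, runnable sketch with made-up rank dicts; the names below are illustrative, not langroid APIs.)

    # Standalone sketch of RRF fusion: rank dicts map doc-id -> 0-based rank;
    # a missing rank counts as infinity and contributes 0 to the score.
    def rrf_fuse(rank_dicts: list[dict[str, int]], c: float = 60.0) -> list[str]:
        all_ids = set().union(*[d.keys() for d in rank_dicts])
        scores = {
            id_: sum(1 / (d.get(id_, float("inf")) + c) for d in rank_dicts)
            for id_ in all_ids
        }
        # highest fused score first
        return sorted(scores, key=scores.get, reverse=True)

    semantic = {"doc1": 0, "doc2": 1, "doc3": 2}
    bm25 = {"doc1": 1, "doc3": 0}
    fuzzy = {"doc2": 2}
    print(rrf_fuse([semantic, bm25, fuzzy]))  # -> ['doc1', 'doc3', 'doc2']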
@@ -1166,7 +1253,7 @@ class DocChatAgent(ChatAgent):
  passages_scores = self.add_context_window(passages_scores)
  passages = [p for p, _ in passages_scores]
 
- return passages
+ return passages[: self.config.parsing.n_similar_docs]
 
  @no_type_check
  def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
  k: int,
  words_before: int | None = None,
  words_after: int | None = None,
- ) -> List[Document]:
+ ) -> List[Tuple[Document, float]]:
  """
  Find approximate matches of the query in the docs and return surrounding
  characters.
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
  Args:
  query (str): The search string.
  docs (List[Document]): List of Document objects to search through.
+ docs_clean (List[Document]): List of Document objects with cleaned content.
  k (int): Number of best matches to return.
  words_before (int|None): Number of words to include before each match.
  Default None => return max
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
  Default None => return max
 
  Returns:
- List[Document]: List of Documents containing the matches,
- including the given number of words around the match.
+ List[Tuple[Document,float]]: List of (Document, score) tuples.
  """
  if len(docs) == 0:
  return []
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
  scorer=fuzz.partial_ratio,
  )
 
- real_matches = [m for m, score in best_matches if score > 50]
+ real_matches = [(m, score) for m, score in best_matches if score > 50]
  # find the original docs that corresponding to the matches
  orig_doc_matches = []
- for i, m in enumerate(real_matches):
+ for i, (m, s) in enumerate(real_matches):
  for j, doc_clean in enumerate(docs_clean):
  if m in doc_clean.content:
- orig_doc_matches.append(docs[j])
+ orig_doc_matches.append((docs[j], s))
  break
  if words_after is None and words_before is None:
  return orig_doc_matches
  if len(orig_doc_matches) == 0:
  return []
- if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+ if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
  # If there are fields beyond just content and metadata,
  # we do NOT want to create new document objects with content fields
  # based on words_before and words_after, since we don't know how to
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
  return orig_doc_matches
 
  contextual_matches = []
- for match in orig_doc_matches:
+ for match, score in orig_doc_matches:
  choice_text = match.content
  contexts = []
  while choice_text != "":
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
  choice_text = " ".join(words[end_pos:])
  if len(contexts) > 0:
  contextual_matches.append(
- Document(
- content=" ... ".join(contexts),
- metadata=match.metadata,
+ (
+ Document(
+ content=" ... ".join(contexts),
+ metadata=match.metadata,
+ ),
+ score,
  )
  )
 
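(Editor's note, not part of the diff: callers of find_fuzzy_matches_in_docs now receive (Document, score) pairs instead of bare Documents. A hypothetical usage sketch; the sample data is invented and Document/DocMetaData are assumed importable from langroid.mytypes.)

    # Hypothetical usage sketch of the new (Document, score) return type.
    from langroid.mytypes import DocMetaData, Document
    from langroid.parsing.search import find_fuzzy_matches_in_docs

    docs = [
        Document(
            content="Reciprocal Rank Fusion combines rankings from several retrievers.",
            metadata=DocMetaData(source="notes"),
        )
    ]
    matches = find_fuzzy_matches_in_docs(
        query="rank fusion",
        docs=docs,
        docs_clean=docs,  # pre-cleaned copies; here just the originals
        k=3,
    )
    for doc, score in matches:
        print(score, doc.content[:40])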
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: langroid
- Version: 0.12.0
+ Version: 0.14.0
  Summary: Harness LLMs with Multi-Agent Programming
  License: MIT
  Author: Prasad Chalasani
@@ -153,6 +153,8 @@ This Multi-Agent paradigm is inspired by the
  `Langroid` is a fresh take on LLM app-development, where considerable thought has gone
  into simplifying the developer experience; it does not use `Langchain`.
 
+ :fire: Read the (WIP) [overview of the langroid architecture](https://langroid.github.io/langroid/blog/2024/08/15/overview-of-langroids-multi-agent-architecture-prelim/)
+
  📢 Companies are using/adapting Langroid in **production**. Here is a quote:
 
  >[Nullify](https://www.nullify.ai) uses AI Agents for secure software development.
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
  langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
  langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
- langroid/agent/special/doc_chat_agent.py,sha256=3EICtutRADu8S8v0qO8PGFu3VyqjDY6Gp8xYgNtiNSY,54596
+ langroid/agent/special/doc_chat_agent.py,sha256=r1uPunYf2lQcqYQ4fsD8Q5gB9cZyf7cn0KPcR_CLtrU,59065
  langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
  langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
  langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
  langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
  langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
- langroid/parsing/search.py,sha256=plQtjarB9afGfJLB0CyPXPq3mM4m7kRsfd0_4brziEI,8846
+ langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
  langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
  langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
  langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
  langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
- pyproject.toml,sha256=oocGdj8dqhrarP8c5LeFeOKboZ4WYNzs1YpcKszoJgM,7107
- langroid-0.12.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.12.0.dist-info/METADATA,sha256=S-V-w4lhAay08FYPRyJcT7FliitUSaLkL9gQS25luSQ,55259
- langroid-0.12.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
- langroid-0.12.0.dist-info/RECORD,,
+ pyproject.toml,sha256=W5AMGnCoX4SvE5HYNJlJcernYJ-sbIVoVmfpVifMMm8,7107
+ langroid-0.14.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.14.0.dist-info/METADATA,sha256=hEJyAJh8I1K9102zVxSya1pVgXxTUNkPXKo__JUtf54,55430
+ langroid-0.14.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ langroid-0.14.0.dist-info/RECORD,,
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "langroid"
- version = "0.12.0"
+ version = "0.14.0"
  description = "Harness LLMs with Multi-Agent Programming"
  authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
  readme = "README.md"