langroid 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +17 -12
- langroid/parsing/search.py +14 -11
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/METADATA +1 -1
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/RECORD +7 -7
- pyproject.toml +1 -1
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/LICENSE +0 -0
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/WHEEL +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -49,7 +49,6 @@ from langroid.parsing.search import (
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
 from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
-from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.constants import NO_ANSWER
@@ -137,7 +136,6 @@ class DocChatAgentConfig(ChatAgentConfig):
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     rerank_after_adding_context: bool = True  # rerank after adding context window?
-    embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
@@ -400,7 +398,11 @@ class DocChatAgent(ChatAgent):
         if split:
             docs = self.parser.split(docs)
         else:
-            self.
+            if self.config.n_neighbor_chunks > 0:
+                self.parser.add_window_ids(docs)
+            # we're not splitting, so we mark each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")
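In effect, documents ingested without splitting are now treated as first-class chunks. A minimal sketch of what this means for a caller, assuming the enclosing method in the hunk above is `ingest_docs` and that `Document`/`DocMetaData` come from `langroid.mytypes` (assumptions, not shown in this diff):

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
from langroid.mytypes import DocMetaData, Document

agent = DocChatAgent(DocChatAgentConfig())
docs = [
    Document(content="Already chunk-sized text.", metadata=DocMetaData(source="demo"))
]
# With split=False, 0.13.0 still assigns window ids (when
# config.n_neighbor_chunks > 0) and sets metadata.is_chunk = True on each
# doc, so un-split docs participate in neighbor-window retrieval just like
# parser-split chunks.
agent.ingest_docs(docs, split=False)
```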
|
@@ -422,10 +424,9 @@ class DocChatAgent(ChatAgent):
                 + d.content
             )
         docs = docs[: self.config.parsing.max_chunks]
-        #
-
-
-            self.vecdb.add_documents(batch)
+        # vecdb should take care of adding docs in batches;
+        # batching can be controlled via vecdb.config.batch_size
+        self.vecdb.add_documents(docs)
         self.original_docs_length = self.doc_length(docs)
         self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
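With the explicit batching loop gone (along with the `batched` import and the `embed_batch_size` config removed above), batch size becomes a vector-store concern. A sketch of where that knob now lives; the `batch_size` field is taken from the added comment in the hunk, and `QdrantDBConfig` is just one example store (both hedged assumptions):

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
from langroid.vector_store.qdrantdb import QdrantDBConfig

# Batching now lives on the vector-store config rather than on
# DocChatAgentConfig.embed_batch_size (removed in this release).
vecdb_cfg = QdrantDBConfig(collection_name="my-docs")
vecdb_cfg.batch_size = 500  # plays the role the removed embed_batch_size did

agent = DocChatAgent(DocChatAgentConfig(vecdb=vecdb_cfg))
# agent.ingest_docs(...) -> vecdb.add_documents(docs) batches internally
```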
@@ -894,7 +895,9 @@ class DocChatAgent(ChatAgent):
         )
         return docs_scores

-    def get_fuzzy_matches(
+    def get_fuzzy_matches(
+        self, query: str, multiple: int
+    ) -> List[Tuple[Document, float]]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
         with status("[cyan]Finding fuzzy matches in chunks..."):
@@ -909,8 +912,8 @@ class DocChatAgent(ChatAgent):
                 self.chunked_docs,
                 self.chunked_docs_clean,
                 k=self.config.parsing.n_similar_docs * multiple,
-                words_before=self.config.n_fuzzy_neighbor_words,
-                words_after=self.config.n_fuzzy_neighbor_words,
+                words_before=self.config.n_fuzzy_neighbor_words or None,
+                words_after=self.config.n_fuzzy_neighbor_words or None,
             )
         return fuzzy_match_docs
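The `or None` is the substance of this change: per the `find_fuzzy_matches_in_docs` docstring (see below), `None` means "return max context", while a literal `0` would request zero surrounding words. So a config value of 0 now yields maximal context instead of degenerate empty windows. In plain Python:

```python
# Zero-vs-None semantics behind the `or None` change:
n_fuzzy_neighbor_words = 0
words_before = n_fuzzy_neighbor_words or None  # -> None: "return max" context
n_fuzzy_neighbor_words = 50
words_before = n_fuzzy_neighbor_words or None  # -> 50: fifty words of context
```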
|
@@ -1127,12 +1130,14 @@ class DocChatAgent(ChatAgent):
         # ]

         if self.config.use_bm25_search:
+            # TODO: Add score threshold in config
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
             passages += [d for (d, _) in docs_scores]

         if self.config.use_fuzzy_match:
-
-
+            # TODO: Add score threshold in config
+            fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
+            passages += [d for (d, _) in fuzzy_match_doc_scores]

         # keep unique passages
         id2passage = {p.id(): p for p in passages}
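Both retrieval branches now carry `(Document, score)` pairs and simply drop the score when accumulating passages; thresholding is left open by the two TODOs. A hypothetical helper if one were to act on those TODOs today (`filter_by_score` and `min_score` are illustrative names, not existing langroid APIs):

```python
from typing import List, Tuple

from langroid.mytypes import Document

def filter_by_score(
    doc_scores: List[Tuple[Document, float]], min_score: float = 60.0
) -> List[Document]:
    # min_score is an illustrative threshold (partial_ratio scores run 0-100);
    # no such config field exists yet -- the TODOs above leave it open.
    return [d for (d, s) in doc_scores if s >= min_score]
```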
langroid/parsing/search.py
CHANGED
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
-) -> List[Document]:
+) -> List[Tuple[Document, float]]:
     """
     Find approximate matches of the query in the docs and return surrounding
     characters.
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
     Args:
         query (str): The search string.
         docs (List[Document]): List of Document objects to search through.
+        docs_clean (List[Document]): List of Document objects with cleaned content.
         k (int): Number of best matches to return.
         words_before (int|None): Number of words to include before each match.
             Default None => return max
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
             Default None => return max

     Returns:
-        List[Document]: List of
-          including the given number of words around the match.
+        List[Tuple[Document,float]]: List of (Document, score) tuples.
     """
     if len(docs) == 0:
         return []
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
         scorer=fuzz.partial_ratio,
     )

-    real_matches = [m for m, score in best_matches if score > 50]
+    real_matches = [(m, score) for m, score in best_matches if score > 50]
     # find the original docs that corresponding to the matches
     orig_doc_matches = []
-    for i, m in enumerate(real_matches):
+    for i, (m, s) in enumerate(real_matches):
         for j, doc_clean in enumerate(docs_clean):
             if m in doc_clean.content:
-                orig_doc_matches.append(docs[j])
+                orig_doc_matches.append((docs[j], s))
                 break
     if words_after is None and words_before is None:
         return orig_doc_matches
     if len(orig_doc_matches) == 0:
         return []
-    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
         # If there are fields beyond just content and metadata,
         # we do NOT want to create new document objects with content fields
         # based on words_before and words_after, since we don't know how to
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
         return orig_doc_matches

     contextual_matches = []
-    for match in orig_doc_matches:
+    for match, score in orig_doc_matches:
         choice_text = match.content
         contexts = []
         while choice_text != "":
|
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
|
|
89
89
|
choice_text = " ".join(words[end_pos:])
|
90
90
|
if len(contexts) > 0:
|
91
91
|
contextual_matches.append(
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
(
|
93
|
+
Document(
|
94
|
+
content=" ... ".join(contexts),
|
95
|
+
metadata=match.metadata,
|
96
|
+
),
|
97
|
+
score,
|
95
98
|
)
|
96
99
|
)
|
97
100
|
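End to end, `find_fuzzy_matches_in_docs` now returns scored tuples instead of bare Documents. A self-contained sketch of calling it under 0.13.0, assuming `Document`/`DocMetaData` are importable from `langroid.mytypes` (an assumption, not shown in this diff):

```python
from langroid.mytypes import DocMetaData, Document
from langroid.parsing.search import find_fuzzy_matches_in_docs

docs = [
    Document(
        content="The quick brown fox jumps over the lazy dog.",
        metadata=DocMetaData(source="demo"),
    )
]
matches = find_fuzzy_matches_in_docs(
    query="quick brwn fox",  # fuzzy: typos still match
    docs=docs,
    docs_clean=docs,  # normally a cleaned copy; identical here for brevity
    k=1,
    words_before=3,
    words_after=3,
)
for doc, score in matches:  # 0.13.0: (Document, score) tuples
    print(score, doc.content)
```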
|
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/RECORD
CHANGED
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=dqm0Gp11Mfl4hOWN4sUR1uZL-oHEmHzcB6bNN6WFgqw,54784
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
 langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
 langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
-langroid/parsing/search.py,sha256=
+langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
 langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
 langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
-pyproject.toml,sha256=
-langroid-0.
-langroid-0.
-langroid-0.
-langroid-0.
+pyproject.toml,sha256=g99bgxP-XUiTx-KsdFICVJuV2bB89areQkDRU5sIgmk,7107
+langroid-0.13.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.13.0.dist-info/METADATA,sha256=Znhge-Z8nn_L7Lxeh8dWs04d4ejZfj0NCCRutJJSkdg,55259
+langroid-0.13.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.13.0.dist-info/RECORD,,
pyproject.toml
CHANGED
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/LICENSE
File without changes
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/WHEEL
File without changes