PyPI - langroid - Versions diffs - 0.56.4__py3-none-any.whl → 0.56.6__py3-none-any.whl - Mend

langroid 0.56.4__py3-none-any.whl → 0.56.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

langroid/agent/base.py CHANGED Viewed

@@ -279,9 +279,6 @@ class Agent(ABC):
             if param.annotation != inspect.Parameter.empty:
                 ann_str = str(param.annotation)
                 # Check for Agent-like types
-                print(param, inspect.isclass(param.annotation))
-                print(param, issubclass(param.annotation, Agent))
-                print(param, param.annotation is ChatDocument)
                 if (
                     inspect.isclass(param.annotation)
                     and issubclass(param.annotation, Agent)
@@ -312,7 +309,6 @@ class Agent(ABC):
             elif param.name == "chat_doc":
                 chat_doc_param = param.name
-        print(has_annotations, agent_param, chat_doc_param)
         return has_annotations, agent_param, chat_doc_param
     @no_type_check

langroid/agent/special/doc_chat_agent.py CHANGED Viewed

@@ -142,6 +142,8 @@ class DocChatAgentConfig(ChatAgentConfig):
     # improve retrieval.
     chunk_enrichment_config: Optional[ChunkEnrichmentAgentConfig] = None
+    n_relevant_chunks: int = 3  # how many relevant chunks to retrieve finally
+    n_similar_chunks: int = 3  # how many similar chunks to retrieve, by each method
     n_query_rephrases: int = 0
     n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
     n_fuzzy_neighbor_words: int = 100  # num neighbor words to retrieve for fuzzy match
@@ -185,7 +187,8 @@ class DocChatAgentConfig(ChatAgentConfig):
         # truncating due to punctuation
         min_chunk_chars=200,
         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
-        n_similar_docs=3,
+        # set deprecated n_similar_docs to None; use n_similar_chunks above instead
+        n_similar_docs=None,
         n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
         pdf=PdfParsingConfig(
             # NOTE: PDF parsing is extremely challenging, and each library
@@ -240,6 +243,60 @@ class DocChatAgent(ChatAgent):
         self.chunked_docs: List[Document] = []
         self.chunked_docs_clean: List[Document] = []
         self.response: None | Document = None
+        if (
+            self.config.cross_encoder_reranking_model != ""
+            and self.config.use_reciprocal_rank_fusion
+        ):
+            logger.warning(
+                """
+                You have set `use_reciprocal_rank_fusion` to True,
+                but it will be ignored since you have also set
+                `cross_encoder_reranking_model` to a non-empty value.
+                To use RRF (Reciprocal Rank Fusion), set
+                `cross_encoder_reranking_model` to an empty string.
+                """
+            )
+        if (
+            self.config.cross_encoder_reranking_model == ""
+            and not self.config.use_reciprocal_rank_fusion
+            and (self.config.use_fuzzy_match or self.config.use_bm25_search)
+            and (
+                self.config.n_relevant_chunks
+                < self.config.n_similar_chunks
+                * (self.config.use_bm25_search + self.config.use_fuzzy_match)
+            )
+        ):
+            logger.warning(
+                """
+                DocChatAgent has been configured to have no cross encoder reranking,
+                AND `use_reciprocal_rank_fusion` is set to False,
+                AND `use_fuzzy_match` or `use_bm25_search` is True,
+                AND `n_relevant_chunks` is less than `n_similar_chunks` * (
+                    `use_bm25_search` + `use_fuzzy_match`
+                ),
+                BUT there is no way to rerank the chunks retrieved by multiple methods,
+                so we will set `use_reciprocal_rank_fusion` to True.
+                """
+            )
+            self.config.use_reciprocal_rank_fusion = True
+        # Handle backward compatibility for deprecated n_similar_docs
+        if self.config.parsing.n_similar_docs is not None:
+            logger.warning(
+                """
+                The parameter `parsing.n_similar_docs` is deprecated and will be
+                removed in a future version. Please use `n_similar_chunks` and
+                `n_relevant_chunks` instead, which provide more fine-grained
+                control over retrieval.
+                - n_similar_chunks: number of chunks to retrieve by each method
+                - n_relevant_chunks: final number of chunks to return after reranking
+                """
+            )
+            # Use the deprecated value for both parameters
+            self.config.n_similar_chunks = self.config.parsing.n_similar_docs
+            self.config.n_relevant_chunks = self.config.parsing.n_similar_docs
         self.ingest()
     def clear(self) -> None:
@@ -486,7 +543,7 @@ class DocChatAgent(ChatAgent):
     def retrieval_tool(self, msg: RetrievalTool) -> str:
         """Handle the RetrievalTool message"""
         self.config.retrieve_only = True
-        self.config.parsing.n_similar_docs = msg.num_results
+        self.config.n_relevant_chunks = msg.num_results
         content_doc = self.answer_from_docs(msg.query)
         return content_doc.content
@@ -1005,7 +1062,7 @@ class DocChatAgent(ChatAgent):
                 self.chunked_docs,
                 self.chunked_docs_clean,  # already pre-processed!
                 query,
-                k=self.config.parsing.n_similar_docs * multiple,
+                k=self.config.n_similar_chunks * multiple,
             )
         return docs_scores
@@ -1025,7 +1082,7 @@ class DocChatAgent(ChatAgent):
                 query,
                 self.chunked_docs,
                 self.chunked_docs_clean,
-                k=self.config.parsing.n_similar_docs * multiple,
+                k=self.config.n_similar_chunks * multiple,
                 words_before=self.config.n_fuzzy_neighbor_words or None,
                 words_after=self.config.n_fuzzy_neighbor_words or None,
             )
@@ -1056,9 +1113,7 @@ class DocChatAgent(ChatAgent):
                 key=lambda x: x[0],
                 reverse=True,
             )
-            passages = [
-                d for _, d in sorted_pairs[: self.config.parsing.n_similar_docs]
-            ]
+            passages = [d for _, d in sorted_pairs[: self.config.n_similar_chunks]]
         return passages
     def rerank_with_diversity(self, passages: List[Document]) -> List[Document]:
@@ -1229,7 +1284,7 @@ class DocChatAgent(ChatAgent):
         # if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
         # we can retrieve more docs during retrieval, and leave it to the cross-encoder
-        # or RRF reranking to whittle down to self.config.parsing.n_similar_docs
+        # or RRF reranking to whittle down to self.config.n_similar_chunks
         retrieval_multiple = (
             1
             if (
@@ -1247,7 +1302,7 @@ class DocChatAgent(ChatAgent):
             for q in [query] + query_proxies:
                 docs_and_scores += self.get_semantic_search_results(
                     q,
-                    k=self.config.parsing.n_similar_docs * retrieval_multiple,
+                    k=self.config.n_similar_chunks * retrieval_multiple,
                 )
                 # sort by score descending
                 docs_and_scores = sorted(
@@ -1265,8 +1320,12 @@ class DocChatAgent(ChatAgent):
             # TODO: Add score threshold in config
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
             id2doc.update({d.id(): d for d, _ in docs_scores})
-            if self.config.cross_encoder_reranking_model == "":
-                # only if we're not re-ranking with a cross-encoder,
+            if (
+                self.config.cross_encoder_reranking_model == ""
+                and self.config.use_reciprocal_rank_fusion
+            ):
+                # if we're not re-ranking with a cross-encoder, and have RRF enabled,
+                # instead of accumulating the bm25 results into passages,
                 # we collect these ranks for Reciprocal Rank Fusion down below.
                 docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
                 id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
@@ -1279,8 +1338,12 @@ class DocChatAgent(ChatAgent):
         if self.config.use_fuzzy_match:
             # TODO: Add score threshold in config
             fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
-            if self.config.cross_encoder_reranking_model == "":
-                # only if we're not re-ranking with a cross-encoder,
+            if (
+                self.config.cross_encoder_reranking_model == ""
+                and self.config.use_reciprocal_rank_fusion
+            ):
+                # if we're not re-ranking with a cross-encoder,
+                # instead of accumulating the fuzzy match results into passages,
                 # we collect these ranks for Reciprocal Rank Fusion down below.
                 fuzzy_match_doc_scores = sorted(
                     fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
@@ -1316,9 +1379,12 @@ class DocChatAgent(ChatAgent):
                 | set(id2_rank_bm25.keys())
                 | set(id2_rank_fuzzy.keys())
             ):
-                rank_semantic = id2_rank_semantic.get(id_, float("inf"))
-                rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
-                rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
+                # Use max_rank instead of infinity to avoid bias against
+                # single-method docs
+                max_rank = self.config.n_similar_chunks * retrieval_multiple
+                rank_semantic = id2_rank_semantic.get(id_, max_rank)
+                rank_bm25 = id2_rank_bm25.get(id_, max_rank)
+                rank_fuzzy = id2_rank_fuzzy.get(id_, max_rank)
                 c = self.config.reciprocal_rank_fusion_constant
                 reciprocal_fusion_score = (
                     1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
@@ -1333,12 +1399,12 @@ class DocChatAgent(ChatAgent):
                     reverse=True,
                 )
             )
-            # each method retrieved up to retrieval_multiple * n_similar_docs,
-            # so we need to take the top n_similar_docs from the combined list
+            # each method retrieved up to retrieval_multiple * n_similar_chunks,
+            # so we need to take the top n_similar_chunks from the combined list
             passages = [
                 id2doc[id]
                 for i, (id, _) in enumerate(id2_reciprocal_score.items())
-                if i < self.config.parsing.n_similar_docs
+                if i < self.config.n_similar_chunks
             ]
             # passages must have distinct ids
             assert len(passages) == len(set([d.id() for d in passages])), (
@@ -1355,7 +1421,7 @@ class DocChatAgent(ChatAgent):
             passages = [p for p, _ in passages_scores]
         # now passages can potentially have a lot of doc chunks,
         # so we re-rank them using a cross-encoder scoring model,
-        # and pick top k where k = config.parsing.n_similar_docs
+        # and pick top k where k = config..n_similar_chunks
         # https://www.sbert.net/examples/applications/retrieve_rerank
         if self.config.cross_encoder_reranking_model != "":
             passages = self.rerank_with_cross_encoder(query, passages)
@@ -1374,7 +1440,7 @@ class DocChatAgent(ChatAgent):
             passages_scores = self.add_context_window(passages_scores)
             passages = [p for p, _ in passages_scores]
-        return passages[: self.config.parsing.n_similar_docs]
+        return passages[: self.config.n_relevant_chunks]
     @no_type_check
     def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:

langroid/agent/special/lance_doc_chat_agent.py CHANGED Viewed

@@ -255,7 +255,7 @@ class LanceDocChatAgent(DocChatAgent):
         result = (
             tbl.search(query_clean)
             .where(self.config.filter or None)
-            .limit(self.config.parsing.n_similar_docs * multiple)
+            .limit(self.config.n_similar_chunks * multiple)
         )
         docs = self.vecdb._lance_result_to_docs(result)
         scores = [r["score"] for r in result.to_list()]

langroid/parsing/parser.py CHANGED Viewed

@@ -120,7 +120,7 @@ class ParsingConfig(BaseSettings):
     # aim to have at least this many chars per chunk when truncating due to punctuation
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
-    n_similar_docs: int = 4
+    n_similar_docs: Optional[int] = 4  # deprecated
     n_neighbor_ids: int = 5  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-3-small"

{langroid-0.56.4.dist-info → langroid-0.56.6.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.56.4
+Version: 0.56.6
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT

{langroid-0.56.4.dist-info → langroid-0.56.6.dist-info}/RECORD RENAMED Viewed

@@ -3,7 +3,7 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=HIcYAqGeA9OK0Hlscym2FI5Oax9QFljDZoVgRlomhRk,4014
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=EMZGPVSqd4_optr9_FG1TLRF8-noEAloen8J4c7Ayto,86262
+langroid/agent/base.py,sha256=GVE_vdtDUJpldACH4LQwjqbQ11UDn9thr2-uBXk0RjU,86009
 langroid/agent/batch.py,sha256=wpE9RqCNDVDhAXkCB7wEqfCIEAi6qKcrhaZ-Zr9T4C0,21375
 langroid/agent/chat_agent.py,sha256=2HIYzYxkrGkRIS97ioKfIqjaW3RbX89M39LjzBobBEY,88381
 langroid/agent/chat_document.py,sha256=0e6zYkqIorMIVbCsxOul9ziwAPPOWDsBsRV9E8ux-WI,18055
@@ -15,9 +15,9 @@ langroid/agent/xml_tool_message.py,sha256=oeBKnJNoGaKdtz39XoWGMTNlVyXew2MWH5lgtY
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/callbacks/chainlit.py,sha256=4rJw07NIIVTIVvksVY08h5PdLE_kRoJItjbQM0UjRn0,20962
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=7PvVKHrXHw2LoSgU2-3hE7mz46r5oKB3o_bFhWmfT_I,65642
+langroid/agent/special/doc_chat_agent.py,sha256=q-W4fDM-kdv_keBQjIsSZEUcmUvDAK1Cb2GcfJ9KhmY,68852
 langroid/agent/special/doc_chat_task.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
+langroid/agent/special/lance_doc_chat_agent.py,sha256=6pIqi2DF-MvYYN3-blsdUgulYnOBTl7I21T7wPAt1zM,10413
 langroid/agent/special/lance_tools.py,sha256=qS8x4wi8mrqfbYV2ztFzrcxyhHQ0ZWOc-zkYiH7awj0,2105
 langroid/agent/special/relevance_extractor_agent.py,sha256=zIx8GUdVo1aGW6ASla0NPQjYYIpmriK_TYMijqAx3F8,4796
 langroid/agent/special/retriever_agent.py,sha256=o2UfqiCGME0t85SZ6qjK041_WZYqXSuV1SeH_3KtVuc,1931
@@ -94,7 +94,7 @@ langroid/parsing/file_attachment.py,sha256=ryJVhVFOhINrfkf9Z0vWTTwCnm80z2qzXgp20
 langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=uaAITarcGI2504zcP_dLhp3LjNdh9A6R_yS-o_VcaH8,15599
+langroid/parsing/parser.py,sha256=pFolKsWxr2uQ5zoAqby9eunZ0baBlzCs9LfJ6NPV_8I,15623
 langroid/parsing/pdf_utils.py,sha256=QogxU_B1N3WSLyZ9PEcJDaJoZShKs7CPQRVyF1V2DiE,3143
 langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -137,7 +137,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.56.4.dist-info/METADATA,sha256=punxDL35bHKw4J-PDHEMYfWvwb7gu3ccpuMlTPmU1GU,65744
-langroid-0.56.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.56.4.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.56.4.dist-info/RECORD,,
+langroid-0.56.6.dist-info/METADATA,sha256=Obrt0l7fxia2D7Fd9M9SQFCulOxfAJwqo9DxorPzgfA,65744
+langroid-0.56.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.56.6.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.56.6.dist-info/RECORD,,

{langroid-0.56.4.dist-info → langroid-0.56.6.dist-info}/WHEEL RENAMED Viewed

File without changes

{langroid-0.56.4.dist-info → langroid-0.56.6.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

langroid 0.56.4__py3-none-any.whl → 0.56.6__py3-none-any.whl

langroid 0.56.4__py3-none-any.whl → 0.56.6__py3-none-any.whl