langroid 0.1.134__py3-none-any.whl → 0.1.135__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- langroid/parsing/parser.py +31 -29
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/METADATA +1 -1
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/RECORD +5 -5
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/LICENSE +0 -0
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/WHEEL +0 -0
langroid/parsing/parser.py
CHANGED
@@ -52,13 +52,14 @@ class Parser:
         return len(tokens)
 
     def add_window_ids(self, chunks: List[Document]) -> None:
-        """Chunks
-        Add window_ids in metadata"""
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
 
         # The original metadata.id (if any) is ignored since it will be same for all
         # chunks and is useless. We want a distinct id for each chunk.
         orig_ids = [c.metadata.id for c in chunks]
         ids = [Document.hash_id(str(c)) for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
 
         # group the ids by orig_id
         orig_id_to_ids: Dict[str, List[str]] = {}
@@ -71,9 +72,11 @@ class Parser:
 
         k = self.config.n_neighbor_ids
         for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
            n = len(ids)
             window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
-            for i,
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
                 if c.content.strip() == "":
                     continue
                 c.metadata.window_ids = window_ids[i]
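For orientation, the list comprehension above gives each chunk a window of up to k neighbors on each side, clipped at the document boundaries. A minimal standalone sketch of that computation, using plain string ids and a hypothetical k = 2 in place of langroid's Document objects and config:

ids = ["a", "b", "c", "d", "e"]  # chunk ids of one doc, in order (hypothetical)
k = 2  # plays the role of config.n_neighbor_ids (hypothetical value)
n = len(ids)
window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
print(window_ids[0])  # ['a', 'b', 'c']             -- clipped on the left
print(window_ids[2])  # ['a', 'b', 'c', 'd', 'e']   -- full window
print(window_ids[4])  # ['c', 'd', 'e']             -- clipped on the right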
@@ -101,35 +104,35 @@ class Parser:
         return final_docs
 
     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
         chunks = docs
         while True:
-            if
-            )
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is {max_len} tokens.
+                        """
+                    )
                 break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.copy()
 
-
+        self.add_window_ids(chunks)
+        return chunks
 
     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
@@ -144,7 +147,6 @@ class Parser:
             for c in chunks
             if c.strip() != ""
         ]
-        self.add_window_ids(chunk_docs)
         final_chunks += chunk_docs
 
         return final_chunks
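Taken together, the rewritten split_para_sentence now loops to a fixed point: each pass re-splits chunks that exceed 1.3 × chunk_size, and the loop stops once a pass produces no new chunks; add_window_ids then runs once on the final list, which is why its call was removed from _split_para_sentence_once above. A minimal sketch of that fixed-point pattern, with hypothetical num_tokens and split_once helpers standing in for langroid's tokenizer and paragraph/sentence splitter:

def num_tokens(s: str) -> int:
    # crude whitespace token count, standing in for a real tokenizer
    return len(s.split())

def split_once(s: str) -> list[str]:
    # split roughly in half; a single word cannot be split further
    words = s.split()
    if len(words) <= 1:
        return [s]
    mid = len(words) // 2
    return [" ".join(words[:mid]), " ".join(words[mid:])]

def split_to_size(chunks: list[str], chunk_size: int) -> list[str]:
    while True:
        split_chunks: list[str] = []
        for c in chunks:
            if num_tokens(c) <= 1.3 * chunk_size:
                split_chunks.append(c)  # small enough: keep as-is
            else:
                split_chunks += split_once(c)
        if len(split_chunks) == len(chunks):
            break  # no chunk was split this pass: fixed point reached
        chunks = split_chunks
    return chunks

print(split_to_size(["one two three four five six seven eight"], 2))
# ['one two', 'three four', 'five six', 'seven eight']

The sketch omits the diff's un_splittables counter and warning, but shows why the loop always terminates: once every oversized chunk is atomic, a pass adds no chunks and the length check trips.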
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/RECORD
CHANGED
@@ -57,7 +57,7 @@ langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/parsing/document_parser.py,sha256=YC3IXQ9ErpBGBZh6Be9gfJWHcTwGTSMfNQMT5ARrj5g,14615
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=3xjnbdsxPR7Y5gN3kclkoNWslldS1CiKBdTdUclsxjI,10264
 langroid/parsing/repo_loader.py,sha256=4qCyRRHCKIYd8F1ghT-D8ko1C2sXpF7UYP1L5Im1hRE,27705
 langroid/parsing/search.py,sha256=xmQdAdTIwZ0REEUeQVFlGZlqf7k8Poah7-ALuyW7Ov0,8440
 langroid/parsing/spider.py,sha256=w_mHR1B4KOmxsBLoVI8kMkMTEbwTzeK3ath9fOMJrTk,3043
@@ -97,7 +97,7 @@ langroid/vector_store/meilisearch.py,sha256=h4e1MZJ9J3EnFfcUhLshK1Duwy1dpHWH4Ajt
 langroid/vector_store/momento.py,sha256=otoUrpgwEduFOCUhbFFSZWKEzWF2di1d4-m3n5PIuHI,9964
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=ZEJnlNIJwWIySGhPz3jBc6spcLCPqOcUDBYBisLF90I,11379
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid-0.1.135.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.135.dist-info/METADATA,sha256=fTE3Rlvjlw1MbBxay8Wr7Hp2KHqAveo2W-sEjqskeTc,42646
+langroid-0.1.135.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.1.135.dist-info/RECORD,,
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/LICENSE
File without changes
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/WHEEL
File without changes