PyPI - langroid - Versions diffs - 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl - Mend

langroid 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

langroid/parsing/utils.py CHANGED Viewed

@@ -101,14 +101,33 @@ def split_paragraphs(text: str) -> List[str]:
     return [para.strip() for para in paras if para.strip()]
-def number_segments(s: str, len: int = 1) -> str:
+def split_newlines(text: str) -> List[str]:
+    """
+    Split the input text into lines using "\n" as the delimiter.
+    Args:
+        text (str): The input text.
+    Returns:
+        list: A list of lines.
+    """
+    lines = re.split(r"\n", text)
+    return [line.strip() for line in lines if line.strip()]
+def number_segments(s: str, granularity: int = 1) -> str:
     """
     Number the segments in a given text, preserving paragraph structure.
-    A segment is a sequence of `len` consecutive sentences.
+    A segment is a sequence of `len` consecutive "sentences", where a "sentence"
+    is either a normal sentence, or if there isn't enough punctuation to properly
+    identify sentences, then we use a pseudo-sentence via heuristics (split by newline
+    or failing that, just split every 40 words). The goal here is simply to number
+    segments at a reasonable granularity so the LLM can identify relevant segments,
+    in the RelevanceExtractorAgent.
     Args:
         s (str): The input text.
-        len (int): The number of sentences in a segment.
+        granularity (int): The number of sentences in a segment.
             If this is -1, then the entire text is treated as a single segment,
             and is numbered as <#1#>.
@@ -119,7 +138,7 @@ def number_segments(s: str, len: int = 1) -> str:
         >>> number_segments("Hello world! How are you? Have a good day.")
         '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
     """
-    if len < 0:
+    if granularity < 0:
         return "<#1#> " + s
     numbered_text = []
     count = 0
@@ -127,9 +146,34 @@ def number_segments(s: str, len: int = 1) -> str:
     paragraphs = split_paragraphs(s)
     for paragraph in paragraphs:
         sentences = nltk.sent_tokenize(paragraph)
+        # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
+        # so we can't split usefully into sentences.
+        # We try a series of heuristics to split into sentences,
+        # until the avg num words per sentence is less than 40.
+        avg_words_per_sentence = sum(
+            len(nltk.word_tokenize(sentence)) for sentence in sentences
+        ) / len(sentences)
+        if avg_words_per_sentence > 40:
+            sentences = split_newlines(paragraph)
+        avg_words_per_sentence = sum(
+            len(nltk.word_tokenize(sentence)) for sentence in sentences
+        ) / len(sentences)
+        if avg_words_per_sentence > 40:
+            # Still too long, just split on every 40 words
+            sentences = []
+            for sentence in nltk.sent_tokenize(paragraph):
+                words = nltk.word_tokenize(sentence)
+                for i in range(0, len(words), 40):
+                    # if there are less than 20 words left after this,
+                    # just add them to the last sentence and break
+                    if len(words) - i < 20:
+                        sentences.append(" ".join(words[i:]))
+                        break
+                    else:
+                        sentences.append(" ".join(words[i : i + 40]))
         for i, sentence in enumerate(sentences):
-            num = count // len + 1
-            number_prefix = f"<#{num}#>" if count % len == 0 else ""
+            num = count // granularity + 1
+            number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
             sentence = f"{number_prefix} {sentence}"
             count += 1
             sentences[i] = sentence
@@ -140,7 +184,7 @@ def number_segments(s: str, len: int = 1) -> str:
 def number_sentences(s: str) -> str:
-    return number_segments(s, len=1)
+    return number_segments(s, granularity=1)
 def parse_number_range_list(specs: str) -> List[int]:

{langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.161
+Version: 0.1.162
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani

{langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/RECORD RENAMED Viewed

@@ -70,7 +70,7 @@ langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz
 langroid/parsing/url_loader.py,sha256=RZCX1RJuQpTatJjBOU74_gJ5Ab7xwarRmFh5ON4n_G4,2279
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
 langroid/parsing/urls.py,sha256=Nv4yCWQLLBEjaiRdaZZVQNBEl_cfK_V6cVuPm91wGtU,7686
-langroid/parsing/utils.py,sha256=AaUt7mnQ-VNBI-pIDr-ZtprmeKHOv0LwdonaPxmI47g,7801
+langroid/parsing/utils.py,sha256=g5tRl0HWLXYzkiwYdMfreamzG76tK6ieiUqPNx35ln4,9845
 langroid/parsing/web_search.py,sha256=hGUVoSJNdpoT5rsm-ikAteMiUropHrzKaxN8EVVqO2U,2496
 langroid/prompts/__init__.py,sha256=aTW86CbDZM7tntqiTVeNLYJv7pbRDcKOI3qHVXCEHUY,99
 langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
@@ -103,7 +103,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
 langroid/vector_store/momento.py,sha256=j6Eo6oIDN2fe7lsBOlCXJn3uvvERHHTFL5QJfeREeOM,10044
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=qt7Dye6rcgoe0551WzmOxRGIlJfL87D4MX7HdqxuEok,13393
-langroid-0.1.161.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.1.161.dist-info/METADATA,sha256=HfW5EbqWr_y-7LTBCwVUydCsTNLpy8DFLD565RgpDXM,42745
-langroid-0.1.161.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-langroid-0.1.161.dist-info/RECORD,,
+langroid-0.1.162.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.162.dist-info/METADATA,sha256=j6ZBZx4nLwIX4NNMNpwu4iDYIxtD6lOFjTFZ3n53zic,42745
+langroid-0.1.162.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.1.162.dist-info/RECORD,,

{langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/LICENSE RENAMED Viewed

File without changes

{langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/WHEEL RENAMED Viewed

File without changes

langroid 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl

langroid 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl