langroid 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/parsing/utils.py CHANGED
@@ -101,14 +101,33 @@ def split_paragraphs(text: str) -> List[str]:
101
101
  return [para.strip() for para in paras if para.strip()]
102
102
 
103
103
 
104
- def number_segments(s: str, len: int = 1) -> str:
104
+ def split_newlines(text: str) -> List[str]:
105
+ """
106
+ Split the input text into lines using "\n" as the delimiter.
107
+
108
+ Args:
109
+ text (str): The input text.
110
+
111
+ Returns:
112
+ list: A list of lines.
113
+ """
114
+ lines = re.split(r"\n", text)
115
+ return [line.strip() for line in lines if line.strip()]
116
+
117
+
118
+ def number_segments(s: str, granularity: int = 1) -> str:
105
119
  """
106
120
  Number the segments in a given text, preserving paragraph structure.
107
- A segment is a sequence of `len` consecutive sentences.
121
+ A segment is a sequence of `granularity` consecutive "sentences", where a "sentence"
122
+ is either a normal sentence, or if there isn't enough punctuation to properly
123
+ identify sentences, then we use a pseudo-sentence via heuristics (split by newline
124
+ or failing that, just split every 40 words). The goal here is simply to number
125
+ segments at a reasonable granularity so the LLM can identify relevant segments,
126
+ in the RelevanceExtractorAgent.
108
127
 
109
128
  Args:
110
129
  s (str): The input text.
111
- len (int): The number of sentences in a segment.
130
+ granularity (int): The number of sentences in a segment.
112
131
  If this is -1, then the entire text is treated as a single segment,
113
132
  and is numbered as <#1#>.
114
133
 
@@ -119,7 +138,7 @@ def number_segments(s: str, len: int = 1) -> str:
119
138
  >>> number_segments("Hello world! How are you? Have a good day.")
120
139
  '<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
121
140
  """
122
- if len < 0:
141
+ if granularity < 0:
123
142
  return "<#1#> " + s
124
143
  numbered_text = []
125
144
  count = 0
@@ -127,9 +146,34 @@ def number_segments(s: str, len: int = 1) -> str:
127
146
  paragraphs = split_paragraphs(s)
128
147
  for paragraph in paragraphs:
129
148
  sentences = nltk.sent_tokenize(paragraph)
149
+ # Some docs are problematic (e.g. resumes) and have no (or too few) periods,
150
+ # so we can't split usefully into sentences.
151
+ # We try a series of heuristics to split into sentences,
152
+ # until the avg num words per sentence is less than 40.
153
+ avg_words_per_sentence = sum(
154
+ len(nltk.word_tokenize(sentence)) for sentence in sentences
155
+ ) / len(sentences)
156
+ if avg_words_per_sentence > 40:
157
+ sentences = split_newlines(paragraph)
158
+ avg_words_per_sentence = sum(
159
+ len(nltk.word_tokenize(sentence)) for sentence in sentences
160
+ ) / len(sentences)
161
+ if avg_words_per_sentence > 40:
162
+ # Still too long, just split on every 40 words
163
+ sentences = []
164
+ for sentence in nltk.sent_tokenize(paragraph):
165
+ words = nltk.word_tokenize(sentence)
166
+ for i in range(0, len(words), 40):
167
+ # if there are fewer than 20 words left after this,
168
+ # just add them to the last sentence and break
169
+ if len(words) - i < 20:
170
+ sentences.append(" ".join(words[i:]))
171
+ break
172
+ else:
173
+ sentences.append(" ".join(words[i : i + 40]))
130
174
  for i, sentence in enumerate(sentences):
131
- num = count // len + 1
132
- number_prefix = f"<#{num}#>" if count % len == 0 else ""
175
+ num = count // granularity + 1
176
+ number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
133
177
  sentence = f"{number_prefix} {sentence}"
134
178
  count += 1
135
179
  sentences[i] = sentence
@@ -140,7 +184,7 @@ def number_segments(s: str, len: int = 1) -> str:
140
184
 
141
185
 
142
186
  def number_sentences(s: str) -> str:
143
- return number_segments(s, len=1)
187
+ return number_segments(s, granularity=1)
144
188
 
145
189
 
146
190
  def parse_number_range_list(specs: str) -> List[int]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langroid
3
- Version: 0.1.161
3
+ Version: 0.1.162
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  License: MIT
6
6
  Author: Prasad Chalasani
@@ -70,7 +70,7 @@ langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz
70
70
  langroid/parsing/url_loader.py,sha256=RZCX1RJuQpTatJjBOU74_gJ5Ab7xwarRmFh5ON4n_G4,2279
71
71
  langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
72
72
  langroid/parsing/urls.py,sha256=Nv4yCWQLLBEjaiRdaZZVQNBEl_cfK_V6cVuPm91wGtU,7686
73
- langroid/parsing/utils.py,sha256=AaUt7mnQ-VNBI-pIDr-ZtprmeKHOv0LwdonaPxmI47g,7801
73
+ langroid/parsing/utils.py,sha256=g5tRl0HWLXYzkiwYdMfreamzG76tK6ieiUqPNx35ln4,9845
74
74
  langroid/parsing/web_search.py,sha256=hGUVoSJNdpoT5rsm-ikAteMiUropHrzKaxN8EVVqO2U,2496
75
75
  langroid/prompts/__init__.py,sha256=aTW86CbDZM7tntqiTVeNLYJv7pbRDcKOI3qHVXCEHUY,99
76
76
  langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
@@ -103,7 +103,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
103
103
  langroid/vector_store/momento.py,sha256=j6Eo6oIDN2fe7lsBOlCXJn3uvvERHHTFL5QJfeREeOM,10044
104
104
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
105
105
  langroid/vector_store/qdrantdb.py,sha256=qt7Dye6rcgoe0551WzmOxRGIlJfL87D4MX7HdqxuEok,13393
106
- langroid-0.1.161.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
107
- langroid-0.1.161.dist-info/METADATA,sha256=HfW5EbqWr_y-7LTBCwVUydCsTNLpy8DFLD565RgpDXM,42745
108
- langroid-0.1.161.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
109
- langroid-0.1.161.dist-info/RECORD,,
106
+ langroid-0.1.162.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
107
+ langroid-0.1.162.dist-info/METADATA,sha256=j6ZBZx4nLwIX4NNMNpwu4iDYIxtD6lOFjTFZ3n53zic,42745
108
+ langroid-0.1.162.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
109
+ langroid-0.1.162.dist-info/RECORD,,