langroid 0.1.161__py3-none-any.whl → 0.1.162__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/parsing/utils.py +51 -7
- {langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/METADATA +1 -1
- {langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/RECORD +5 -5
- {langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/LICENSE +0 -0
- {langroid-0.1.161.dist-info → langroid-0.1.162.dist-info}/WHEEL +0 -0
langroid/parsing/utils.py
CHANGED
@@ -101,14 +101,33 @@ def split_paragraphs(text: str) -> List[str]:
|
|
101
101
|
return [para.strip() for para in paras if para.strip()]
|
102
102
|
|
103
103
|
|
104
|
-
def
|
104
|
+
def split_newlines(text: str) -> List[str]:
|
105
|
+
"""
|
106
|
+
Split the input text into lines using "\n" as the delimiter.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
text (str): The input text.
|
110
|
+
|
111
|
+
Returns:
|
112
|
+
list: A list of lines.
|
113
|
+
"""
|
114
|
+
lines = re.split(r"\n", text)
|
115
|
+
return [line.strip() for line in lines if line.strip()]
|
116
|
+
|
117
|
+
|
118
|
+
def number_segments(s: str, granularity: int = 1) -> str:
|
105
119
|
"""
|
106
120
|
Number the segments in a given text, preserving paragraph structure.
|
107
|
-
A segment is a sequence of `len` consecutive sentences
|
121
|
+
A segment is a sequence of `len` consecutive "sentences", where a "sentence"
|
122
|
+
is either a normal sentence, or if there isn't enough punctuation to properly
|
123
|
+
identify sentences, then we use a pseudo-sentence via heuristics (split by newline
|
124
|
+
or failing that, just split every 40 words). The goal here is simply to number
|
125
|
+
segments at a reasonable granularity so the LLM can identify relevant segments,
|
126
|
+
in the RelevanceExtractorAgent.
|
108
127
|
|
109
128
|
Args:
|
110
129
|
s (str): The input text.
|
111
|
-
|
130
|
+
granularity (int): The number of sentences in a segment.
|
112
131
|
If this is -1, then the entire text is treated as a single segment,
|
113
132
|
and is numbered as <#1#>.
|
114
133
|
|
@@ -119,7 +138,7 @@ def number_segments(s: str, len: int = 1) -> str:
|
|
119
138
|
>>> number_segments("Hello world! How are you? Have a good day.")
|
120
139
|
'<#1#> Hello world! <#2#> How are you? <#3#> Have a good day.'
|
121
140
|
"""
|
122
|
-
if
|
141
|
+
if granularity < 0:
|
123
142
|
return "<#1#> " + s
|
124
143
|
numbered_text = []
|
125
144
|
count = 0
|
@@ -127,9 +146,34 @@ def number_segments(s: str, len: int = 1) -> str:
|
|
127
146
|
paragraphs = split_paragraphs(s)
|
128
147
|
for paragraph in paragraphs:
|
129
148
|
sentences = nltk.sent_tokenize(paragraph)
|
149
|
+
# Some docs are problematic (e.g. resumes) and have no (or too few) periods,
|
150
|
+
# so we can't split usefully into sentences.
|
151
|
+
# We try a series of heuristics to split into sentences,
|
152
|
+
# until the avg num words per sentence is less than 40.
|
153
|
+
avg_words_per_sentence = sum(
|
154
|
+
len(nltk.word_tokenize(sentence)) for sentence in sentences
|
155
|
+
) / len(sentences)
|
156
|
+
if avg_words_per_sentence > 40:
|
157
|
+
sentences = split_newlines(paragraph)
|
158
|
+
avg_words_per_sentence = sum(
|
159
|
+
len(nltk.word_tokenize(sentence)) for sentence in sentences
|
160
|
+
) / len(sentences)
|
161
|
+
if avg_words_per_sentence > 40:
|
162
|
+
# Still too long, just split on every 40 words
|
163
|
+
sentences = []
|
164
|
+
for sentence in nltk.sent_tokenize(paragraph):
|
165
|
+
words = nltk.word_tokenize(sentence)
|
166
|
+
for i in range(0, len(words), 40):
|
167
|
+
# if there are less than 20 words left after this,
|
168
|
+
# just add them to the last sentence and break
|
169
|
+
if len(words) - i < 20:
|
170
|
+
sentences.append(" ".join(words[i:]))
|
171
|
+
break
|
172
|
+
else:
|
173
|
+
sentences.append(" ".join(words[i : i + 40]))
|
130
174
|
for i, sentence in enumerate(sentences):
|
131
|
-
num = count //
|
132
|
-
number_prefix = f"<#{num}#>" if count %
|
175
|
+
num = count // granularity + 1
|
176
|
+
number_prefix = f"<#{num}#>" if count % granularity == 0 else ""
|
133
177
|
sentence = f"{number_prefix} {sentence}"
|
134
178
|
count += 1
|
135
179
|
sentences[i] = sentence
|
@@ -140,7 +184,7 @@ def number_segments(s: str, len: int = 1) -> str:
|
|
140
184
|
|
141
185
|
|
142
186
|
def number_sentences(s: str) -> str:
|
143
|
-
return number_segments(s,
|
187
|
+
return number_segments(s, granularity=1)
|
144
188
|
|
145
189
|
|
146
190
|
def parse_number_range_list(specs: str) -> List[int]:
|
@@ -70,7 +70,7 @@ langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz
|
|
70
70
|
langroid/parsing/url_loader.py,sha256=RZCX1RJuQpTatJjBOU74_gJ5Ab7xwarRmFh5ON4n_G4,2279
|
71
71
|
langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
|
72
72
|
langroid/parsing/urls.py,sha256=Nv4yCWQLLBEjaiRdaZZVQNBEl_cfK_V6cVuPm91wGtU,7686
|
73
|
-
langroid/parsing/utils.py,sha256=
|
73
|
+
langroid/parsing/utils.py,sha256=g5tRl0HWLXYzkiwYdMfreamzG76tK6ieiUqPNx35ln4,9845
|
74
74
|
langroid/parsing/web_search.py,sha256=hGUVoSJNdpoT5rsm-ikAteMiUropHrzKaxN8EVVqO2U,2496
|
75
75
|
langroid/prompts/__init__.py,sha256=aTW86CbDZM7tntqiTVeNLYJv7pbRDcKOI3qHVXCEHUY,99
|
76
76
|
langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
|
@@ -103,7 +103,7 @@ langroid/vector_store/meilisearch.py,sha256=d2huA9P-NoYRuAQ9ZeXJmMKr7ry8u90RUSR2
|
|
103
103
|
langroid/vector_store/momento.py,sha256=j6Eo6oIDN2fe7lsBOlCXJn3uvvERHHTFL5QJfeREeOM,10044
|
104
104
|
langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
|
105
105
|
langroid/vector_store/qdrantdb.py,sha256=qt7Dye6rcgoe0551WzmOxRGIlJfL87D4MX7HdqxuEok,13393
|
106
|
-
langroid-0.1.
|
107
|
-
langroid-0.1.
|
108
|
-
langroid-0.1.
|
109
|
-
langroid-0.1.
|
106
|
+
langroid-0.1.162.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
107
|
+
langroid-0.1.162.dist-info/METADATA,sha256=j6ZBZx4nLwIX4NNMNpwu4iDYIxtD6lOFjTFZ3n53zic,42745
|
108
|
+
langroid-0.1.162.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
|
109
|
+
langroid-0.1.162.dist-info/RECORD,,
|
File without changes
|
File without changes
|