langroid 0.1.134__py3-none-any.whl → 0.1.135__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- langroid/parsing/parser.py +31 -29
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/METADATA +1 -1
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/RECORD +5 -5
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/LICENSE +0 -0
- {langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/WHEEL +0 -0
langroid/parsing/parser.py
CHANGED
@@ -52,13 +52,14 @@ class Parser:
         return len(tokens)
 
     def add_window_ids(self, chunks: List[Document]) -> None:
-        """Chunks
-        Add window_ids in metadata"""
+        """Chunks may belong to multiple docs, but for each doc,
+        they appear consecutively. Add window_ids in metadata"""
 
         # The original metadata.id (if any) is ignored since it will be same for all
         # chunks and is useless. We want a distinct id for each chunk.
         orig_ids = [c.metadata.id for c in chunks]
         ids = [Document.hash_id(str(c)) for c in chunks]
+        id2chunk = {id: c for id, c in zip(ids, chunks)}
 
         # group the ids by orig_id
         orig_id_to_ids: Dict[str, List[str]] = {}
@@ -71,9 +72,11 @@ class Parser:
 
         k = self.config.n_neighbor_ids
         for orig, ids in orig_id_to_ids.items():
+            # ids are consecutive chunks in a single doc
            n = len(ids)
             window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
-            for i,
+            for i, _ in enumerate(ids):
+                c = id2chunk[ids[i]]
                 if c.content.strip() == "":
                     continue
                 c.metadata.window_ids = window_ids[i]
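For orientation, the list comprehension above gives each chunk a window of up to k neighbors on each side, clipped at the document boundaries. A minimal standalone sketch of that computation, using plain string ids and a hypothetical k = 2 in place of langroid's Document objects and config:

ids = ["a", "b", "c", "d", "e"]  # chunk ids of one doc, in order (hypothetical)
k = 2  # plays the role of config.n_neighbor_ids (hypothetical value)
n = len(ids)
window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
print(window_ids[0])  # ['a', 'b', 'c']             -- clipped on the left
print(window_ids[2])  # ['a', 'b', 'c', 'd', 'e']   -- full window
print(window_ids[4])  # ['c', 'd', 'e']             -- clipped on the right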
@@ -101,35 +104,35 @@ class Parser:
         return final_docs
 
     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
-        final_chunks = []
         chunks = docs
         while True:
-            if
-            )
+            un_splittables = 0
+            split_chunks = []
+            for c in chunks:
+                if c.content.strip() == "":
+                    continue
+                if self.num_tokens(c.content) <= 1.3 * self.config.chunk_size:
+                    # small chunk: no need to split
+                    split_chunks.append(c)
+                    continue
+                splits = self._split_para_sentence_once([c])
+                un_splittables += len(splits) == 1
+                split_chunks += splits
+            if len(split_chunks) == len(chunks):
+                if un_splittables > 0:
+                    max_len = max([self.num_tokens(p.content) for p in chunks])
+                    logger.warning(
+                        f"""
+                        Unable to split {un_splittables} chunks
+                        using chunk_size = {self.config.chunk_size}.
+                        Max chunk size is {max_len} tokens.
+                        """
+                    )
                 break  # we won't be able to shorten them with current settings
+            chunks = split_chunks.copy()
 
-
+        self.add_window_ids(chunks)
+        return chunks
 
     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
@@ -144,7 +147,6 @@ class Parser:
             for c in chunks
             if c.strip() != ""
         ]
-        self.add_window_ids(chunk_docs)
         final_chunks += chunk_docs
 
         return final_chunks
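Taken together, the rewritten split_para_sentence now loops to a fixed point: each pass re-splits chunks that exceed 1.3 × chunk_size, and the loop stops once a pass produces no new chunks; add_window_ids then runs once on the final list, which is why its call was removed from _split_para_sentence_once above. A minimal sketch of that fixed-point pattern, with hypothetical num_tokens and split_once helpers standing in for langroid's tokenizer and paragraph/sentence splitter:

def num_tokens(s: str) -> int:
    # crude whitespace token count, standing in for a real tokenizer
    return len(s.split())

def split_once(s: str) -> list[str]:
    # split roughly in half; a single word cannot be split further
    words = s.split()
    if len(words) <= 1:
        return [s]
    mid = len(words) // 2
    return [" ".join(words[:mid]), " ".join(words[mid:])]

def split_to_size(chunks: list[str], chunk_size: int) -> list[str]:
    while True:
        split_chunks: list[str] = []
        for c in chunks:
            if num_tokens(c) <= 1.3 * chunk_size:
                split_chunks.append(c)  # small enough: keep as-is
            else:
                split_chunks += split_once(c)
        if len(split_chunks) == len(chunks):
            break  # no chunk was split this pass: fixed point reached
        chunks = split_chunks
    return chunks

print(split_to_size(["one two three four five six seven eight"], 2))
# ['one two', 'three four', 'five six', 'seven eight']

The sketch omits the diff's un_splittables counter and warning, but shows why the loop always terminates: once every oversized chunk is atomic, a pass adds no chunks and the length check trips.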
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/RECORD
CHANGED
@@ -57,7 +57,7 @@ langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/parsing/document_parser.py,sha256=YC3IXQ9ErpBGBZh6Be9gfJWHcTwGTSMfNQMT5ARrj5g,14615
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=3xjnbdsxPR7Y5gN3kclkoNWslldS1CiKBdTdUclsxjI,10264
 langroid/parsing/repo_loader.py,sha256=4qCyRRHCKIYd8F1ghT-D8ko1C2sXpF7UYP1L5Im1hRE,27705
 langroid/parsing/search.py,sha256=xmQdAdTIwZ0REEUeQVFlGZlqf7k8Poah7-ALuyW7Ov0,8440
 langroid/parsing/spider.py,sha256=w_mHR1B4KOmxsBLoVI8kMkMTEbwTzeK3ath9fOMJrTk,3043
@@ -97,7 +97,7 @@ langroid/vector_store/meilisearch.py,sha256=h4e1MZJ9J3EnFfcUhLshK1Duwy1dpHWH4Ajt
 langroid/vector_store/momento.py,sha256=otoUrpgwEduFOCUhbFFSZWKEzWF2di1d4-m3n5PIuHI,9964
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=ZEJnlNIJwWIySGhPz3jBc6spcLCPqOcUDBYBisLF90I,11379
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid-0.1.135.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.135.dist-info/METADATA,sha256=fTE3Rlvjlw1MbBxay8Wr7Hp2KHqAveo2W-sEjqskeTc,42646
+langroid-0.1.135.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.1.135.dist-info/RECORD,,
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/LICENSE
File without changes
{langroid-0.1.134.dist-info → langroid-0.1.135.dist-info}/WHEEL
File without changes