langroid 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +115 -28
- langroid/parsing/search.py +14 -11
- {langroid-0.12.0.dist-info → langroid-0.14.0.dist-info}/METADATA +3 -1
- {langroid-0.12.0.dist-info → langroid-0.14.0.dist-info}/RECORD +7 -7
- pyproject.toml +1 -1
- {langroid-0.12.0.dist-info → langroid-0.14.0.dist-info}/LICENSE +0 -0
- {langroid-0.12.0.dist-info → langroid-0.14.0.dist-info}/WHEEL +0 -0
@@ -14,6 +14,7 @@ pip install "langroid[hf-embeddings]"
|
|
14
14
|
"""
|
15
15
|
|
16
16
|
import logging
|
17
|
+
from collections import OrderedDict
|
17
18
|
from functools import cache
|
18
19
|
from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
|
19
20
|
|
@@ -49,7 +50,6 @@ from langroid.parsing.search import (
|
|
49
50
|
from langroid.parsing.table_loader import describe_dataframe
|
50
51
|
from langroid.parsing.url_loader import URLLoader
|
51
52
|
from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
|
52
|
-
from langroid.parsing.utils import batched
|
53
53
|
from langroid.prompts.prompts_config import PromptsConfig
|
54
54
|
from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
|
55
55
|
from langroid.utils.constants import NO_ANSWER
|
@@ -131,13 +131,16 @@ class DocChatAgentConfig(ChatAgentConfig):
|
|
131
131
|
n_fuzzy_neighbor_words: int = 100 # num neighbor words to retrieve for fuzzy match
|
132
132
|
use_fuzzy_match: bool = True
|
133
133
|
use_bm25_search: bool = True
|
134
|
+
use_reciprocal_rank_fusion: bool = True # ignored if using cross-encoder reranking
|
134
135
|
cross_encoder_reranking_model: str = (
|
135
136
|
"cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
|
136
137
|
)
|
137
138
|
rerank_diversity: bool = True # rerank to maximize diversity?
|
138
139
|
rerank_periphery: bool = True # rerank to avoid Lost In the Middle effect?
|
139
140
|
rerank_after_adding_context: bool = True # rerank after adding context window?
|
140
|
-
|
141
|
+
# RRF (Reciprocal Rank Fusion) score = 1/(rank + reciprocal_rank_fusion_constant)
|
142
|
+
# see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works
|
143
|
+
reciprocal_rank_fusion_constant: float = 60.0
|
141
144
|
cache: bool = True # cache results
|
142
145
|
debug: bool = False
|
143
146
|
stream: bool = True # allow streaming where needed
|
@@ -400,7 +403,11 @@ class DocChatAgent(ChatAgent):
|
|
400
403
|
if split:
|
401
404
|
docs = self.parser.split(docs)
|
402
405
|
else:
|
403
|
-
self.
|
406
|
+
if self.config.n_neighbor_chunks > 0:
|
407
|
+
self.parser.add_window_ids(docs)
|
408
|
+
# we're not splitting, so we mark each doc as a chunk
|
409
|
+
for d in docs:
|
410
|
+
d.metadata.is_chunk = True
|
404
411
|
if self.vecdb is None:
|
405
412
|
raise ValueError("VecDB not set")
|
406
413
|
|
@@ -422,10 +429,9 @@ class DocChatAgent(ChatAgent):
|
|
422
429
|
+ d.content
|
423
430
|
)
|
424
431
|
docs = docs[: self.config.parsing.max_chunks]
|
425
|
-
#
|
426
|
-
|
427
|
-
|
428
|
-
self.vecdb.add_documents(batch)
|
432
|
+
# vecdb should take care of adding docs in batches;
|
433
|
+
# batching can be controlled via vecdb.config.batch_size
|
434
|
+
self.vecdb.add_documents(docs)
|
429
435
|
self.original_docs_length = self.doc_length(docs)
|
430
436
|
self.setup_documents(docs, filter=self.config.filter)
|
431
437
|
return len(docs)
|
@@ -894,7 +900,9 @@ class DocChatAgent(ChatAgent):
|
|
894
900
|
)
|
895
901
|
return docs_scores
|
896
902
|
|
897
|
-
def get_fuzzy_matches(
|
903
|
+
def get_fuzzy_matches(
|
904
|
+
self, query: str, multiple: int
|
905
|
+
) -> List[Tuple[Document, float]]:
|
898
906
|
# find similar docs using fuzzy matching:
|
899
907
|
# these may sometimes be more likely to contain a relevant verbatim extract
|
900
908
|
with status("[cyan]Finding fuzzy matches in chunks..."):
|
@@ -909,8 +917,8 @@ class DocChatAgent(ChatAgent):
|
|
909
917
|
self.chunked_docs,
|
910
918
|
self.chunked_docs_clean,
|
911
919
|
k=self.config.parsing.n_similar_docs * multiple,
|
912
|
-
words_before=self.config.n_fuzzy_neighbor_words,
|
913
|
-
words_after=self.config.n_fuzzy_neighbor_words,
|
920
|
+
words_before=self.config.n_fuzzy_neighbor_words or None,
|
921
|
+
words_after=self.config.n_fuzzy_neighbor_words or None,
|
914
922
|
)
|
915
923
|
return fuzzy_match_docs
|
916
924
|
|
@@ -1102,10 +1110,17 @@ class DocChatAgent(ChatAgent):
|
|
1102
1110
|
Returns:
|
1103
1111
|
|
1104
1112
|
"""
|
1105
|
-
# if we are using cross-encoder reranking
|
1106
|
-
# during retrieval, and leave it to the cross-encoder
|
1107
|
-
# to whittle down to self.config.parsing.n_similar_docs
|
1108
|
-
retrieval_multiple =
|
1113
|
+
# if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
|
1114
|
+
# we can retrieve more docs during retrieval, and leave it to the cross-encoder
|
1115
|
+
# or RRF reranking to whittle down to self.config.parsing.n_similar_docs
|
1116
|
+
retrieval_multiple = (
|
1117
|
+
1
|
1118
|
+
if (
|
1119
|
+
self.config.cross_encoder_reranking_model == ""
|
1120
|
+
and not self.config.use_reciprocal_rank_fusion
|
1121
|
+
)
|
1122
|
+
else 3
|
1123
|
+
)
|
1109
1124
|
|
1110
1125
|
if self.vecdb is None:
|
1111
1126
|
raise ValueError("VecDB not set")
|
@@ -1117,26 +1132,98 @@ class DocChatAgent(ChatAgent):
|
|
1117
1132
|
q,
|
1118
1133
|
k=self.config.parsing.n_similar_docs * retrieval_multiple,
|
1119
1134
|
)
|
1135
|
+
# sort by score descending
|
1136
|
+
docs_and_scores = sorted(
|
1137
|
+
docs_and_scores, key=lambda x: x[1], reverse=True
|
1138
|
+
)
|
1139
|
+
|
1120
1140
|
# keep only docs with unique d.id()
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
# Document(content=d.content, metadata=d.metadata)
|
1126
|
-
# for (d, _) in docs_and_scores
|
1127
|
-
# ]
|
1141
|
+
id2_rank_semantic = {d.id(): i for i, (d, _) in enumerate(docs_and_scores)}
|
1142
|
+
id2doc = {d.id(): d for d, _ in docs_and_scores}
|
1143
|
+
# make sure we get unique docs
|
1144
|
+
passages = [id2doc[id] for id, _ in id2_rank_semantic.items()]
|
1128
1145
|
|
1146
|
+
id2_rank_bm25 = {}
|
1129
1147
|
if self.config.use_bm25_search:
|
1148
|
+
# TODO: Add score threshold in config
|
1130
1149
|
docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
|
1131
|
-
|
1150
|
+
if self.config.cross_encoder_reranking_model == "":
|
1151
|
+
# only if we're not re-ranking with a cross-encoder,
|
1152
|
+
# we collect these ranks for Reciprocal Rank Fusion down below.
|
1153
|
+
docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
|
1154
|
+
id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
|
1155
|
+
id2doc.update({d.id(): d for d, _ in docs_scores})
|
1156
|
+
else:
|
1157
|
+
passages += [d for (d, _) in docs_scores]
|
1132
1158
|
|
1159
|
+
id2_rank_fuzzy = {}
|
1133
1160
|
if self.config.use_fuzzy_match:
|
1134
|
-
|
1135
|
-
|
1161
|
+
# TODO: Add score threshold in config
|
1162
|
+
fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
|
1163
|
+
if self.config.cross_encoder_reranking_model == "":
|
1164
|
+
# only if we're not re-ranking with a cross-encoder,
|
1165
|
+
# we collect these ranks for Reciprocal Rank Fusion down below.
|
1166
|
+
fuzzy_match_doc_scores = sorted(
|
1167
|
+
fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
|
1168
|
+
)
|
1169
|
+
id2_rank_fuzzy = {
|
1170
|
+
d.id(): i for i, (d, _) in enumerate(fuzzy_match_doc_scores)
|
1171
|
+
}
|
1172
|
+
id2doc.update({d.id(): d for d, _ in fuzzy_match_doc_scores})
|
1173
|
+
else:
|
1174
|
+
passages += [d for (d, _) in fuzzy_match_doc_scores]
|
1136
1175
|
|
1137
|
-
|
1138
|
-
|
1139
|
-
|
1176
|
+
if (
|
1177
|
+
self.config.cross_encoder_reranking_model == ""
|
1178
|
+
and self.config.use_reciprocal_rank_fusion
|
1179
|
+
and (self.config.use_bm25_search or self.config.use_fuzzy_match)
|
1180
|
+
):
|
1181
|
+
# Since we're not using cross-enocder re-ranking,
|
1182
|
+
# we need to re-order the retrieved chunks from potentially three
|
1183
|
+
# different retrieval methods (semantic, bm25, fuzzy), where the
|
1184
|
+
# similarity scores are on different scales.
|
1185
|
+
# We order the retrieved chunks using Reciprocal Rank Fusion (RRF) score.
|
1186
|
+
# Combine the ranks from each id2_rank_* dict into a single dict,
|
1187
|
+
# where the reciprocal rank score is the sum of
|
1188
|
+
# 1/(rank + self.config.reciprocal_rank_fusion_constant).
|
1189
|
+
# See https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
|
1190
|
+
#
|
1191
|
+
# Note: diversity/periphery-reranking below may modify the final ranking.
|
1192
|
+
id2_reciprocal_score = {}
|
1193
|
+
for id_ in (
|
1194
|
+
set(id2_rank_semantic.keys())
|
1195
|
+
| set(id2_rank_bm25.keys())
|
1196
|
+
| set(id2_rank_fuzzy.keys())
|
1197
|
+
):
|
1198
|
+
rank_semantic = id2_rank_semantic.get(id_, float("inf"))
|
1199
|
+
rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
|
1200
|
+
rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
|
1201
|
+
c = self.config.reciprocal_rank_fusion_constant
|
1202
|
+
reciprocal_fusion_score = (
|
1203
|
+
1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
|
1204
|
+
)
|
1205
|
+
id2_reciprocal_score[id_] = reciprocal_fusion_score
|
1206
|
+
|
1207
|
+
# sort the docs by the reciprocal score, in descending order
|
1208
|
+
id2_reciprocal_score = OrderedDict(
|
1209
|
+
sorted(
|
1210
|
+
id2_reciprocal_score.items(),
|
1211
|
+
key=lambda x: x[1],
|
1212
|
+
reverse=True,
|
1213
|
+
)
|
1214
|
+
)
|
1215
|
+
# each method retrieved up to retrieval_multiple * n_similar_docs,
|
1216
|
+
# so we need to take the top n_similar_docs from the combined list
|
1217
|
+
passages = [
|
1218
|
+
id2doc[id]
|
1219
|
+
for i, (id, _) in enumerate(id2_reciprocal_score.items())
|
1220
|
+
if i < self.config.parsing.n_similar_docs
|
1221
|
+
]
|
1222
|
+
# passages must have distinct ids
|
1223
|
+
assert len(passages) == len(set([d.id() for d in passages])), (
|
1224
|
+
f"Duplicate passages in retrieved docs: {len(passages)} != "
|
1225
|
+
f"{len(set([d.id() for d in passages]))}"
|
1226
|
+
)
|
1140
1227
|
|
1141
1228
|
if len(passages) == 0:
|
1142
1229
|
return []
|
@@ -1166,7 +1253,7 @@ class DocChatAgent(ChatAgent):
|
|
1166
1253
|
passages_scores = self.add_context_window(passages_scores)
|
1167
1254
|
passages = [p for p, _ in passages_scores]
|
1168
1255
|
|
1169
|
-
return passages
|
1256
|
+
return passages[: self.config.parsing.n_similar_docs]
|
1170
1257
|
|
1171
1258
|
@no_type_check
|
1172
1259
|
def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:
|
langroid/parsing/search.py
CHANGED
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
|
|
27
27
|
k: int,
|
28
28
|
words_before: int | None = None,
|
29
29
|
words_after: int | None = None,
|
30
|
-
) -> List[Document]:
|
30
|
+
) -> List[Tuple[Document, float]]:
|
31
31
|
"""
|
32
32
|
Find approximate matches of the query in the docs and return surrounding
|
33
33
|
characters.
|
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
|
|
35
35
|
Args:
|
36
36
|
query (str): The search string.
|
37
37
|
docs (List[Document]): List of Document objects to search through.
|
38
|
+
docs_clean (List[Document]): List of Document objects with cleaned content.
|
38
39
|
k (int): Number of best matches to return.
|
39
40
|
words_before (int|None): Number of words to include before each match.
|
40
41
|
Default None => return max
|
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
|
|
42
43
|
Default None => return max
|
43
44
|
|
44
45
|
Returns:
|
45
|
-
List[Document]: List of
|
46
|
-
including the given number of words around the match.
|
46
|
+
List[Tuple[Document,float]]: List of (Document, score) tuples.
|
47
47
|
"""
|
48
48
|
if len(docs) == 0:
|
49
49
|
return []
|
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
|
|
54
54
|
scorer=fuzz.partial_ratio,
|
55
55
|
)
|
56
56
|
|
57
|
-
real_matches = [m for m, score in best_matches if score > 50]
|
57
|
+
real_matches = [(m, score) for m, score in best_matches if score > 50]
|
58
58
|
# find the original docs corresponding to the matches
|
59
59
|
orig_doc_matches = []
|
60
|
-
for i, m in enumerate(real_matches):
|
60
|
+
for i, (m, s) in enumerate(real_matches):
|
61
61
|
for j, doc_clean in enumerate(docs_clean):
|
62
62
|
if m in doc_clean.content:
|
63
|
-
orig_doc_matches.append(docs[j])
|
63
|
+
orig_doc_matches.append((docs[j], s))
|
64
64
|
break
|
65
65
|
if words_after is None and words_before is None:
|
66
66
|
return orig_doc_matches
|
67
67
|
if len(orig_doc_matches) == 0:
|
68
68
|
return []
|
69
|
-
if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
|
69
|
+
if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
|
70
70
|
# If there are fields beyond just content and metadata,
|
71
71
|
# we do NOT want to create new document objects with content fields
|
72
72
|
# based on words_before and words_after, since we don't know how to
|
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
|
|
74
74
|
return orig_doc_matches
|
75
75
|
|
76
76
|
contextual_matches = []
|
77
|
-
for match in orig_doc_matches:
|
77
|
+
for match, score in orig_doc_matches:
|
78
78
|
choice_text = match.content
|
79
79
|
contexts = []
|
80
80
|
while choice_text != "":
|
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
|
|
89
89
|
choice_text = " ".join(words[end_pos:])
|
90
90
|
if len(contexts) > 0:
|
91
91
|
contextual_matches.append(
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
(
|
93
|
+
Document(
|
94
|
+
content=" ... ".join(contexts),
|
95
|
+
metadata=match.metadata,
|
96
|
+
),
|
97
|
+
score,
|
95
98
|
)
|
96
99
|
)
|
97
100
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.12.0
|
3
|
+
Version: 0.14.0
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
License: MIT
|
6
6
|
Author: Prasad Chalasani
|
@@ -153,6 +153,8 @@ This Multi-Agent paradigm is inspired by the
|
|
153
153
|
`Langroid` is a fresh take on LLM app-development, where considerable thought has gone
|
154
154
|
into simplifying the developer experience; it does not use `Langchain`.
|
155
155
|
|
156
|
+
:fire: Read the (WIP) [overview of the langroid architecture](https://langroid.github.io/langroid/blog/2024/08/15/overview-of-langroids-multi-agent-architecture-prelim/)
|
157
|
+
|
156
158
|
📢 Companies are using/adapting Langroid in **production**. Here is a quote:
|
157
159
|
|
158
160
|
>[Nullify](https://www.nullify.ai) uses AI Agents for secure software development.
|
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
|
11
11
|
langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
|
12
12
|
langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
|
13
|
-
langroid/agent/special/doc_chat_agent.py,sha256=
|
13
|
+
langroid/agent/special/doc_chat_agent.py,sha256=r1uPunYf2lQcqYQ4fsD8Q5gB9cZyf7cn0KPcR_CLtrU,59065
|
14
14
|
langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
|
15
15
|
langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
|
16
16
|
langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
|
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
|
|
91
91
|
langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
|
92
92
|
langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
|
93
93
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
94
|
-
langroid/parsing/search.py,sha256=
|
94
|
+
langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
|
95
95
|
langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
|
96
96
|
langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
|
97
97
|
langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
|
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
|
|
137
137
|
langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
|
138
138
|
langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
|
139
139
|
langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
|
140
|
-
pyproject.toml,sha256=
|
141
|
-
langroid-0.
|
142
|
-
langroid-0.
|
143
|
-
langroid-0.
|
144
|
-
langroid-0.
|
140
|
+
pyproject.toml,sha256=W5AMGnCoX4SvE5HYNJlJcernYJ-sbIVoVmfpVifMMm8,7107
|
141
|
+
langroid-0.14.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
142
|
+
langroid-0.14.0.dist-info/METADATA,sha256=hEJyAJh8I1K9102zVxSya1pVgXxTUNkPXKo__JUtf54,55430
|
143
|
+
langroid-0.14.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
|
144
|
+
langroid-0.14.0.dist-info/RECORD,,
|
pyproject.toml
CHANGED
File without changes
|
File without changes
|