langroid 0.12.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,7 @@ pip install "langroid[hf-embeddings]"
  """
 
  import logging
+ from collections import OrderedDict
  from functools import cache
  from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
 
@@ -49,7 +50,6 @@ from langroid.parsing.search import (
  from langroid.parsing.table_loader import describe_dataframe
  from langroid.parsing.url_loader import URLLoader
  from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
- from langroid.parsing.utils import batched
  from langroid.prompts.prompts_config import PromptsConfig
  from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
  from langroid.utils.constants import NO_ANSWER
@@ -131,13 +131,16 @@ class DocChatAgentConfig(ChatAgentConfig):
  n_fuzzy_neighbor_words: int = 100 # num neighbor words to retrieve for fuzzy match
  use_fuzzy_match: bool = True
  use_bm25_search: bool = True
+ use_reciprocal_rank_fusion: bool = True # ignored if using cross-encoder reranking
  cross_encoder_reranking_model: str = (
  "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
  )
  rerank_diversity: bool = True # rerank to maximize diversity?
  rerank_periphery: bool = True # rerank to avoid Lost In the Middle effect?
  rerank_after_adding_context: bool = True # rerank after adding context window?
- embed_batch_size: int = 500 # get embedding of at most this many at a time
+ # RRF (Reciprocal Rank Fusion) score = 1/(rank + reciprocal_rank_fusion_constant)
+ # see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works
+ reciprocal_rank_fusion_constant: float = 60.0
  cache: bool = True # cache results
  debug: bool = False
  stream: bool = True # allow streaming where needed
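(Editor's note, not part of the diff: the new reciprocal_rank_fusion_constant feeds the standard RRF formula quoted in the comment above. A minimal sketch with made-up ranks shows how the score behaves with the default constant of 60.)

    # Illustration only: RRF score for a chunk ranked 0 by semantic retrieval
    # and 2 by BM25, using the default constant of 60.
    c = 60.0
    rank_semantic, rank_bm25 = 0, 2
    rrf_score = 1 / (rank_semantic + c) + 1 / (rank_bm25 + c)
    print(round(rrf_score, 4))  # 0.0328 -- larger fused scores rank earlier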
@@ -400,7 +403,11 @@ class DocChatAgent(ChatAgent):
  if split:
  docs = self.parser.split(docs)
  else:
- self.parser.add_window_ids(docs)
+ if self.config.n_neighbor_chunks > 0:
+ self.parser.add_window_ids(docs)
+ # we're not splitting, so we mark each doc as a chunk
+ for d in docs:
+ d.metadata.is_chunk = True
  if self.vecdb is None:
  raise ValueError("VecDB not set")
 
@@ -422,10 +429,9 @@ class DocChatAgent(ChatAgent):
  + d.content
  )
  docs = docs[: self.config.parsing.max_chunks]
- # add embeddings in batches, to stay under limit of embeddings API
- batches = list(batched(docs, self.config.embed_batch_size))
- for batch in batches:
- self.vecdb.add_documents(batch)
+ # vecdb should take care of adding docs in batches;
+ # batching can be controlled via vecdb.config.batch_size
+ self.vecdb.add_documents(docs)
  self.original_docs_length = self.doc_length(docs)
  self.setup_documents(docs, filter=self.config.filter)
  return len(docs)
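(Editor's note, not part of the diff: with embed_batch_size removed from DocChatAgentConfig, batching is the vector store's concern per the new comment. A hypothetical sketch of where the knob now lives; the class and field names below are assumptions, so verify them against your vector store's config.)

    # Hypothetical sketch: batch size is configured on the vector-store config,
    # not on DocChatAgentConfig (which no longer has embed_batch_size).
    from langroid.agent.special.doc_chat_agent import DocChatAgentConfig
    from langroid.vector_store.qdrantdb import QdrantDBConfig

    config = DocChatAgentConfig(
        vecdb=QdrantDBConfig(
            collection_name="my-docs",
            batch_size=500,  # roughly the role embed_batch_size used to play
        ),
    )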
@@ -894,7 +900,9 @@ class DocChatAgent(ChatAgent):
  )
  return docs_scores
 
- def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
+ def get_fuzzy_matches(
+ self, query: str, multiple: int
+ ) -> List[Tuple[Document, float]]:
  # find similar docs using fuzzy matching:
  # these may sometimes be more likely to contain a relevant verbatim extract
  with status("[cyan]Finding fuzzy matches in chunks..."):
@@ -909,8 +917,8 @@ class DocChatAgent(ChatAgent):
  self.chunked_docs,
  self.chunked_docs_clean,
  k=self.config.parsing.n_similar_docs * multiple,
- words_before=self.config.n_fuzzy_neighbor_words,
- words_after=self.config.n_fuzzy_neighbor_words,
+ words_before=self.config.n_fuzzy_neighbor_words or None,
+ words_after=self.config.n_fuzzy_neighbor_words or None,
  )
  return fuzzy_match_docs
 
@@ -1102,10 +1110,17 @@ class DocChatAgent(ChatAgent):
  Returns:
 
  """
- # if we are using cross-encoder reranking, we can retrieve more docs
- # during retrieval, and leave it to the cross-encoder re-ranking
- # to whittle down to self.config.parsing.n_similar_docs
- retrieval_multiple = 1 if self.config.cross_encoder_reranking_model == "" else 3
+ # if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
+ # we can retrieve more docs during retrieval, and leave it to the cross-encoder
+ # or RRF reranking to whittle down to self.config.parsing.n_similar_docs
+ retrieval_multiple = (
+ 1
+ if (
+ self.config.cross_encoder_reranking_model == ""
+ and not self.config.use_reciprocal_rank_fusion
+ )
+ else 3
+ )
 
  if self.vecdb is None:
  raise ValueError("VecDB not set")
@@ -1117,26 +1132,98 @@ class DocChatAgent(ChatAgent):
  q,
  k=self.config.parsing.n_similar_docs * retrieval_multiple,
  )
+ # sort by score descending
+ docs_and_scores = sorted(
+ docs_and_scores, key=lambda x: x[1], reverse=True
+ )
+
  # keep only docs with unique d.id()
- id2doc_score = {d.id(): (d, s) for d, s in docs_and_scores}
- docs_and_scores = list(id2doc_score.values())
- passages = [d for (d, _) in docs_and_scores]
- # passages = [
- # Document(content=d.content, metadata=d.metadata)
- # for (d, _) in docs_and_scores
- # ]
+ id2_rank_semantic = {d.id(): i for i, (d, _) in enumerate(docs_and_scores)}
+ id2doc = {d.id(): d for d, _ in docs_and_scores}
+ # make sure we get unique docs
+ passages = [id2doc[id] for id, _ in id2_rank_semantic.items()]
 
+ id2_rank_bm25 = {}
  if self.config.use_bm25_search:
+ # TODO: Add score threshold in config
  docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
- passages += [d for (d, _) in docs_scores]
+ if self.config.cross_encoder_reranking_model == "":
+ # only if we're not re-ranking with a cross-encoder,
+ # we collect these ranks for Reciprocal Rank Fusion down below.
+ docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
+ id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
+ id2doc.update({d.id(): d for d, _ in docs_scores})
+ else:
+ passages += [d for (d, _) in docs_scores]
 
+ id2_rank_fuzzy = {}
  if self.config.use_fuzzy_match:
- fuzzy_match_docs = self.get_fuzzy_matches(query, retrieval_multiple)
- passages += fuzzy_match_docs
+ # TODO: Add score threshold in config
+ fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
+ if self.config.cross_encoder_reranking_model == "":
+ # only if we're not re-ranking with a cross-encoder,
+ # we collect these ranks for Reciprocal Rank Fusion down below.
+ fuzzy_match_doc_scores = sorted(
+ fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
+ )
+ id2_rank_fuzzy = {
+ d.id(): i for i, (d, _) in enumerate(fuzzy_match_doc_scores)
+ }
+ id2doc.update({d.id(): d for d, _ in fuzzy_match_doc_scores})
+ else:
+ passages += [d for (d, _) in fuzzy_match_doc_scores]
 
- # keep unique passages
- id2passage = {p.id(): p for p in passages}
- passages = list(id2passage.values())
+ if (
+ self.config.cross_encoder_reranking_model == ""
+ and self.config.use_reciprocal_rank_fusion
+ and (self.config.use_bm25_search or self.config.use_fuzzy_match)
+ ):
+ # Since we're not using cross-enocder re-ranking,
+ # we need to re-order the retrieved chunks from potentially three
+ # different retrieval methods (semantic, bm25, fuzzy), where the
+ # similarity scores are on different scales.
+ # We order the retrieved chunks using Reciprocal Rank Fusion (RRF) score.
+ # Combine the ranks from each id2doc_rank_* dict into a single dict,
+ # where the reciprocal rank score is the sum of
+ # 1/(rank + self.config.reciprocal_rank_fusion_constant).
+ # See https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
+ #
+ # Note: diversity/periphery-reranking below may modify the final ranking.
+ id2_reciprocal_score = {}
+ for id_ in (
+ set(id2_rank_semantic.keys())
+ | set(id2_rank_bm25.keys())
+ | set(id2_rank_fuzzy.keys())
+ ):
+ rank_semantic = id2_rank_semantic.get(id_, float("inf"))
+ rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
+ rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
+ c = self.config.reciprocal_rank_fusion_constant
+ reciprocal_fusion_score = (
+ 1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
+ )
+ id2_reciprocal_score[id_] = reciprocal_fusion_score
+
+ # sort the docs by the reciprocal score, in descending order
+ id2_reciprocal_score = OrderedDict(
+ sorted(
+ id2_reciprocal_score.items(),
+ key=lambda x: x[1],
+ reverse=True,
+ )
+ )
+ # each method retrieved up to retrieval_multiple * n_similar_docs,
+ # so we need to take the top n_similar_docs from the combined list
+ passages = [
+ id2doc[id]
+ for i, (id, _) in enumerate(id2_reciprocal_score.items())
+ if i < self.config.parsing.n_similar_docs
+ ]
+ # passages must have distinct ids
+ assert len(passages) == len(set([d.id() for d in passages])), (
+ f"Duplicate passages in retrieved docs: {len(passages)} != "
+ f"{len(set([d.id() for d in passages]))}"
+ )
 
  if len(passages) == 0:
  return []
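(Editor's note, not part of the diff: the fusion step above boils down to summing reciprocal ranks per document id across the retrievers that returned it. A condensed, runnable sketch with made-up rank dicts; the names below are illustrative, not langroid APIs.)

    # Standalone sketch of RRF fusion: rank dicts map doc-id -> 0-based rank;
    # a missing rank counts as infinity and contributes 0 to the score.
    def rrf_fuse(rank_dicts: list[dict[str, int]], c: float = 60.0) -> list[str]:
        all_ids = set().union(*[d.keys() for d in rank_dicts])
        scores = {
            id_: sum(1 / (d.get(id_, float("inf")) + c) for d in rank_dicts)
            for id_ in all_ids
        }
        # highest fused score first
        return sorted(scores, key=scores.get, reverse=True)

    semantic = {"doc1": 0, "doc2": 1, "doc3": 2}
    bm25 = {"doc1": 1, "doc3": 0}
    fuzzy = {"doc2": 2}
    print(rrf_fuse([semantic, bm25, fuzzy]))  # -> ['doc1', 'doc3', 'doc2']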
@@ -1166,7 +1253,7 @@ class DocChatAgent(ChatAgent):
  passages_scores = self.add_context_window(passages_scores)
  passages = [p for p, _ in passages_scores]
 
- return passages
+ return passages[: self.config.parsing.n_similar_docs]
 
  @no_type_check
  def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
  k: int,
  words_before: int | None = None,
  words_after: int | None = None,
- ) -> List[Document]:
+ ) -> List[Tuple[Document, float]]:
  """
  Find approximate matches of the query in the docs and return surrounding
  characters.
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
  Args:
  query (str): The search string.
  docs (List[Document]): List of Document objects to search through.
+ docs_clean (List[Document]): List of Document objects with cleaned content.
  k (int): Number of best matches to return.
  words_before (int|None): Number of words to include before each match.
  Default None => return max
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
  Default None => return max
 
  Returns:
- List[Document]: List of Documents containing the matches,
- including the given number of words around the match.
+ List[Tuple[Document,float]]: List of (Document, score) tuples.
  """
  if len(docs) == 0:
  return []
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
  scorer=fuzz.partial_ratio,
  )
 
- real_matches = [m for m, score in best_matches if score > 50]
+ real_matches = [(m, score) for m, score in best_matches if score > 50]
  # find the original docs that corresponding to the matches
  orig_doc_matches = []
- for i, m in enumerate(real_matches):
+ for i, (m, s) in enumerate(real_matches):
  for j, doc_clean in enumerate(docs_clean):
  if m in doc_clean.content:
- orig_doc_matches.append(docs[j])
+ orig_doc_matches.append((docs[j], s))
  break
  if words_after is None and words_before is None:
  return orig_doc_matches
  if len(orig_doc_matches) == 0:
  return []
- if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+ if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
  # If there are fields beyond just content and metadata,
  # we do NOT want to create new document objects with content fields
  # based on words_before and words_after, since we don't know how to
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
  return orig_doc_matches
 
  contextual_matches = []
- for match in orig_doc_matches:
+ for match, score in orig_doc_matches:
  choice_text = match.content
  contexts = []
  while choice_text != "":
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
  choice_text = " ".join(words[end_pos:])
  if len(contexts) > 0:
  contextual_matches.append(
- Document(
- content=" ... ".join(contexts),
- metadata=match.metadata,
+ (
+ Document(
+ content=" ... ".join(contexts),
+ metadata=match.metadata,
+ ),
+ score,
  )
  )
 
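(Editor's note, not part of the diff: callers of find_fuzzy_matches_in_docs now receive (Document, score) pairs instead of bare Documents. A hypothetical usage sketch; the sample data is invented and Document/DocMetaData are assumed importable from langroid.mytypes.)

    # Hypothetical usage sketch of the new (Document, score) return type.
    from langroid.mytypes import DocMetaData, Document
    from langroid.parsing.search import find_fuzzy_matches_in_docs

    docs = [
        Document(
            content="Reciprocal Rank Fusion combines rankings from several retrievers.",
            metadata=DocMetaData(source="notes"),
        )
    ]
    matches = find_fuzzy_matches_in_docs(
        query="rank fusion",
        docs=docs,
        docs_clean=docs,  # pre-cleaned copies; here just the originals
        k=3,
    )
    for doc, score in matches:
        print(score, doc.content[:40])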
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: langroid
- Version: 0.12.0
+ Version: 0.14.0
  Summary: Harness LLMs with Multi-Agent Programming
  License: MIT
  Author: Prasad Chalasani
@@ -153,6 +153,8 @@ This Multi-Agent paradigm is inspired by the
  `Langroid` is a fresh take on LLM app-development, where considerable thought has gone
  into simplifying the developer experience; it does not use `Langchain`.
 
+ :fire: Read the (WIP) [overview of the langroid architecture](https://langroid.github.io/langroid/blog/2024/08/15/overview-of-langroids-multi-agent-architecture-prelim/)
+
  📢 Companies are using/adapting Langroid in **production**. Here is a quote:
 
  >[Nullify](https://www.nullify.ai) uses AI Agents for secure software development.
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
  langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
  langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
- langroid/agent/special/doc_chat_agent.py,sha256=3EICtutRADu8S8v0qO8PGFu3VyqjDY6Gp8xYgNtiNSY,54596
+ langroid/agent/special/doc_chat_agent.py,sha256=r1uPunYf2lQcqYQ4fsD8Q5gB9cZyf7cn0KPcR_CLtrU,59065
  langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
  langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
  langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
  langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
  langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
- langroid/parsing/search.py,sha256=plQtjarB9afGfJLB0CyPXPq3mM4m7kRsfd0_4brziEI,8846
+ langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
  langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
  langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
  langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
  langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
  langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
- pyproject.toml,sha256=oocGdj8dqhrarP8c5LeFeOKboZ4WYNzs1YpcKszoJgM,7107
- langroid-0.12.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
- langroid-0.12.0.dist-info/METADATA,sha256=S-V-w4lhAay08FYPRyJcT7FliitUSaLkL9gQS25luSQ,55259
- langroid-0.12.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
- langroid-0.12.0.dist-info/RECORD,,
+ pyproject.toml,sha256=W5AMGnCoX4SvE5HYNJlJcernYJ-sbIVoVmfpVifMMm8,7107
+ langroid-0.14.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+ langroid-0.14.0.dist-info/METADATA,sha256=hEJyAJh8I1K9102zVxSya1pVgXxTUNkPXKo__JUtf54,55430
+ langroid-0.14.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ langroid-0.14.0.dist-info/RECORD,,
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "langroid"
- version = "0.12.0"
+ version = "0.14.0"
  description = "Harness LLMs with Multi-Agent Programming"
  authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
  readme = "README.md"