langroid 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/special/doc_chat_agent.py CHANGED
@@ -49,7 +49,6 @@ from langroid.parsing.search import (
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
 from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
-from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.constants import NO_ANSWER
@@ -137,7 +136,6 @@ class DocChatAgentConfig(ChatAgentConfig):
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     rerank_after_adding_context: bool = True  # rerank after adding context window?
-    embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
@@ -400,7 +398,11 @@ class DocChatAgent(ChatAgent):
         if split:
             docs = self.parser.split(docs)
         else:
-            self.parser.add_window_ids(docs)
+            if self.config.n_neighbor_chunks > 0:
+                self.parser.add_window_ids(docs)
+            # we're not splitting, so we mark each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")

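A hedged usage sketch of the new ingestion behavior: it assumes this hunk sits inside DocChatAgent.ingest_docs (the method name is not visible in the hunk) and uses langroid's Document/DocMetaData types; everything not shown in the diff is an assumption.

    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
    from langroid.mytypes import DocMetaData, Document

    cfg = DocChatAgentConfig(n_neighbor_chunks=0)  # 0 => skip add_window_ids()
    agent = DocChatAgent(cfg)
    docs = [
        Document(content="already-chunked text", metadata=DocMetaData(source="notes")),
    ]
    agent.ingest_docs(docs, split=False)
    # with split=False, each doc now gets metadata.is_chunk = True,
    # so retrieval treats it as a chunk rather than as a parent document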
@@ -422,10 +424,9 @@ class DocChatAgent(ChatAgent):
                     + d.content
                 )
         docs = docs[: self.config.parsing.max_chunks]
-        # add embeddings in batches, to stay under limit of embeddings API
-        batches = list(batched(docs, self.config.embed_batch_size))
-        for batch in batches:
-            self.vecdb.add_documents(batch)
+        # vecdb should take care of adding docs in batches;
+        # batching can be controlled via vecdb.config.batch_size
+        self.vecdb.add_documents(docs)
         self.original_docs_length = self.doc_length(docs)
         self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
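Batching responsibility thus moves out of the agent and into the vector store. A minimal sketch of what that internal batching might look like, assuming a store with a batch_size config knob; the VectorStore class below is a stand-in, not langroid's actual implementation:

    from typing import Iterator, List, Sequence, TypeVar

    T = TypeVar("T")

    def batched(seq: Sequence[T], n: int) -> Iterator[List[T]]:
        """Yield successive n-sized batches from seq."""
        for i in range(0, len(seq), n):
            yield list(seq[i : i + n])

    class VectorStore:
        def __init__(self, batch_size: int = 500):
            self.batch_size = batch_size  # mirrors the vecdb.config.batch_size knob above

        def add_documents(self, docs: Sequence[T]) -> None:
            # embed/upsert one batch at a time, to stay under embeddings-API limits
            for batch in batched(docs, self.batch_size):
                self._embed_and_upsert(batch)

        def _embed_and_upsert(self, batch: List[T]) -> None:
            ...  # backend-specific embedding + upsert

This keeps the embeddings-API limit concern in one place: callers pass the full document list and tune throughput via a single config value instead of batching at each call site.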
@@ -894,7 +895,9 @@ class DocChatAgent(ChatAgent):
         )
         return docs_scores

-    def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
+    def get_fuzzy_matches(
+        self, query: str, multiple: int
+    ) -> List[Tuple[Document, float]]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
         with status("[cyan]Finding fuzzy matches in chunks..."):
@@ -909,8 +912,8 @@ class DocChatAgent(ChatAgent):
             self.chunked_docs,
             self.chunked_docs_clean,
             k=self.config.parsing.n_similar_docs * multiple,
-            words_before=self.config.n_fuzzy_neighbor_words,
-            words_after=self.config.n_fuzzy_neighbor_words,
+            words_before=self.config.n_fuzzy_neighbor_words or None,
+            words_after=self.config.n_fuzzy_neighbor_words or None,
         )
         return fuzzy_match_docs

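The `or None` coercion above maps a falsy config value (0) to None; per the docstring in the search.py hunks below, None tells the helper to return the maximum surrounding context rather than a zero-word window. A quick illustration:

    n_fuzzy_neighbor_words = 0
    words_before = n_fuzzy_neighbor_words or None  # -> None: no truncation
    n_fuzzy_neighbor_words = 100
    words_before = n_fuzzy_neighbor_words or None  # -> 100: keep 100 words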
@@ -1127,12 +1130,14 @@ class DocChatAgent(ChatAgent):
         # ]

         if self.config.use_bm25_search:
+            # TODO: Add score threshold in config
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
             passages += [d for (d, _) in docs_scores]

         if self.config.use_fuzzy_match:
-            fuzzy_match_docs = self.get_fuzzy_matches(query, retrieval_multiple)
-            passages += fuzzy_match_docs
+            # TODO: Add score threshold in config
+            fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
+            passages += [d for (d, _) in fuzzy_match_doc_scores]

         # keep unique passages
         id2passage = {p.id(): p for p in passages}
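Both retrieval branches now yield (Document, score) pairs; the scores are discarded for now (see the TODOs about a score threshold) and passages are deduplicated by id. A self-contained sketch of that merge-and-dedupe pattern, using stand-in types rather than langroid's classes:

    from dataclasses import dataclass

    @dataclass
    class Doc:
        doc_id: str
        content: str

        def id(self) -> str:
            return self.doc_id

    bm25_scores = [(Doc("a", "alpha"), 2.1), (Doc("b", "beta"), 1.4)]
    fuzzy_scores = [(Doc("b", "beta"), 88.0), (Doc("c", "gamma"), 61.0)]

    passages = [d for (d, _) in bm25_scores] + [d for (d, _) in fuzzy_scores]
    # keep unique passages: dict keyed by id, later duplicates overwrite earlier
    id2passage = {p.id(): p for p in passages}
    passages = list(id2passage.values())  # ids a, b, c (each exactly once)

Note that the two branches score on different scales (BM25 vs. fuzz.partial_ratio's 0-100 range), which is presumably why any future score threshold would need per-method config.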
langroid/parsing/search.py CHANGED
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
-) -> List[Document]:
+) -> List[Tuple[Document, float]]:
     """
     Find approximate matches of the query in the docs and return surrounding
     characters.
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
     Args:
         query (str): The search string.
         docs (List[Document]): List of Document objects to search through.
+        docs_clean (List[Document]): List of Document objects with cleaned content.
         k (int): Number of best matches to return.
         words_before (int|None): Number of words to include before each match.
             Default None => return max
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
             Default None => return max

     Returns:
-        List[Document]: List of Documents containing the matches,
-            including the given number of words around the match.
+        List[Tuple[Document,float]]: List of (Document, score) tuples.
     """
     if len(docs) == 0:
         return []
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
         scorer=fuzz.partial_ratio,
     )

-    real_matches = [m for m, score in best_matches if score > 50]
+    real_matches = [(m, score) for m, score in best_matches if score > 50]
     # find the original docs corresponding to the matches
     orig_doc_matches = []
-    for i, m in enumerate(real_matches):
+    for i, (m, s) in enumerate(real_matches):
         for j, doc_clean in enumerate(docs_clean):
             if m in doc_clean.content:
-                orig_doc_matches.append(docs[j])
+                orig_doc_matches.append((docs[j], s))
                 break
     if words_after is None and words_before is None:
         return orig_doc_matches
     if len(orig_doc_matches) == 0:
         return []
-    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
         # If there are fields beyond just content and metadata,
         # we do NOT want to create new document objects with content fields
         # based on words_before and words_after, since we don't know how to
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
         return orig_doc_matches

     contextual_matches = []
-    for match in orig_doc_matches:
+    for match, score in orig_doc_matches:
         choice_text = match.content
         contexts = []
         while choice_text != "":
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
             choice_text = " ".join(words[end_pos:])
         if len(contexts) > 0:
             contextual_matches.append(
-                Document(
-                    content=" ... ".join(contexts),
-                    metadata=match.metadata,
+                (
+                    Document(
+                        content=" ... ".join(contexts),
+                        metadata=match.metadata,
+                    ),
+                    score,
                 )
             )

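Taken together, the search.py changes make fuzzy-match scores flow through to callers. A hedged usage sketch, assuming the parameter names shown in the docstring hunks above and langroid's Document/DocMetaData types:

    from langroid.mytypes import DocMetaData, Document
    from langroid.parsing.search import find_fuzzy_matches_in_docs

    docs = [Document(content="the quick brown fox jumps", metadata=DocMetaData(source="x"))]
    matches = find_fuzzy_matches_in_docs(
        query="quick brwon fox",  # misspelling still matches via fuzz.partial_ratio
        docs=docs,
        docs_clean=docs,  # in real usage, pre-cleaned copies of docs
        k=3,
        words_before=None,  # None => maximum surrounding context
        words_after=None,
    )
    for doc, score in matches:  # scores are partial_ratio values above the 50 cutoff
        print(f"{score:.0f}: {doc.content}")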
langroid-0.12.0.dist-info/METADATA → langroid-0.13.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.12.0
+Version: 0.13.0
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
langroid-0.12.0.dist-info/RECORD → langroid-0.13.0.dist-info/RECORD RENAMED
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=3EICtutRADu8S8v0qO8PGFu3VyqjDY6Gp8xYgNtiNSY,54596
+langroid/agent/special/doc_chat_agent.py,sha256=dqm0Gp11Mfl4hOWN4sUR1uZL-oHEmHzcB6bNN6WFgqw,54784
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
 langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
 langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
-langroid/parsing/search.py,sha256=plQtjarB9afGfJLB0CyPXPq3mM4m7kRsfd0_4brziEI,8846
+langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
 langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
 langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
-pyproject.toml,sha256=oocGdj8dqhrarP8c5LeFeOKboZ4WYNzs1YpcKszoJgM,7107
-langroid-0.12.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.12.0.dist-info/METADATA,sha256=S-V-w4lhAay08FYPRyJcT7FliitUSaLkL9gQS25luSQ,55259
-langroid-0.12.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-langroid-0.12.0.dist-info/RECORD,,
+pyproject.toml,sha256=g99bgxP-XUiTx-KsdFICVJuV2bB89areQkDRU5sIgmk,7107
+langroid-0.13.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.13.0.dist-info/METADATA,sha256=Znhge-Z8nn_L7Lxeh8dWs04d4ejZfj0NCCRutJJSkdg,55259
+langroid-0.13.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.13.0.dist-info/RECORD,,
pyproject.toml CHANGED
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langroid"
-version = "0.12.0"
+version = "0.13.0"
 description = "Harness LLMs with Multi-Agent Programming"
 authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
 readme = "README.md"