langroid 0.12.0__py3-none-any.whl → 0.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/special/doc_chat_agent.py +17 -12
- langroid/parsing/search.py +14 -11
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/METADATA +1 -1
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/RECORD +7 -7
- pyproject.toml +1 -1
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/LICENSE +0 -0
- {langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/WHEEL +0 -0
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -49,7 +49,6 @@ from langroid.parsing.search import (
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
 from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
-from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.constants import NO_ANSWER
@@ -137,7 +136,6 @@ class DocChatAgentConfig(ChatAgentConfig):
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     rerank_after_adding_context: bool = True  # rerank after adding context window?
-    embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
@@ -400,7 +398,11 @@ class DocChatAgent(ChatAgent):
         if split:
             docs = self.parser.split(docs)
         else:
-            self.
+            if self.config.n_neighbor_chunks > 0:
+                self.parser.add_window_ids(docs)
+            # we're not splitting, so we mark each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")
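In effect, documents ingested without splitting are now treated as first-class chunks. A minimal sketch of what this means for a caller, assuming the enclosing method in the hunk above is `ingest_docs` and that `Document`/`DocMetaData` come from `langroid.mytypes` (assumptions, not shown in this diff):

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
from langroid.mytypes import DocMetaData, Document

agent = DocChatAgent(DocChatAgentConfig())
docs = [
    Document(content="Already chunk-sized text.", metadata=DocMetaData(source="demo"))
]
# With split=False, 0.13.0 still assigns window ids (when
# config.n_neighbor_chunks > 0) and sets metadata.is_chunk = True on each
# doc, so un-split docs participate in neighbor-window retrieval just like
# parser-split chunks.
agent.ingest_docs(docs, split=False)
```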
|
@@ -422,10 +424,9 @@ class DocChatAgent(ChatAgent):
                 + d.content
             )
         docs = docs[: self.config.parsing.max_chunks]
-        #
-
-
-            self.vecdb.add_documents(batch)
+        # vecdb should take care of adding docs in batches;
+        # batching can be controlled via vecdb.config.batch_size
+        self.vecdb.add_documents(docs)
         self.original_docs_length = self.doc_length(docs)
         self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
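With the explicit batching loop gone (along with the `batched` import and the `embed_batch_size` config removed above), batch size becomes a vector-store concern. A sketch of where that knob now lives; the `batch_size` field is taken from the added comment in the hunk, and `QdrantDBConfig` is just one example store (both hedged assumptions):

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
from langroid.vector_store.qdrantdb import QdrantDBConfig

# Batching now lives on the vector-store config rather than on
# DocChatAgentConfig.embed_batch_size (removed in this release).
vecdb_cfg = QdrantDBConfig(collection_name="my-docs")
vecdb_cfg.batch_size = 500  # plays the role the removed embed_batch_size did

agent = DocChatAgent(DocChatAgentConfig(vecdb=vecdb_cfg))
# agent.ingest_docs(...) -> vecdb.add_documents(docs) batches internally
```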
@@ -894,7 +895,9 @@ class DocChatAgent(ChatAgent):
         )
         return docs_scores

-    def get_fuzzy_matches(
+    def get_fuzzy_matches(
+        self, query: str, multiple: int
+    ) -> List[Tuple[Document, float]]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
         with status("[cyan]Finding fuzzy matches in chunks..."):
@@ -909,8 +912,8 @@ class DocChatAgent(ChatAgent):
                 self.chunked_docs,
                 self.chunked_docs_clean,
                 k=self.config.parsing.n_similar_docs * multiple,
-                words_before=self.config.n_fuzzy_neighbor_words,
-                words_after=self.config.n_fuzzy_neighbor_words,
+                words_before=self.config.n_fuzzy_neighbor_words or None,
+                words_after=self.config.n_fuzzy_neighbor_words or None,
             )
         return fuzzy_match_docs
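The `or None` is the substance of this change: per the `find_fuzzy_matches_in_docs` docstring (see below), `None` means "return max context", while a literal `0` would request zero surrounding words. So a config value of 0 now yields maximal context instead of degenerate empty windows. In plain Python:

```python
# Zero-vs-None semantics behind the `or None` change:
n_fuzzy_neighbor_words = 0
words_before = n_fuzzy_neighbor_words or None  # -> None: "return max" context
n_fuzzy_neighbor_words = 50
words_before = n_fuzzy_neighbor_words or None  # -> 50: fifty words of context
```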
|
@@ -1127,12 +1130,14 @@ class DocChatAgent(ChatAgent):
         # ]

         if self.config.use_bm25_search:
+            # TODO: Add score threshold in config
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
             passages += [d for (d, _) in docs_scores]

         if self.config.use_fuzzy_match:
-
-
+            # TODO: Add score threshold in config
+            fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
+            passages += [d for (d, _) in fuzzy_match_doc_scores]

         # keep unique passages
         id2passage = {p.id(): p for p in passages}
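Both retrieval branches now carry `(Document, score)` pairs and simply drop the score when accumulating passages; thresholding is left open by the two TODOs. A hypothetical helper if one were to act on those TODOs today (`filter_by_score` and `min_score` are illustrative names, not existing langroid APIs):

```python
from typing import List, Tuple

from langroid.mytypes import Document

def filter_by_score(
    doc_scores: List[Tuple[Document, float]], min_score: float = 60.0
) -> List[Document]:
    # min_score is an illustrative threshold (partial_ratio scores run 0-100);
    # no such config field exists yet -- the TODOs above leave it open.
    return [d for (d, s) in doc_scores if s >= min_score]
```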
langroid/parsing/search.py
CHANGED
@@ -27,7 +27,7 @@ def find_fuzzy_matches_in_docs(
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
-) -> List[Document]:
+) -> List[Tuple[Document, float]]:
     """
     Find approximate matches of the query in the docs and return surrounding
     characters.
@@ -35,6 +35,7 @@ def find_fuzzy_matches_in_docs(
     Args:
         query (str): The search string.
         docs (List[Document]): List of Document objects to search through.
+        docs_clean (List[Document]): List of Document objects with cleaned content.
         k (int): Number of best matches to return.
         words_before (int|None): Number of words to include before each match.
             Default None => return max
@@ -42,8 +43,7 @@ def find_fuzzy_matches_in_docs(
             Default None => return max

     Returns:
-        List[Document]: List of
-          including the given number of words around the match.
+        List[Tuple[Document,float]]: List of (Document, score) tuples.
     """
     if len(docs) == 0:
         return []
@@ -54,19 +54,19 @@ def find_fuzzy_matches_in_docs(
         scorer=fuzz.partial_ratio,
     )

-    real_matches = [m for m, score in best_matches if score > 50]
+    real_matches = [(m, score) for m, score in best_matches if score > 50]
     # find the original docs that corresponding to the matches
     orig_doc_matches = []
-    for i, m in enumerate(real_matches):
+    for i, (m, s) in enumerate(real_matches):
         for j, doc_clean in enumerate(docs_clean):
             if m in doc_clean.content:
-                orig_doc_matches.append(docs[j])
+                orig_doc_matches.append((docs[j], s))
                 break
     if words_after is None and words_before is None:
         return orig_doc_matches
     if len(orig_doc_matches) == 0:
         return []
-    if set(orig_doc_matches[0].__fields__) != {"content", "metadata"}:
+    if set(orig_doc_matches[0][0].__fields__) != {"content", "metadata"}:
         # If there are fields beyond just content and metadata,
         # we do NOT want to create new document objects with content fields
         # based on words_before and words_after, since we don't know how to
@@ -74,7 +74,7 @@ def find_fuzzy_matches_in_docs(
         return orig_doc_matches

     contextual_matches = []
-    for match in orig_doc_matches:
+    for match, score in orig_doc_matches:
         choice_text = match.content
         contexts = []
         while choice_text != "":
|
@@ -89,9 +89,12 @@ def find_fuzzy_matches_in_docs(
|
|
89
89
|
choice_text = " ".join(words[end_pos:])
|
90
90
|
if len(contexts) > 0:
|
91
91
|
contextual_matches.append(
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
(
|
93
|
+
Document(
|
94
|
+
content=" ... ".join(contexts),
|
95
|
+
metadata=match.metadata,
|
96
|
+
),
|
97
|
+
score,
|
95
98
|
)
|
96
99
|
)
|
97
100
|
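End to end, `find_fuzzy_matches_in_docs` now returns scored tuples instead of bare Documents. A self-contained sketch of calling it under 0.13.0, assuming `Document`/`DocMetaData` are importable from `langroid.mytypes` (an assumption, not shown in this diff):

```python
from langroid.mytypes import DocMetaData, Document
from langroid.parsing.search import find_fuzzy_matches_in_docs

docs = [
    Document(
        content="The quick brown fox jumps over the lazy dog.",
        metadata=DocMetaData(source="demo"),
    )
]
matches = find_fuzzy_matches_in_docs(
    query="quick brwn fox",  # fuzzy: typos still match
    docs=docs,
    docs_clean=docs,  # normally a cleaned copy; identical here for brevity
    k=1,
    words_before=3,
    words_after=3,
)
for doc, score in matches:  # 0.13.0: (Document, score) tuples
    print(score, doc.content)
```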
|
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/RECORD
CHANGED
@@ -10,7 +10,7 @@ langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/openai_assistant.py,sha256=2rjCZw45ysNBEGNzQM4uf0bTC4KkatGYAWcVcW4xcek,34337
 langroid/agent/special/__init__.py,sha256=gik_Xtm_zV7U9s30Mn8UX3Gyuy4jTjQe9zjiE3HWmEo,1273
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=dqm0Gp11Mfl4hOWN4sUR1uZL-oHEmHzcB6bNN6WFgqw,54784
 langroid/agent/special/lance_doc_chat_agent.py,sha256=s8xoRs0gGaFtDYFUSIRchsgDVbS5Q3C2b2mr3V1Fd-Q,10419
 langroid/agent/special/lance_rag/__init__.py,sha256=QTbs0IVE2ZgDg8JJy1zN97rUUg4uEPH7SLGctFNumk4,174
 langroid/agent/special/lance_rag/critic_agent.py,sha256=OtFuHthKQLkdVkvuZ2m0GNq1qOYLqHkm1pfLRFnSg5c,9548
@@ -91,7 +91,7 @@ langroid/parsing/parse_json.py,sha256=sKrYv9-IUqRFaTJA24_rmfjN1E7dQSrTBrtd1jYDE1
 langroid/parsing/parser.py,sha256=AgtmlVUvrkSG1l7-YZPX8rlldgXjh_HqXAMqpXkBxUo,11746
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
-langroid/parsing/search.py,sha256=
+langroid/parsing/search.py,sha256=0i_r0ESb5HEQfagA2g7_uMQyxYPADWVbdcN9ixZhS4E,8992
 langroid/parsing/spider.py,sha256=Y6y7b86Y2k770LdhxgjVlImBxuuy1V9n8-XQ3QPaG5s,3199
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
 langroid/parsing/url_loader.py,sha256=Na2TBlKuQkloZzkE2d7xl6mh9olS3CbpgCsJbJ-xhIA,4472
@@ -137,8 +137,8 @@ langroid/vector_store/meilisearch.py,sha256=6frB7GFWeWmeKzRfLZIvzRjllniZ1cYj3Hmh
 langroid/vector_store/momento.py,sha256=qR-zBF1RKVHQZPZQYW_7g-XpTwr46p8HJuYPCkfJbM4,10534
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
 langroid/vector_store/qdrantdb.py,sha256=v88lqFkepADvlN6lByUj9I4NEKa9X9lWH16uTPPbYrE,17457
-pyproject.toml,sha256=
-langroid-0.
-langroid-0.
-langroid-0.
-langroid-0.
+pyproject.toml,sha256=g99bgxP-XUiTx-KsdFICVJuV2bB89areQkDRU5sIgmk,7107
+langroid-0.13.0.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.13.0.dist-info/METADATA,sha256=Znhge-Z8nn_L7Lxeh8dWs04d4ejZfj0NCCRutJJSkdg,55259
+langroid-0.13.0.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+langroid-0.13.0.dist-info/RECORD,,
pyproject.toml
CHANGED
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/LICENSE
File without changes
{langroid-0.12.0.dist-info → langroid-0.13.0.dist-info}/WHEEL
File without changes