langroid 0.1.101__py3-none-any.whl → 0.1.102__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/batch.py +2 -2
- langroid/agent/special/doc_chat_agent.py +48 -3
- langroid/agent/special/retriever_agent.py +1 -1
- langroid/mytypes.py +10 -4
- langroid/parsing/document_parser.py +1 -0
- langroid/parsing/parser.py +62 -31
- langroid/parsing/search.py +54 -49
- langroid/parsing/utils.py +26 -0
- langroid/utils/algorithms/graph.py +49 -0
- langroid/utils/configuration.py +13 -0
- langroid/utils/pydantic_utils.py +3 -1
- langroid/vector_store/base.py +157 -1
- langroid/vector_store/chromadb.py +12 -19
- langroid/vector_store/meilisearch.py +1 -0
- langroid/vector_store/momento.py +1 -0
- langroid/vector_store/qdrantdb.py +10 -4
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/METADATA +1 -1
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/RECORD +20 -19
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/LICENSE +0 -0
- {langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/WHEEL +0 -0
langroid/agent/batch.py
CHANGED
@@ -9,7 +9,7 @@ from rich.console import Console
 from langroid.agent.base import Agent
 from langroid.agent.chat_document import ChatDocument
 from langroid.agent.task import Task
-from langroid.utils.configuration import
+from langroid.utils.configuration import quiet_mode, settings
 from langroid.utils.logging import setup_colored_logging

 console = Console(quiet=settings.quiet)
@@ -53,7 +53,7 @@ def run_batch_tasks(
         return output_map(result)

     async def _do_all() -> List[Any]:
-        with
+        with quiet_mode():
             return await asyncio.gather(  # type: ignore
                 *(_do_task(input, i) for i, input in enumerate(inputs))
             )
langroid/agent/special/doc_chat_agent.py
CHANGED
@@ -66,6 +66,10 @@ You are a helpful assistant, helping me understand a collection of documents.
 """


+class DocChunkMetqdata(DocMetaData):
+    id: str
+
+
 class DocChatAgentConfig(ChatAgentConfig):
     """
     Attributes:
@@ -95,6 +99,7 @@ class DocChatAgentConfig(ChatAgentConfig):
     # It is False by default; its benefits depends on the context.
     hypothetical_answer: bool = False
     n_query_rephrases: int = 0
+    n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
     cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
@@ -122,6 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         min_chunk_chars=200,
         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
         n_similar_docs=3,
+        n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
         pdf=PdfParsingConfig(
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
@@ -195,6 +201,7 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
         self.chunked_docs = self.vecdb.get_all_documents()
+        # used for lexical similarity e.g. keyword search (bm25 etc)
         self.chunked_docs_clean = [
             Document(content=preprocess_text(d.content), metadata=d.metadata)
             for d in self.chunked_docs
@@ -509,9 +516,13 @@ class DocChatAgent(ChatAgent):
         if self.chunked_docs is None:
             logger.warning("No chunked docs; cannot use fuzzy matching")
             return []
+        if self.chunked_docs_clean is None:
+            logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
+            return []
         fuzzy_match_docs = find_fuzzy_matches_in_docs(
             query,
             self.chunked_docs,
+            self.chunked_docs_clean,
             k=self.config.parsing.n_similar_docs * multiple,
             words_before=1000,
             words_after=1000,
@@ -546,6 +557,36 @@ class DocChatAgent(ChatAgent):
         ]
         return passages

+    def add_context_window(
+        self,
+        docs_scores: List[Tuple[Document, float]],
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a window_ids field indicating
+        the ids of the chunks around the current chunk.
+        These window_ids may overlap, so we
+        - gather connected-components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata,
+
+        We may have stored a longer set of window_ids than we need.
+        We just want `neighbors` on each side of the center of window_ids.
+
+        Args:
+            docs (List[Document]): List of documents to add context window to.
+            scores (List[float]): List of match scores for each document.
+            neighbors (int, optional): Number of neighbors on "each side" of match to
+                retrieve. Defaults to 0.
+                "Each side" here means before and after the match,
+                in the original text.
+
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        if self.vecdb is None or self.config.n_neighbor_chunks == 0:
+            return docs_scores
+        return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
@@ -560,10 +601,11 @@ class DocChatAgent(ChatAgent):
         dynamically retrieved based on a window around a lexical match.

         These are the steps (some optional based on config):
-        - vector-embedding distance, from vecdb
-        - bm25-ranking (keyword similarity)
+        - semantic search based on vector-embedding distance, from vecdb
+        - lexical search using bm25-ranking (keyword similarity)
         - fuzzy matching (keyword similarity)
-        - re-ranking of doc-chunks using cross-encoder,
+        - re-ranking of doc-chunks by relevance to query, using cross-encoder,
+          and pick top k

         Args:
             query: original query (assumed to be in stand-alone form)
@@ -612,6 +654,9 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return []

+        passages_scores = [(p, 0.0) for p in passages]
+        passages_scores = self.add_context_window(passages_scores)
+        passages = [p for p, _ in passages_scores]
         # now passages can potentially have a lot of doc chunks,
         # so we re-rank them using a cross-encoder scoring model
         # https://www.sbert.net/examples/applications/retrieve_rerank
langroid/mytypes.py
CHANGED
@@ -26,6 +26,8 @@ class DocMetaData(BaseModel):

     source: str = "context"
     is_chunk: bool = False  # if it is a chunk, don't split
+    id: str | None = None  # unique id for the document
+    window_ids: List[str] = []  # for RAG: ids of chunks around this one

     def dict(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
         """
@@ -51,9 +53,10 @@ class Document(BaseModel):
     content: str
     metadata: DocMetaData

-
+    @staticmethod
+    def hash_id(doc: str) -> str:
         # Encode the document as UTF-8
-        doc_utf8 = str(
+        doc_utf8 = str(doc).encode("utf-8")

         # Create a SHA256 hash object
         sha256_hash = hashlib.sha256()
@@ -69,8 +72,11 @@

         return str(hash_uuid)

-    def
-
+    def _unique_hash_id(self) -> str:
+        return self.hash_id(str(self))
+
+    def id(self) -> str:
+        if hasattr(self.metadata, "id") and self.metadata.id is not None:
             return self.metadata.id
         else:
             return self._unique_hash_id()
langroid/parsing/parser.py
CHANGED
@@ -1,6 +1,5 @@
 import logging
 from enum import Enum
-from functools import reduce
 from typing import List

 import tiktoken
@@ -36,6 +35,7 @@ class ParsingConfig(BaseSettings):
     min_chunk_chars: int = 350
     discard_chunk_chars: int = 5  # discard chunks with fewer than this many chars
     n_similar_docs: int = 4
+    n_neighbor_ids: int = 0  # window size to store around each chunk
     separators: List[str] = ["\n\n", "\n", " ", ""]
     token_encoding_model: str = "text-embedding-ada-002"
     pdf: PdfParsingConfig = PdfParsingConfig()
@@ -51,17 +51,42 @@ class Parser:
         tokens = self.tokenizer.encode(text)
         return len(tokens)

+    def add_window_ids(self, chunks: List[Document]) -> None:
+        """Chunks are consecutive parts of a single original document.
+        Add window_ids in metadata"""
+
+        # The original metadata.id (if any) is ignored since it will be same for all
+        # chunks and is useless. We want a distinct id for each chunk.
+        ids = [Document.hash_id(str(c)) for c in chunks]
+
+        k = self.config.n_neighbor_ids
+        n = len(ids)
+        window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
+        for i, c in enumerate(chunks):
+            if c.content.strip() == "":
+                continue
+            c.metadata.window_ids = window_ids[i]
+            c.metadata.id = ids[i]
+            c.metadata.is_chunk = True
+
     def split_simple(self, docs: List[Document]) -> List[Document]:
         if len(self.config.separators) == 0:
             raise ValueError("Must have at least one separator")
-
-
-
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = remove_extra_whitespace(d.content).split(self.config.separators[0])
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
+            ]
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def split_para_sentence(self, docs: List[Document]) -> List[Document]:
         final_chunks = []
@@ -95,28 +120,37 @@
         return final_chunks + chunks

     def _split_para_sentence_once(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_chunks = []
+        for d in docs:
+            if d.content.strip() == "":
+                continue
+            chunks = create_chunks(d.content, self.config.chunk_size, self.num_tokens)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
                 )
-
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-
+            self.add_window_ids(chunk_docs)
+            final_chunks += chunk_docs
+
+        return final_chunks

     def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
-
-
-
-
-
+        final_docs = []
+        for d in docs:
+            chunks = self.chunk_tokens(d.content)
+            chunk_docs = [
+                Document(
+                    content=c, metadata=d.metadata.copy(update=dict(is_chunk=True))
+                )
+                for c in chunks
+                if c.strip() != ""
             ]
-
-
-            return
+            self.add_window_ids(chunk_docs)
+            final_docs += chunk_docs
+        return final_docs

     def chunk_tokens(
         self,
@@ -198,11 +232,8 @@
             # Increment the number of chunks
             num_chunks += 1

-        #
-        remaining_text = self.tokenizer.decode(tokens).replace("\n", " ").strip()
-        if len(remaining_text) > self.config.discard_chunk_chars:
-            chunks.append(remaining_text)
+        # There may be remaining tokens, but we discard them
+        # since we have already reached the maximum number of chunks

         return chunks

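Illustration (plain Python, not from the package) of the windows `add_window_ids` stores: with `n_neighbor_ids = k`, each chunk keeps the ids of up to k neighbors on each side of itself.

    ids = ["id0", "id1", "id2", "id3"]
    k = 1
    n = len(ids)
    window_ids = [ids[max(0, i - k) : min(n, i + k + 1)] for i in range(n)]
    # [['id0', 'id1'], ['id0', 'id1', 'id2'], ['id1', 'id2', 'id3'], ['id2', 'id3']]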
langroid/parsing/search.py
CHANGED
@@ -7,7 +7,6 @@ See tests for examples: tests/main/test_string_search.py
 """

 import difflib
-import re
 from typing import List, Tuple

 from nltk.corpus import stopwords
@@ -24,6 +23,7 @@ from .utils import download_nltk_resource
 def find_fuzzy_matches_in_docs(
     query: str,
     docs: List[Document],
+    docs_clean: List[Document],
     k: int,
     words_before: int | None = None,
     words_after: int | None = None,
@@ -49,45 +49,45 @@ def find_fuzzy_matches_in_docs(
         return []
     best_matches = process.extract(
         query,
-        [d.content for d in
+        [d.content for d in docs_clean],
         limit=k,
         scorer=fuzz.partial_ratio,
     )

     real_matches = [m for m, score in best_matches if score > 50]
-
-
-    for
-
-
-
-        words_in_text = doc.content.split()
-        first_word_idx = next(
-            (
-                i
-                for i, word in enumerate(words_in_text)
-                if word.startswith(words[0])
-            ),
-            -1,
-        )
-        if words_before is None:
-            words_before = len(words_in_text)
-        if words_after is None:
-            words_after = len(words_in_text)
-        if first_word_idx != -1:
-            start_idx = max(0, first_word_idx - words_before)
-            end_idx = min(
-                len(words_in_text),
-                first_word_idx + len(words) + words_after,
-            )
-            doc_match = Document(
-                content=" ".join(words_in_text[start_idx:end_idx]),
-                metadata=doc.metadata,
-            )
-            results.append(doc_match)
+    # find the original docs that corresponding to the matches
+    orig_doc_matches = []
+    for i, m in enumerate(real_matches):
+        for j, doc_clean in enumerate(docs_clean):
+            if m in doc_clean.content:
+                orig_doc_matches.append(docs[j])
                 break
+    if words_after is None and words_before is None:
+        return orig_doc_matches
+
+    contextual_matches = []
+    for match in orig_doc_matches:
+        choice_text = match.content
+        contexts = []
+        while choice_text != "":
+            context, start_pos, end_pos = get_context(
+                query, choice_text, words_before, words_after
+            )
+            if context == "" or end_pos == 0:
+                break
+            contexts.append(context)
+            words = choice_text.split()
+            end_pos = min(end_pos, len(words))
+            choice_text = " ".join(words[end_pos:])
+        if len(contexts) > 0:
+            contextual_matches.append(
+                Document(
+                    content=" ... ".join(contexts),
+                    metadata=match.metadata,
+                )
+            )

-    return
+    return contextual_matches


 def preprocess_text(text: str) -> str:
@@ -171,7 +171,7 @@ def get_context(
     text: str,
     words_before: int | None = 100,
     words_after: int | None = 100,
-) -> str:
+) -> Tuple[str, int, int]:
     """
     Returns a portion of text containing the best approximate match of the query,
     including b words before and a words after the match.
@@ -185,7 +185,9 @@
     Returns:
         str: A string containing b words before, the match, and a words after
             the best approximate match position of the query in the text. If no
-            match is found, returns
+            match is found, returns empty string.
+        int: The start position of the match in the text.
+        int: The end position of the match in the text.

     Example:
     >>> get_context("apple", "The quick brown fox jumps over the apple.", 3, 2)
@@ -193,26 +195,29 @@
     """
     if words_after is None and words_before is None:
         # return entire text since we're not asked to return a bounded context
-        return text
+        return text, 0, 0
+
+    # make sure there is a good enough fu
+    if fuzz.partial_ratio(query, text) < 70:
+        return "", 0, 0

     sequence_matcher = difflib.SequenceMatcher(None, text, query)
     match = sequence_matcher.find_longest_match(0, len(text), 0, len(query))

     if match.size == 0:
-        return "
-
-
-
-
-
-
-
-
-
-
-        )
+        return "", 0, 0
+
+    segments = text.split()
+    n_segs = len(segments)
+
+    start_segment_pos = len(text[: match.a].split())
+
+    words_before = words_before or n_segs
+    words_after = words_after or n_segs
+    start_pos = max(0, start_segment_pos - words_before)
+    end_pos = min(len(segments), start_segment_pos + words_after + len(query.split()))

-    return " ".join(
+    return " ".join(segments[start_pos:end_pos]), start_pos, end_pos


 def eliminate_near_duplicates(passages: List[str], threshold: float = 0.8) -> List[str]:
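Usage sketch of the new 3-tuple return of `get_context` (assuming the version shown above): the context string plus the start/end word positions of the returned slice.

    from langroid.parsing.search import get_context

    context, start, end = get_context(
        "apple",
        "The quick brown fox jumps over the apple.",
        words_before=3,
        words_after=2,
    )
    # context is roughly "jumps over the apple." with (start, end) bounding that
    # word range; if no good-enough fuzzy match is found, ("", 0, 0) is returned.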
langroid/parsing/utils.py
CHANGED
@@ -165,6 +165,32 @@ def parse_number_range_list(specs: str) -> List[int]:
     return sorted(list(spec_indices))


+def strip_k(s: str, k: int = 2) -> str:
+    """
+    Strip any leading and trailing whitespaces from the input text beyond length k.
+    This is useful for removing leading/trailing whitespaces from a text while
+    preserving paragraph structure.
+
+    Args:
+        s (str): The input text.
+        k (int): The number of leading and trailing whitespaces to retain.
+
+    Returns:
+        str: The text with leading and trailing whitespaces removed beyond length k.
+    """
+
+    # Count leading and trailing whitespaces
+    leading_count = len(s) - len(s.lstrip())
+    trailing_count = len(s) - len(s.rstrip())
+
+    # Determine how many whitespaces to retain
+    leading_keep = min(leading_count, k)
+    trailing_keep = min(trailing_count, k)
+
+    # Use slicing to get the desired output
+    return s[leading_count - leading_keep : len(s) - (trailing_count - trailing_keep)]
+
+
 def clean_whitespace(text: str) -> str:
     """Remove extra whitespace from the input text, while preserving
     paragraph structure.
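Usage sketch for `strip_k` (assuming the definition above): at most `k` whitespace characters are kept on each end.

    from langroid.parsing.utils import strip_k

    s = "\n\n\n  hello  \n\n\n"           # 5 leading and 5 trailing whitespace chars
    assert strip_k(s, k=2) == "  hello  "  # only 2 kept on each side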
langroid/utils/algorithms/graph.py
ADDED
@@ -0,0 +1,49 @@
+"""
+Graph algos.
+"""
+
+from typing import List, no_type_check
+
+import numpy as np
+
+
+@no_type_check
+def topological_sort(order: np.array) -> List[int]:
+    """
+    Given a directed adjacency matrix, return a topological sort of the nodes.
+    order[i,j] = -1 means there is an edge from i to j.
+    order[i,j] = 0 means there is no edge from i to j.
+    order[i,j] = 1 means there is an edge from j to i.
+
+    Args:
+        order (np.array): The adjacency matrix.
+
+    Returns:
+        List[int]: The topological sort of the nodes.
+
+    """
+    n = order.shape[0]
+
+    # Calculate the in-degrees
+    in_degree = [0] * n
+    for i in range(n):
+        for j in range(n):
+            if order[i, j] == -1:
+                in_degree[j] += 1
+
+    # Initialize the queue with nodes of in-degree 0
+    queue = [i for i in range(n) if in_degree[i] == 0]
+    result = []
+
+    while queue:
+        node = queue.pop(0)
+        result.append(node)
+
+        for i in range(n):
+            if order[node, i] == -1:
+                in_degree[i] -= 1
+                if in_degree[i] == 0:
+                    queue.append(i)
+
+    assert len(result) == n, "Cycle detected"
+    return result
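Usage sketch for the new `topological_sort` helper, with a hypothetical 3-node chain encoded in the convention documented above (order[i, j] = -1 means an edge from i to j):

    import numpy as np

    from langroid.utils.algorithms.graph import topological_sort

    order = np.zeros((3, 3), dtype=np.int8)
    order[0, 1], order[1, 0] = -1, 1  # edge 0 -> 1
    order[1, 2], order[2, 1] = -1, 1  # edge 1 -> 2
    print(topological_sort(order))    # [0, 1, 2]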
langroid/utils/configuration.py
CHANGED
@@ -71,6 +71,19 @@ def temporary_settings(temp_settings: Settings) -> Iterator[None]:
     settings.__dict__.update(original_settings.__dict__)


+@contextmanager
+def quiet_mode() -> Iterator[None]:
+    """Temporarily set quiet=True in global settings and restore afterward."""
+    original_quiet = settings.quiet
+
+    set_global(Settings(quiet=True))
+
+    try:
+        yield
+    finally:
+        settings.quiet = original_quiet
+
+
 def set_env(settings: BaseSettings) -> None:
     """
     Set environment variables from a BaseSettings instance
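Usage sketch for `quiet_mode` (this is how the batch.py change above silences per-task output): global quiet is forced on inside the block and the previous value is restored on exit.

    from langroid.utils.configuration import quiet_mode

    with quiet_mode():
        ...  # noisy agent/task calls go here; console output is suppressed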
langroid/utils/pydantic_utils.py
CHANGED
@@ -79,7 +79,9 @@ def flatten_pydantic_model(
         current_model, current_prefix = models_to_process.pop()

         for name, field in current_model.__fields__.items():
-            if
+            if isinstance(field.outer_type_, type) and issubclass(
+                field.outer_type_, BaseModel
+            ):
                 new_prefix = (
                     f"{current_prefix}{name}__" if current_prefix else f"{name}__"
                 )
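Sketch of why the added `isinstance` guard matters: `issubclass()` raises `TypeError` for non-class field types such as `List[str]`, so the code must first confirm the outer type is a real class before testing it against `BaseModel`.

    from typing import List

    outer = List[str]
    print(isinstance(outer, type))  # False -> the BaseModel check is safely skipped
    # issubclass(outer, BaseModel)  # would raise TypeError without the guard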
langroid/vector_store/base.py
CHANGED
@@ -1,12 +1,16 @@
+import copy
 import logging
 from abc import ABC, abstractmethod
-from
+from math import ceil
+from typing import Dict, List, Optional, Sequence, Tuple

+import numpy as np
 from pydantic import BaseSettings

 from langroid.embedding_models.base import EmbeddingModelsConfig
 from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.mytypes import Document
+from langroid.utils.algorithms.graph import topological_sort
 from langroid.utils.configuration import settings
 from langroid.utils.output.printing import print_long_text

@@ -130,8 +134,160 @@ class VectorStore(ABC):
         k: int = 1,
         where: Optional[str] = None,
     ) -> List[Tuple[Document, float]]:
+        """
+        Find k most similar texts to the given text, in terms of vector distance metric
+        (e.g., cosine similarity).
+
+        Args:
+            text (str): The text to find similar texts for.
+            k (int, optional): Number of similar texts to retrieve. Defaults to 1.
+            where (Optional[str], optional): Where clause to filter the search.
+
+        Returns:
+            List[Tuple[Document,float]]: List of (Document, score) tuples.
+
+        """
         pass

+    def add_context_window(
+        self, docs_scores: List[Tuple[Document, float]], neighbors: int = 0
+    ) -> List[Tuple[Document, float]]:
+        """
+        In each doc's metadata, there may be a window_ids field indicating
+        the ids of the chunks around the current chunk.
+        These window_ids may overlap, so we
+        - gather connected-components of overlapping windows,
+        - split each component into roughly equal parts,
+        - create a new document for each part, preserving metadata,
+
+        We may have stored a longer set of window_ids than we need.
+        We just want `neighbors` on each side of the center of window_ids.
+
+        Args:
+            docs (List[Document]): List of documents to add context window to.
+            scores (List[float]): List of match scores for each document.
+            neighbors (int, optional): Number of neighbors on "each side" of match to
+                retrieve. Defaults to 0.
+                "Each side" here means before and after the match,
+                in the original text.
+
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        # We return a larger context around each match, i.e.
+        # a window of `neighbors` on each side of the match.
+        docs = [d for d, s in docs_scores]
+        scores = [s for d, s in docs_scores]
+        if neighbors == 0:
+            return docs_scores
+        doc_chunks = [d for d in docs if d.metadata.is_chunk]
+        if len(doc_chunks) == 0:
+            return docs_scores
+        window_ids_list = []
+        id2metadata = {}
+        # id -> highest score of a doc it appears in
+        id2max_score: Dict[int | str, float] = {}
+        for i, d in enumerate(docs):
+            window_ids = d.metadata.window_ids
+            id2metadata.update({id: d.metadata for id in window_ids})
+
+            id2max_score.update(
+                {id: max(id2max_score.get(id, 0), scores[i]) for id in window_ids}
+            )
+            n = len(window_ids)
+            chunk_idx = window_ids.index(d.id())
+            neighbor_ids = window_ids[
+                max(0, chunk_idx - neighbors) : min(n, chunk_idx + neighbors + 1)
+            ]
+            window_ids_list += [neighbor_ids]
+
+        # window_ids could be from different docs,
+        # and they may overlap, so we first remove overlaps
+        window_ids_list = self.remove_overlaps(window_ids_list)
+        final_docs = []
+        final_scores = []
+        for w in window_ids_list:
+            metadata = copy.deepcopy(id2metadata[w[0]])
+            metadata.window_ids = w
+            document = Document(
+                content=" ".join([d.content for d in self.get_documents_by_ids(w)]),
+                metadata=metadata,
+            )
+            # make a fresh id since content is in general different
+            document.metadata.id = document.hash_id(document.content)
+            final_docs += [document]
+            final_scores += [max(id2max_score[id] for id in w)]
+        return list(zip(final_docs, final_scores))
+
+    @staticmethod
+    def remove_overlaps(windows: List[List[str]]) -> List[List[str]]:
+        """
+        Given a collection of windows, where each window is a sequence of ids,
+        identify groups of overlapping windows, and for each overlapping k-group,
+        split the ids into k roughly equal sequences.
+
+        Args:
+            windows (List[int|str]): List of windows, where each window is a
+                sequence of ids.
+
+        Returns:
+            List[int|str]: List of windows, where each window is a sequence of ids,
+                and no two windows overlap.
+        """
+        ids = set(id for w in windows for id in w)
+        # id -> {win -> # pos}
+        id2win2pos: Dict[str, Dict[int, int]] = {id: {} for id in ids}
+
+        for i, w in enumerate(windows):
+            for j, id in enumerate(w):
+                id2win2pos[id][i] = j
+
+        n = len(windows)
+        # relation between windows:
+        order = np.zeros((n, n), dtype=np.int8)
+        for i, w in enumerate(windows):
+            for j, x in enumerate(windows):
+                if i == j:
+                    continue
+                if len(set(w).intersection(x)) == 0:
+                    continue
+                id = list(set(w).intersection(x))[0]  # any common id
+                if id2win2pos[id][i] > id2win2pos[id][j]:
+                    order[i, j] = -1  # win i is before win j
+                else:
+                    order[i, j] = 1  # win i is after win j
+
+        # find groups of windows that overlap, like connected components in a graph
+        groups = [[0]]
+        for i in range(1, n):
+            found = False
+            for g in groups:
+                if any(order[i, j] != 0 for j in g):
+                    g.append(i)
+                    found = True
+                    break
+            if not found:
+                groups.append([i])
+
+        # split each group into roughly equal parts
+        new_windows = []
+        max_window_len = max(len(w) for w in windows)
+        for g in groups:
+            # find total ordering among windows in group based on order matrix
+            # (this is a topological sort)
+            _g = np.array(g)
+            order_matrix = order[_g][:, _g]
+            ordered_window_indices = topological_sort(order_matrix)
+            ordered_window_ids = [windows[i] for i in _g[ordered_window_indices]]
+            flattened = [id for w in ordered_window_ids for id in w]
+            flattened_deduped = list(dict.fromkeys(flattened))
+            # split into k parts where k is the smallest integer such that
+            # each part has length <= max_window_len
+            k = max(1, int(ceil(len(flattened_deduped) / max_window_len)))
+            new_windows += np.array_split(flattened_deduped, k)
+
+        return [w.tolist() for w in new_windows]
+
     @abstractmethod
     def get_all_documents(self) -> List[Document]:
         """
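Illustration (plain Python/NumPy, not from the package) of the de-overlapping step in `remove_overlaps`: overlapping windows are concatenated in order, de-duplicated, then split back into roughly equal parts no longer than the original window size.

    import numpy as np

    w1, w2 = ["a", "b", "c"], ["b", "c", "d"]                 # overlapping windows
    flattened = [i for w in (w1, w2) for i in w]              # a b c b c d
    deduped = list(dict.fromkeys(flattened))                  # a b c d
    k = int(np.ceil(len(deduped) / max(len(w1), len(w2))))    # 2 parts
    print([p.tolist() for p in np.array_split(deduped, k)])   # [['a', 'b'], ['c', 'd']]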
langroid/vector_store/chromadb.py
CHANGED
@@ -109,14 +109,17 @@ class ChromaDB(VectorStore):
         if documents is None:
             return
         contents: List[str] = [document.content for document in documents]
-        metadatas
-
-
+        # convert metadatas to dicts so chroma can handle them
+        metadata_dicts: List[dict[str, Any]] = [d.metadata.dict() for d in documents]
+        for m in metadata_dicts:
+            # chroma does not handle non-atomic types in metadata
+            m["window_ids"] = ",".join(m["window_ids"])
+
         ids = [str(d.id()) for d in documents]
         self.collection.add(
             # embedding_models=embedding_models,
             documents=contents,
-            metadatas=
+            metadatas=metadata_dicts,
             ids=ids,
         )

@@ -145,7 +148,8 @@ class ChromaDB(VectorStore):
             include=["documents", "distances", "metadatas"],
         )
         docs = self._docs_from_results(results)
-
+        # chroma distances are 1 - cosine.
+        scores = [1 - s for s in results["distances"][0]]
         return list(zip(docs, scores))

     def _docs_from_results(self, results: Dict[str, Any]) -> List[Document]:
@@ -164,22 +168,11 @@ class ChromaDB(VectorStore):
             for i, c in enumerate(contents):
                 print_long_text("red", "italic red", f"MATCH-{i}", c)
         metadatas = results["metadatas"][0]
+        for m in metadatas:
+            # restore the stringified list of window_ids into the original List[str]
+            m["window_ids"] = m["window_ids"].split(",")
         docs = [
             Document(content=d, metadata=DocMetaData(**m))
             for d, m in zip(contents, metadatas)
         ]
         return docs
-
-
-# Example usage and testing
-# chroma_db = ChromaDB.from_documents(
-#     collection_name="all-my-documents",
-#     documents=["doc1000101", "doc288822"],
-#     metadatas=[{"style": "style1"}, {"style": "style2"}],
-#     ids=["uri9", "uri10"]
-# )
-# results = chroma_db.query(
-#     query_texts=["This is a query document"],
-#     n_results=2
-# )
-# print(results)
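Round-trip sketch of the window_ids handling above: Chroma metadata values must be atomic, so the List[str] is stored as a comma-joined string and split back on retrieval.

    window_ids = ["id1", "id2", "id3"]
    stored = ",".join(window_ids)        # "id1,id2,id3"
    assert stored.split(",") == window_ids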
langroid/vector_store/meilisearch.py
CHANGED
@@ -263,6 +263,7 @@ class MeiliSearch(VectorStore):
         text: str,
         k: int = 20,
         where: Optional[str] = None,
+        neighbors: int = 0,  # ignored
     ) -> List[Tuple[Document, float]]:
         filter = [] if where is None else where
         if self.config.collection_name is None:
langroid/vector_store/momento.py
CHANGED
@@ -222,6 +222,7 @@ class MomentoVI(VectorStore):
         text: str,
         k: int = 1,
         where: Optional[str] = None,
+        neighbors: int = 0,  # ignored
     ) -> List[Tuple[Document, float]]:
         if self.config.collection_name is None:
             raise ValueError("No collection name set, cannot search")
langroid/vector_store/qdrantdb.py
CHANGED
@@ -244,7 +244,11 @@ class QdrantDB(VectorStore):
             with_vectors=False,
             with_payload=True,
         )
-
+        # Note the records may NOT be in the order of the ids,
+        # so we re-order them here.
+        id2payload = {record.id: record.payload for record in records}
+        ordered_payloads = [id2payload[id] for id in _ids]
+        docs = [Document(**payload) for payload in ordered_payloads]  # type: ignore
         return docs

     def similar_texts_with_scores(
@@ -252,6 +256,7 @@ class QdrantDB(VectorStore):
         text: str,
         k: int = 1,
         where: Optional[str] = None,
+        neighbors: int = 0,
     ) -> List[Tuple[Document, float]]:
         embedding = self.embedding_fn([text])[0]
         # TODO filter may not work yet
@@ -268,7 +273,7 @@ class QdrantDB(VectorStore):
                 exact=False,  # use Apx NN, not exact NN
             ),
         )
-        scores = [match.score for match in search_result]
+        scores = [match.score for match in search_result if match is not None]
         docs = [
             Document(**(match.payload))  # type: ignore
             for match in search_result
@@ -277,8 +282,9 @@ class QdrantDB(VectorStore):
         if len(docs) == 0:
             logger.warning(f"No matches found for {text}")
             return []
-        if settings.debug:
-            logger.info(f"Found {len(docs)} matches, max score: {max(scores)}")
         doc_score_pairs = list(zip(docs, scores))
+        max_score = max(ds[1] for ds in doc_score_pairs)
+        if settings.debug:
+            logger.info(f"Found {len(doc_score_pairs)} matches, max score: {max_score}")
         self.show_if_debug(doc_score_pairs)
         return doc_score_pairs
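Sketch of the re-ordering fix above (hypothetical ids/payloads): retrieved records are keyed by id so the returned documents follow the order of the requested ids rather than Qdrant's return order.

    ids = ["b", "a", "c"]
    records = [("a", {"n": 1}), ("c", {"n": 3}), ("b", {"n": 2})]  # unordered
    id2payload = {rid: payload for rid, payload in records}
    print([id2payload[i] for i in ids])  # [{'n': 2}, {'n': 1}, {'n': 3}]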
{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/RECORD
CHANGED
@@ -1,16 +1,16 @@
 langroid/__init__.py,sha256=-AWkFhhW0b0paHQ11SORyIVPnXv0nyT2X_0_xh3zLjw,408
 langroid/agent/__init__.py,sha256=ZqDw3Ktw7XGDl6mC8DN61F71V4ckf0rBoEOydH9l6C4,428
 langroid/agent/base.py,sha256=NjRf_y5ymZqpFlXh2sK94lcJRJbrBhw5pd1_Qe1Da_E,30151
-langroid/agent/batch.py,sha256=
+langroid/agent/batch.py,sha256=p5UPdvRn6QOpb3V4B517nPCF7nZemKk7_9YlJ7jR7w4,5450
 langroid/agent/chat_agent.py,sha256=qjCwvR7i9DtonTmm1d1mbBHN4aW0LzxABAL-2JfGcF8,33548
 langroid/agent/chat_document.py,sha256=k7Klav3FIBTf2w95bQtxgqBrf2fMo1ydSlklQvv4RCg,6252
 langroid/agent/helpers.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/junk,sha256=LxfuuW7Cijsg0szAzT81OjWWv1PMNI-6w_-DspVIO2s,339
 langroid/agent/special/__init__.py,sha256=ciNhdoIIjFxNk-5xcy8H76A3d-TldbIYaFexlgfN-2A,575
-langroid/agent/special/doc_chat_agent.py,sha256=
+langroid/agent/special/doc_chat_agent.py,sha256=ko6_oYJuv70zQ0mvLXcQMj_ZB1UVE8bw2V2Ng-FoVoI,33425
 langroid/agent/special/recipient_validator_agent.py,sha256=R3Rit93BNWQar_9stuDBGzmLr2W-IYOQ7oq-tlNNlps,6035
 langroid/agent/special/relevance_extractor_agent.py,sha256=JU52PbY5FO72kfnA902-UKzVgxExndlwEC7Lb-XqDNI,4348
-langroid/agent/special/retriever_agent.py,sha256=
+langroid/agent/special/retriever_agent.py,sha256=ze8jXJW9A_twsrRXVECAQCYicfjm8-a6qv1vDk41AAc,6573
 langroid/agent/special/sql/__init__.py,sha256=3kR5nC0wnYIzmMrr9L8RJa7JAJpbwBLx7KKygiwz0v0,111
 langroid/agent/special/sql/sql_chat_agent.py,sha256=Ua_gfK_1k5ct59Zkbe78bzs-2jabtFkEVx76a0pGs9Y,12867
 langroid/agent/special/sql/utils/__init__.py,sha256=_IBHt3iNXvPqxvDrs5_T86qdj0gPugVGnGNi6Cx7F-I,238
@@ -45,24 +45,24 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=wj2e6j7R9d3m63HCbSD
 langroid/language_models/prompt_formatter/base.py,sha256=2y_GcwhstvB5ih3haS7l5Fv79jVnFJ_vEw1jqWJzB9k,1247
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
 langroid/language_models/utils.py,sha256=rmnSn-sJ3aKl_wBdeLPkck0Li4Ed6zkCxZYYl7n1V34,4668
-langroid/mytypes.py,sha256=
+langroid/mytypes.py,sha256=XmEUL_xAZfeWuJLEvQe_4g-W9P7rpY6zOIAHhtYikwk,2363
 langroid/parsing/__init__.py,sha256=_EZ8iuixxU39zuaydtfjyap8g9C_c1dnrCQ0QR81U2E,340
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code-parsing.md,sha256=--cyyNiSZSDlIwcjAV4-shKrSiRe2ytF3AdSoS_hD2g,3294
 langroid/parsing/code_parser.py,sha256=BbDAzp35wkYQ9U1dpf1ARL0lVyi0tfqEc6_eox2C090,3727
 langroid/parsing/config.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=YC3IXQ9ErpBGBZh6Be9gfJWHcTwGTSMfNQMT5ARrj5g,14615
 langroid/parsing/json.py,sha256=MVqBUfInALQm1QKbcfEvLzWxBz_UztCIyGk7AK5uFPo,1650
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=3EVPkOfXehZwUvdM-tn7LN951722_2c7umGtwzwdxts,9297
 langroid/parsing/repo_loader.py,sha256=nmtvorVip4VQbUMDxoxpVyAlbLt8R8eJjxpAX0vVlfs,27695
-langroid/parsing/search.py,sha256=
+langroid/parsing/search.py,sha256=h-C0Ij111cI7lcddr_vdABjfNKXDqBkJVG48WyJCovA,8424
 langroid/parsing/spider.py,sha256=aX0ucHQ9SVgpieNjtEn_G1bhq5DH_03VpBXoxcdZPl8,3008
 langroid/parsing/table_loader.py,sha256=uqbupGr4y_7os18RtaY5GpD0hWcgzROoNy8dQIHB4kc,1767
 langroid/parsing/url_loader.py,sha256=dhmUTysS_YZyIXVAekxCGPiCbFsOsHXj_eHMow0xoGQ,2153
 langroid/parsing/url_loader_cookies.py,sha256=Lg4sNpRz9MByWq2mde6T0hKv68VZSV3mtMjNEHuFeSU,2327
 langroid/parsing/urls.py,sha256=vJ-ZJROtmLwykoE690w5y0BxWN2QOpbxR4hy03knx6Q,7520
-langroid/parsing/utils.py,sha256=
+langroid/parsing/utils.py,sha256=nuCW_sRe5js0d-K6EtDEIbFQpMicS1ntr3FXxtYtGzw,7639
 langroid/parsing/web_search.py,sha256=hGUVoSJNdpoT5rsm-ikAteMiUropHrzKaxN8EVVqO2U,2496
 langroid/prompts/__init__.py,sha256=aTW86CbDZM7tntqiTVeNLYJv7pbRDcKOI3qHVXCEHUY,99
 langroid/prompts/dialog.py,sha256=SpfiSyofSgy2pwD1YboHR_yHO3LEEMbv6j2sm874jKo,331
@@ -70,7 +70,8 @@ langroid/prompts/prompts_config.py,sha256=EMK1Fm7EmS8y3CV4AkrVgn5K4NipiM4m7J8819
 langroid/prompts/templates.py,sha256=4X-07tnmUQ8Z_zaWRQAUUyKiErGztp3tERujqnG8sGA,6369
 langroid/prompts/transforms.py,sha256=GsQo1klGxUy0fACh6j0lTblk6XEl2erRnhRWlN2M4-c,2706
 langroid/utils/__init__.py,sha256=3aMfdwFizpl3W2H5Q-TMqUFqMoYgec1NiX-caSnClmQ,167
-langroid/utils/
+langroid/utils/algorithms/graph.py,sha256=5D7scuxeofllU6xh8_tIcc2WiHVn0MjVQ7lSPrOgKr4,1173
+langroid/utils/configuration.py,sha256=p_MlevqGdS3681u2IiDgrMXBCytg9xZwQH5OK9PUNno,3044
 langroid/utils/constants.py,sha256=edJ5J-sC9CeUwwNey_uLQbGbHgjX-T8XLf_J53h3Tys,484
 langroid/utils/docker.py,sha256=kJQOLTgM0x9j9pgIIqp0dZNZCTvoUDhp6i8tYBq1Jr0,1105
 langroid/utils/globals.py,sha256=UubMelOGkLy3BxByl1vprITU4dbysZmCtYBvZWL8dto,1337
@@ -79,20 +80,20 @@ langroid/utils/llms/strings.py,sha256=CSAX9Z6FQOLXOzbLMe_Opqtc3ruDAKTTk7cPqc6Blh
 langroid/utils/logging.py,sha256=xXpohbvK74_reomdkIWTeyDjGG8GT1fuU7zcLL3Ngt8,3951
 langroid/utils/output/__init__.py,sha256=IpfqnCkfXa4HaOx39EMUhXuA7GPZFd7N_QMm1n43C_I,174
 langroid/utils/output/printing.py,sha256=RZoY8S-8UljiVURe5o5SljpzwF77LTCO7-68nf_uvA4,2277
-langroid/utils/pydantic_utils.py,sha256=
+langroid/utils/pydantic_utils.py,sha256=00ajeBTvxJEOyqd7M7FveRz7oa9wdQ0QFKvOjy_ZfRE,6296
 langroid/utils/system.py,sha256=LyFrSPfvAnhA8GSRjT-2HOkLzxmziZ8wfpDYMqSv01M,1518
 langroid/utils/web/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/utils/web/login.py,sha256=1iz9eUAHa87vpKIkzwkmFa00avwFWivDSAr7QUhK7U0,2528
 langroid/utils/web/selenium_login.py,sha256=mYI6EvVmne34N9RajlsxxRqJQJvV-WG4LGp6sEECHPw,1156
 langroid/vector_store/__init__.py,sha256=NhAXOCKX_x2whfghOn44e0O3-vV0nJRz6ZLsCBqYFyQ,242
-langroid/vector_store/base.py,sha256=
-langroid/vector_store/chromadb.py,sha256=
+langroid/vector_store/base.py,sha256=VQb_7EIJ1r3g-fzyP4b-WSfZg73rtdYsILIcHZLl4NM,11989
+langroid/vector_store/chromadb.py,sha256=EJONjIa77Bkr8ych5JLykYV9n-DP_9jqFechmmZHHwI,6803
 langroid/vector_store/lancedb.py,sha256=_d7Mz7O8j4keYgHzFSpEOBFq6L13kDJ3eQOZAIrIaOc,11262
-langroid/vector_store/meilisearch.py,sha256=
-langroid/vector_store/momento.py,sha256=
+langroid/vector_store/meilisearch.py,sha256=aQ5Bo-Rk-BnMxbcCTpR7yVm4aNNZHy4hlJBJxn-UpYw,11207
+langroid/vector_store/momento.py,sha256=krw1KwyVRE-ekq1KUAktsMxrJfeolsAC5BmK-1zdxsg,9930
 langroid/vector_store/qdrant_cloud.py,sha256=3im4Mip0QXLkR6wiqVsjV1QvhSElfxdFSuDKddBDQ-4,188
-langroid/vector_store/qdrantdb.py,sha256=
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
-langroid-0.1.
+langroid/vector_store/qdrantdb.py,sha256=YfH0t5nzBBMmwyH0_QndQNnrSfv_3_LFpjlVzcEhbso,11409
+langroid-0.1.102.dist-info/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.1.102.dist-info/WHEEL,sha256=vVCvjcmxuUltf8cYhJ0sJMRDLr1XsPuxEId8YDzbyCY,88
+langroid-0.1.102.dist-info/METADATA,sha256=zcrmh544o2NXQXuOhUt4YTSf9P6McMB3WQQUxVYkp_g,38599
+langroid-0.1.102.dist-info/RECORD,,
{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/LICENSE
File without changes

{langroid-0.1.101.dist-info → langroid-0.1.102.dist-info}/WHEEL
File without changes