langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/__init__.py +70 -0
- langroid/agent/__init__.py +22 -0
- langroid/agent/base.py +120 -33
- langroid/agent/batch.py +134 -35
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +608 -0
- langroid/agent/chat_agent.py +164 -100
- langroid/agent/chat_document.py +19 -2
- langroid/agent/openai_assistant.py +20 -10
- langroid/agent/special/__init__.py +33 -10
- langroid/agent/special/doc_chat_agent.py +521 -108
- langroid/agent/special/lance_doc_chat_agent.py +258 -0
- langroid/agent/special/lance_rag/__init__.py +9 -0
- langroid/agent/special/lance_rag/critic_agent.py +136 -0
- langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
- langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
- langroid/agent/special/lance_tools.py +44 -0
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
- langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
- langroid/agent/special/neo4j/utils/__init__.py +0 -0
- langroid/agent/special/neo4j/utils/system_message.py +46 -0
- langroid/agent/special/relevance_extractor_agent.py +23 -7
- langroid/agent/special/retriever_agent.py +29 -174
- langroid/agent/special/sql/__init__.py +7 -0
- langroid/agent/special/sql/sql_chat_agent.py +47 -23
- langroid/agent/special/sql/utils/__init__.py +11 -0
- langroid/agent/special/sql/utils/description_extractors.py +95 -46
- langroid/agent/special/sql/utils/populate_metadata.py +28 -21
- langroid/agent/special/table_chat_agent.py +43 -9
- langroid/agent/task.py +423 -114
- langroid/agent/tool_message.py +67 -10
- langroid/agent/tools/__init__.py +8 -0
- langroid/agent/tools/duckduckgo_search_tool.py +66 -0
- langroid/agent/tools/google_search_tool.py +11 -0
- langroid/agent/tools/metaphor_search_tool.py +67 -0
- langroid/agent/tools/recipient_tool.py +6 -24
- langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
- langroid/cachedb/__init__.py +6 -0
- langroid/embedding_models/__init__.py +24 -0
- langroid/embedding_models/base.py +9 -1
- langroid/embedding_models/models.py +117 -17
- langroid/embedding_models/protoc/embeddings.proto +19 -0
- langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
- langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
- langroid/embedding_models/remote_embeds.py +153 -0
- langroid/language_models/__init__.py +22 -0
- langroid/language_models/azure_openai.py +47 -4
- langroid/language_models/base.py +26 -10
- langroid/language_models/config.py +5 -0
- langroid/language_models/openai_gpt.py +407 -121
- langroid/language_models/prompt_formatter/__init__.py +9 -0
- langroid/language_models/prompt_formatter/base.py +4 -6
- langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
- langroid/language_models/utils.py +10 -9
- langroid/mytypes.py +10 -4
- langroid/parsing/__init__.py +33 -1
- langroid/parsing/document_parser.py +259 -63
- langroid/parsing/image_text.py +32 -0
- langroid/parsing/parse_json.py +143 -0
- langroid/parsing/parser.py +20 -7
- langroid/parsing/repo_loader.py +108 -46
- langroid/parsing/search.py +8 -0
- langroid/parsing/table_loader.py +44 -0
- langroid/parsing/url_loader.py +59 -13
- langroid/parsing/urls.py +18 -9
- langroid/parsing/utils.py +130 -9
- langroid/parsing/web_search.py +73 -0
- langroid/prompts/__init__.py +7 -0
- langroid/prompts/chat-gpt4-system-prompt.md +68 -0
- langroid/prompts/prompts_config.py +1 -1
- langroid/utils/__init__.py +10 -0
- langroid/utils/algorithms/__init__.py +3 -0
- langroid/utils/configuration.py +0 -1
- langroid/utils/constants.py +4 -0
- langroid/utils/logging.py +2 -5
- langroid/utils/output/__init__.py +15 -2
- langroid/utils/output/status.py +33 -0
- langroid/utils/pandas_utils.py +30 -0
- langroid/utils/pydantic_utils.py +446 -4
- langroid/utils/system.py +36 -1
- langroid/vector_store/__init__.py +34 -2
- langroid/vector_store/base.py +33 -2
- langroid/vector_store/chromadb.py +42 -13
- langroid/vector_store/lancedb.py +226 -60
- langroid/vector_store/meilisearch.py +7 -6
- langroid/vector_store/momento.py +3 -2
- langroid/vector_store/qdrantdb.py +82 -11
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
- langroid-0.1.219.dist-info/RECORD +127 -0
- langroid/agent/special/recipient_validator_agent.py +0 -157
- langroid/parsing/json.py +0 -64
- langroid/utils/web/selenium_login.py +0 -36
- langroid-0.1.139.dist-info/RECORD +0 -103
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
- {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
--- langroid/agent/special/doc_chat_agent.py (0.1.139)
+++ langroid/agent/special/doc_chat_agent.py (0.1.219)
@@ -12,16 +12,17 @@ langroid with the [hf-embeddings] extra, e.g.:
 
 pip install "langroid[hf-embeddings]"
 
 """
+
 import logging
 from contextlib import ExitStack
-from
+from functools import cache
+from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
 
+import nest_asyncio
 import numpy as np
-
-from rich.console import Console
+import pandas as pd
 from rich.prompt import Prompt
 
-from langroid.agent.base import Agent
 from langroid.agent.batch import run_batch_tasks
 from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
 from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
@@ -34,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -41,20 +43,26 @@ from langroid.parsing.search import (
     find_fuzzy_matches_in_docs,
     preprocess_text,
 )
+from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user,
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.configuration import settings
 from langroid.utils.constants import NO_ANSWER
-from langroid.utils.output
-from langroid.
+from langroid.utils.output import show_if_debug, status
+from langroid.utils.pydantic_utils import dataframe_to_documents, extract_fields
+from langroid.vector_store.base import VectorStore, VectorStoreConfig
 from langroid.vector_store.lancedb import LanceDBConfig
 
-logger = logging.getLogger(__name__)
 
-
+@cache
+def apply_nest_asyncio() -> None:
+    nest_asyncio.apply()
+
+
+logger = logging.getLogger(__name__)
 
 DEFAULT_DOC_CHAT_INSTRUCTIONS = """
 Your task is to answer questions about various documents.
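
The new `apply_nest_asyncio()` helper above guards `nest_asyncio.apply()` behind `functools.cache`, making the event-loop patch idempotent. A minimal sketch of the pattern (not langroid code; assumes `nest_asyncio` is installed):

```python
from functools import cache

import nest_asyncio


@cache
def apply_nest_asyncio() -> None:
    # functools.cache memoizes the zero-arg call, so the patch is applied
    # at most once per process, however many call sites invoke this.
    nest_asyncio.apply()


apply_nest_asyncio()  # first call patches the running loop
apply_nest_asyncio()  # every later call is a cached no-op
```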
@@ -66,25 +74,29 @@ DEFAULT_DOC_CHAT_SYSTEM_MESSAGE = """
 You are a helpful assistant, helping me understand a collection of documents.
 """
 
+has_sentence_transformers = False
+try:
+    from sentence_transformer import SentenceTransformer  # noqa: F401
 
-
-
-
-    max_context_tokens (int): threshold to use for various steps, e.g.
-        if we are able to fit the current stage of doc processing into
-        this many tokens, we skip additional compression steps, and
-        use the current docs as-is in the context
-    conversation_mode (bool): if True, we will accumulate message history,
-        and pass entire history to LLM at each round.
-        If False, each request to LLM will consist only of the
-        initial task messages plus the current query.
-    """
+    has_sentence_transformers = True
+except ImportError:
+    pass
 
+
+class DocChatAgentConfig(ChatAgentConfig):
     system_message: str = DEFAULT_DOC_CHAT_SYSTEM_MESSAGE
     user_message: str = DEFAULT_DOC_CHAT_INSTRUCTIONS
     summarize_prompt: str = SUMMARY_ANSWER_PROMPT_GPT4
-
-
+    # extra fields to include in content as key=value pairs
+    # (helps retrieval for table-like data)
+    add_fields_to_content: List[str] = []
+    filter_fields: List[str] = []  # fields usable in filter
+    retrieve_only: bool = False  # only retr relevant extracts, don't gen summary answer
+    extraction_granularity: int = 1  # granularity (in sentences) for relev extraction
+    filter: str | None = (
+        None  # filter condition for various lexical/semantic search fns
+    )
+    conversation_mode: bool = True  # accumulate message history?
     # In assistant mode, DocChatAgent receives questions from another Agent,
     # and those will already be in stand-alone form, so in this mode
     # there is no need to convert them to stand-alone form.
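
This hunk replaces part of the old config docstring with an optional-dependency probe: attempt the import, set a module-level flag, swallow `ImportError`. Note the probe imports `sentence_transformer` (singular), while `rerank_with_cross_encoder` further below imports from `sentence_transformers`; if the singular module is absent, the flag stays False and the cross-encoder default in the next hunk is disabled. The general shape of the probe, sketched with a hypothetical package name:

```python
# Sketch of the optional-dependency probe pattern (fancy_lib is hypothetical)
has_fancy_lib = False
try:
    import fancy_lib  # noqa: F401

    has_fancy_lib = True
except ImportError:
    pass

# downstream defaults can then key off the flag, as the config below does
default_model = "fancy-model-v1" if has_fancy_lib else ""
```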
@@ -100,17 +112,22 @@ class DocChatAgentConfig(ChatAgentConfig):
     n_fuzzy_neighbor_words: int = 100  # num neighbor words to retrieve for fuzzy match
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
-    cross_encoder_reranking_model: str =
+    cross_encoder_reranking_model: str = (
+        "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
+    )
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
-
-
+    split: bool = True  # use chunking
+    relevance_extractor_config: None | RelevanceExtractorAgentConfig = (
+        RelevanceExtractorAgentConfig(
+            llm=None  # use the parent's llm unless explicitly set here
+        )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
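
Taken together, the new `DocChatAgentConfig` fields control chunking, filtering, and retrieval behavior. A hedged usage sketch (the path is made up; assumes an OpenAI key is configured):

```python
from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig

config = DocChatAgentConfig(
    doc_paths=["docs/report.pdf"],    # hypothetical local path
    retrieve_only=True,               # return extracts, skip LLM summary answer
    extraction_granularity=2,         # relevance extraction in 2-sentence units
    add_fields_to_content=["genre"],  # prepend genre=... to each chunk's content
    filter_fields=["genre", "year"],  # fields usable in filter expressions
    split=False,                      # ingest docs whole, as single chunks
)
agent = DocChatAgent(config)  # ingests doc_paths on construction
```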
@@ -135,7 +152,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
             # Try one that works for your use case.
-            # or "
+            # or "unstructured", "pdfplumber", "fitz", "pypdf"
             library="pdfplumber",
         ),
     )
@@ -156,7 +173,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         collection_name="doc-chat-lancedb",
         replace_collection=True,
         storage_path=".lancedb/data/",
-        embedding=hf_embed_config,
+        embedding=hf_embed_config if has_sentence_transformers else oai_embed_config,
     )
     llm: OpenAIGPTConfig = OpenAIGPTConfig(
         type="openai",
@@ -180,14 +197,40 @@ class DocChatAgent(ChatAgent):
     ):
         super().__init__(config)
         self.config: DocChatAgentConfig = config
-        self.original_docs:
+        self.original_docs: List[Document] = []
         self.original_docs_length = 0
-        self.
-        self.
+        self.from_dataframe = False
+        self.df_description = ""
+        self.chunked_docs: List[Document] = []
+        self.chunked_docs_clean: List[Document] = []
         self.response: None | Document = None
         if len(config.doc_paths) > 0:
             self.ingest()
 
+    def clear(self) -> None:
+        """Clear the document collection and the specific collection in vecdb"""
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        self.original_docs = []
+        self.original_docs_length = 0
+        self.chunked_docs = []
+        self.chunked_docs_clean = []
+        collection_name = self.vecdb.config.collection_name
+        if collection_name is None:
+            return
+        try:
+            # Note we may have used a vecdb with a config.collection_name
+            # different from the agent's config.vecdb.collection_name!!
+            self.vecdb.delete_collection(collection_name)
+            self.vecdb = VectorStore.create(self.vecdb.config)
+        except Exception as e:
+            logger.warning(
+                f"""
+                Error while deleting collection {collection_name}:
+                {e}
+                """
+            )
+
     def ingest(self) -> None:
         """
         Chunk + embed + store docs specified by self.config.doc_paths
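
The new `clear()` makes one agent reusable across document sets: it resets the in-memory doc/chunk state and recreates the vecdb collection. A sketch, continuing the `agent` from the config example above (paths are made up):

```python
agent.ingest_doc_paths(["notes_v1.md"])  # hypothetical path
agent.clear()                            # drop collection, reset cached chunks
agent.ingest_doc_paths(["notes_v2.md"])  # re-ingest into a fresh collection
```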
@@ -204,63 +247,316 @@ class DocChatAgent(ChatAgent):
             # do keyword and other non-vector searches
             if self.vecdb is None:
                 raise ValueError("VecDB not set")
-            self.
-            # used for lexical similarity e.g. keyword search (bm25 etc)
-            self.chunked_docs_clean = [
-                Document(content=preprocess_text(d.content), metadata=d.metadata)
-                for d in self.chunked_docs
-            ]
+            self.setup_documents(filter=self.config.filter)
             return
-
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
+
+    def ingest_doc_paths(
+        self,
+        paths: str | bytes | List[str | bytes],
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+        doc_type: str | DocumentType | None = None,
+    ) -> List[Document]:
+        """Split, ingest docs from specified paths,
+        do not add these to config.doc_paths.
+
+        Args:
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
+            metadata: List of metadata dicts, one for each path.
+                If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
+        Returns:
+            List of Document objects
+        """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
+        all_paths = paths
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
+        if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
+            metadata, list
+        ):
+            if isinstance(metadata, list):
+                idx2meta = {
+                    p: (
+                        m
+                        if isinstance(m, dict)
+                        else (isinstance(m, DocMetaData) and m.dict())
+                    )  # appease mypy
+                    for p, m in zip(idxs, metadata)
+                }
+            elif isinstance(metadata, dict):
+                idx2meta = {p: metadata for p in idxs}
+            else:
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-
-
-
-
-
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
+                url_docs = loader.load()
+                # update metadata of each doc with meta
+                for d in url_docs:
+                    d.metadata = d.metadata.copy(update=meta)
+                docs.extend(url_docs)
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
+                # update metadata of each doc with meta
+                for d in path_docs:
+                    d.metadata = d.metadata.copy(update=meta)
                 docs.extend(path_docs)
         n_docs = len(docs)
-        n_splits = self.ingest_docs(docs)
+        n_splits = self.ingest_docs(docs, split=self.config.split)
         if n_docs == 0:
-            return
+            return []
         n_urls = len(urls)
         n_paths = len(paths)
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths}
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-
-        print("\n".join(
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
+        return docs
 
-    def ingest_docs(
+    def ingest_docs(
+        self,
+        docs: List[Document],
+        split: bool = True,
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+    ) -> int:
         """
         Chunk docs into pieces, map each chunk to vec-embedding, store in vec-db
+
+        Args:
+            docs: List of Document objects
+            split: Whether to split docs into chunks. Default is True.
+                If False, docs are treated as "chunks" and are not split.
+            metadata: List of metadata dicts, one for each doc, to augment
+                whatever metadata is already in the doc.
+                [ASSUME no conflicting keys between the two metadata dicts.]
+                If a single dict is passed in, it is used for all docs.
         """
-
+        if isinstance(metadata, list) and len(metadata) > 0:
+            for d, m in zip(docs, metadata):
+                d.metadata = d.metadata.copy(
+                    update=m if isinstance(m, dict) else m.dict()  # type: ignore
+                )
+        elif isinstance(metadata, dict):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata)
+        elif isinstance(metadata, DocMetaData):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata.dict())
+
+        self.original_docs.extend(docs)
         if self.parser is None:
             raise ValueError("Parser not set")
         for d in docs:
             if d.metadata.id in [None, ""]:
                 d.metadata.id = d._unique_hash_id()
-
-
-
-
-        for d in
-
+        if split:
+            docs = self.parser.split(docs)
+        else:
+            # treat each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")
+
+        # If any additional fields need to be added to content,
+        # add them as key=value pairs for all docs, before batching.
+        # This helps retrieval for table-like data.
+        # Note we need to do this at stage so that the embeddings
+        # are computed on the full content with these additional fields.
+        if len(self.config.add_fields_to_content) > 0:
+            fields = [
+                f for f in extract_fields(docs[0], self.config.add_fields_to_content)
+            ]
+            if len(fields) > 0:
+                for d in docs:
+                    key_vals = extract_fields(d, fields)
+                    d.content = (
+                        ",".join(f"{k}={v}" for k, v in key_vals.items())
+                        + ",content="
+                        + d.content
+                    )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
             self.vecdb.add_documents(batch)
         self.original_docs_length = self.doc_length(docs)
+        self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
 
+    @staticmethod
+    def document_compatible_dataframe(
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> Tuple[pd.DataFrame, List[str]]:
+        """
+        Convert dataframe so it is compatible with Document class:
+        - has "content" column
+        - has an "id" column to be used as Document.metadata.id
+
+        Args:
+            df: dataframe to convert
+            content: name of content column
+            metadata: list of metadata column names
+
+        Returns:
+            Tuple[pd.DataFrame, List[str]]: dataframe, metadata
+                - dataframe: dataframe with "content" column and "id" column
+                - metadata: list of metadata column names, including "id"
+        """
+        if content not in df.columns:
+            raise ValueError(
+                f"""
+                Content column {content} not in dataframe,
+                so we cannot ingest into the DocChatAgent.
+                Please specify the `content` parameter as a suitable
+                text-based column in the dataframe.
+                """
+            )
+        if content != "content":
+            # rename content column to "content", leave existing column intact
+            df = df.rename(columns={content: "content"}, inplace=False)
+
+        actual_metadata = metadata.copy()
+        if "id" not in df.columns:
+            docs = dataframe_to_documents(df, content="content", metadata=metadata)
+            ids = [str(d.id()) for d in docs]
+            df["id"] = ids
+
+        if "id" not in actual_metadata:
+            actual_metadata += ["id"]
+
+        return df, actual_metadata
+
+    def ingest_dataframe(
+        self,
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> int:
+        """
+        Ingest a dataframe into vecdb.
+        """
+        self.from_dataframe = True
+        self.df_description = describe_dataframe(
+            df, filter_fields=self.config.filter_fields, n_vals=5
+        )
+        df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
+        docs = dataframe_to_documents(df, content="content", metadata=metadata)
+        # When ingesting a dataframe we will no longer do any chunking,
+        # so we mark each doc as a chunk.
+        # TODO - revisit this since we may still want to chunk large text columns
+        for d in docs:
+            d.metadata.is_chunk = True
+        return self.ingest_docs(docs)
+
+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
+    def setup_documents(
+        self,
+        docs: List[Document] = [],
+        filter: str | None = None,
+    ) -> None:
+        """
+        Setup `self.chunked_docs` and `self.chunked_docs_clean`
+        based on possible filter.
+        These will be used in various non-vector-based search functions,
+        e.g. self.get_similar_chunks_bm25(), self.get_fuzzy_matches(), etc.
+
+        Args:
+            docs: List of Document objects. This is empty when we are calling this
+                method after initial doc ingestion.
+            filter: Filter condition for various lexical/semantic search fns.
+        """
+        if filter is None and len(docs) > 0:
+            # no filter, so just use the docs passed in
+            self.chunked_docs.extend(docs)
+        else:
+            if self.vecdb is None:
+                raise ValueError("VecDB not set")
+            self.chunked_docs = self.vecdb.get_all_documents(where=filter or "")
+
+        self.chunked_docs_clean = [
+            Document(content=preprocess_text(d.content), metadata=d.metadata)
+            for d in self.chunked_docs
+        ]
+
+    def get_field_values(self, fields: list[str]) -> Dict[str, str]:
+        """Get string-listing of possible values of each filterable field,
+        e.g.
+        {
+            "genre": "crime, drama, mystery, ... (10 more)",
+            "certificate": "R, PG-13, PG, R",
+        }
+        """
+        field_values: Dict[str, Set[str]] = {}
+        # make empty set for each field
+        for f in fields:
+            field_values[f] = set()
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # get all documents and accumulate possible values of each field until 10
+        docs = self.vecdb.get_all_documents()  # only works for vecdbs that support this
+        for d in docs:
+            # extract fields from d
+            doc_field_vals = extract_fields(d, fields)
+            for field, val in doc_field_vals.items():
+                field_values[field].add(val)
+        # For each field make a string showing list of possible values,
+        # truncate to 20 values, and if there are more, indicate how many
+        # more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
+        field_values_list = {}
+        for f in fields:
+            vals = list(field_values[f])
+            n = len(vals)
+            remaining = n - 20
+            vals = vals[:20]
+            if n > 20:
+                vals.append(f"(...{remaining} more)")
+            # make a string of the values, ensure they are strings
+            field_values_list[f] = ", ".join(str(v) for v in vals)
+        return field_values_list
+
     def doc_length(self, docs: List[Document]) -> int:
         """
         Calc token-length of a list of docs
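
The new dataframe path (`document_compatible_dataframe` plus `ingest_dataframe`) renames the chosen text column to "content", synthesizes an "id" column if needed, marks each row as a chunk, and stores everything via `ingest_docs`. A hedged sketch with made-up column names, reusing `agent` from above:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "review": ["Tense, twisty thriller.", "A warm family drama."],
        "genre": ["thriller", "drama"],
        "year": [2011, 2019],
    }
)
n_chunks = agent.ingest_dataframe(df, content="review", metadata=["genre", "year"])
print(n_chunks)  # 2
print(agent.get_field_values(["genre"]))  # e.g. {'genre': 'thriller, drama'}
```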
@@ -342,10 +638,9 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()
 
-    @no_type_check
     def llm_response(
         self,
         query: None | str | ChatDocument = None,
@@ -362,10 +657,55 @@ class DocChatAgent(ChatAgent):
             query_str = query_str[1:] if query_str is not None else None
             if self.llm is None:
                 raise ValueError("LLM not set")
-            with StreamingIfAllowed(self.llm):
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
                 response = super().llm_response(query_str)
             if query_str is not None:
-                self.update_dialog(
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
+            return response
+        if query_str == "":
+            return None
+        elif query_str == "?" and self.response is not None:
+            return self.justify_response()
+        elif (query_str.startswith(("summar", "?")) and self.response is None) or (
+            query_str == "??"
+        ):
+            return self.summarize_docs()
+        else:
+            self.callbacks.show_start_response(entity="llm")
+            response = self.answer_from_docs(query_str)
+            return ChatDocument(
+                content=response.content,
+                metadata=ChatDocMetaData(
+                    source=response.metadata.source,
+                    sender=Entity.LLM,
+                ),
+            )
+
+    async def llm_response_async(
+        self,
+        query: None | str | ChatDocument = None,
+    ) -> Optional[ChatDocument]:
+        apply_nest_asyncio()
+        if not self.llm_can_respond(query):
+            return None
+        query_str: str | None
+        if isinstance(query, ChatDocument):
+            query_str = query.content
+        else:
+            query_str = query
+        if query_str is None or query_str.startswith("!"):
+            # direct query to LLM
+            query_str = query_str[1:] if query_str is not None else None
+            if self.llm is None:
+                raise ValueError("LLM not set")
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
+                response = await super().llm_response_async(query_str)
+            if query_str is not None:
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
             return response
         if query_str == "":
             return None
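
`llm_response_async` mirrors the synchronous path; the `apply_nest_asyncio()` call at its top is what lets it run inside environments that already have an event loop (e.g. notebooks). A usage sketch with `agent` as above:

```python
import asyncio


async def main() -> None:
    answer = await agent.llm_response_async("What does the report conclude?")
    print(answer.content if answer is not None else "no answer")


asyncio.run(main())
```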
@@ -376,6 +716,7 @@ class DocChatAgent(ChatAgent):
         ):
             return self.summarize_docs()
         else:
+            self.callbacks.show_start_response(entity="llm")
             response = self.answer_from_docs(query_str)
             return ChatDocument(
                 content=response.content,
@@ -407,7 +748,9 @@ class DocChatAgent(ChatAgent):
             ]
         )
 
-    def get_summary_answer(
+    def get_summary_answer(
+        self, question: str, passages: List[Document]
+    ) -> ChatDocument:
         """
         Given a question and a list of (possibly) doc snippets,
         generate an answer if possible
@@ -435,9 +778,6 @@ class DocChatAgent(ChatAgent):
         # 2 new LLMMessage objects:
         # one for `final_prompt`, and one for the LLM response
 
-        # TODO need to "forget" last two messages in message_history
-        # if we are not in conversation mode
-
         if self.config.conversation_mode:
             # respond with temporary context
             answer_doc = super()._llm_response_temp_context(question, final_prompt)
@@ -446,16 +786,23 @@ class DocChatAgent(ChatAgent):
 
         final_answer = answer_doc.content.strip()
         show_if_debug(final_answer, "SUMMARIZE_RESPONSE= ")
-
-        if
-
-
-        else:
+
+        if final_answer.startswith("SOURCE"):
+            # sometimes SOURCE may be shown first,
+            # in this case just use final_answer as-is for both content and source
             content = final_answer
-        sources =
-
+            sources = final_answer
+        else:
+            parts = final_answer.split("SOURCE:", maxsplit=1)
+            if len(parts) > 1:
+                content = parts[0].strip()
+                sources = parts[1].strip()
+            else:
+                content = final_answer
+                sources = ""
+        return ChatDocument(
             content=content,
-            metadata=
+            metadata=ChatDocMetaData(
                 source="SOURCE: " + sources,
                 sender=Entity.LLM,
                 cached=getattr(answer_doc.metadata, "cached", False),
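
The rewritten SOURCE handling splits on the first "SOURCE:" marker and falls back gracefully when the marker is missing or leads the answer. The core logic, isolated for illustration (the strings are made up):

```python
final_answer = "Paris is the capital of France. SOURCE: geography.txt, chunk 3"
parts = final_answer.split("SOURCE:", maxsplit=1)
if len(parts) > 1:
    content, sources = parts[0].strip(), parts[1].strip()
else:
    content, sources = final_answer, ""
assert content == "Paris is the capital of France."
assert sources == "geography.txt, chunk 3"
```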
@@ -465,7 +812,7 @@ class DocChatAgent(ChatAgent):
     def llm_hypothetical_answer(self, query: str) -> str:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with
+        with status("[cyan]LLM generating hypothetical answer..."):
             with StreamingIfAllowed(self.llm, False):
                 # TODO: provide an easy way to
                 # Adjust this prompt depending on context.
@@ -485,7 +832,7 @@ class DocChatAgent(ChatAgent):
    def llm_rephrase_query(self, query: str) -> List[str]:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with
+        with status("[cyan]LLM generating rephrases of query..."):
             with StreamingIfAllowed(self.llm, False):
                 rephrases = self.llm_response_forget(
                     f"""
@@ -501,11 +848,11 @@ class DocChatAgent(ChatAgent):
     ) -> List[Tuple[Document, float]]:
         # find similar docs using bm25 similarity:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with
-            if self.chunked_docs is None:
+        with status("[cyan]Searching for similar chunks using bm25..."):
+            if self.chunked_docs is None or len(self.chunked_docs) == 0:
                 logger.warning("No chunked docs; cannot use bm25-similarity")
                 return []
-            if self.chunked_docs_clean is None:
+            if self.chunked_docs_clean is None or len(self.chunked_docs_clean) == 0:
                 logger.warning("No cleaned chunked docs; cannot use bm25-similarity")
                 return []
             docs_scores = find_closest_matches_with_bm25(
@@ -519,7 +866,7 @@ class DocChatAgent(ChatAgent):
     def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with
+        with status("[cyan]Finding fuzzy matches in chunks..."):
             if self.chunked_docs is None:
                 logger.warning("No chunked docs; cannot use fuzzy matching")
                 return []
@@ -539,7 +886,7 @@ class DocChatAgent(ChatAgent):
     def rerank_with_cross_encoder(
         self, query: str, passages: List[Document]
     ) -> List[Document]:
-        with
+        with status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
             try:
                 from sentence_transformers import CrossEncoder
             except ImportError:
@@ -657,8 +1004,45 @@ class DocChatAgent(ChatAgent):
         """
         if self.vecdb is None or self.config.n_neighbor_chunks == 0:
             return docs_scores
+        if len(docs_scores) == 0:
+            return []
+        if set(docs_scores[0][0].__fields__) != {"content", "metadata"}:
+            # Do not add context window when there are other fields besides just
+            # content and metadata, since we do not know how to set those other fields
+            # for newly created docs with combined content.
+            return docs_scores
         return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
 
+    def get_semantic_search_results(
+        self,
+        query: str,
+        k: int = 10,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Get semantic search results from vecdb.
+        Args:
+            query (str): query to search for
+            k (int): number of results to return
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # Note: for dynamic filtering based on a query, users can
+        # use the `temp_update` context-manager to pass in a `filter` to self.config,
+        # e.g.:
+        # with temp_update(self.config, {"filter": "metadata.source=='source1'"}):
+        #     docs_scores = self.get_semantic_search_results(query, k=k)
+        # This avoids having pass the `filter` argument to every function call
+        # upstream of this one.
+        # The `temp_update` context manager is defined in
+        # `langroid/utils/pydantic_utils.py`
+        return self.vecdb.similar_texts_with_scores(
+            query,
+            k=k,
+            where=self.config.filter,
+        )
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
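
As the comment in `get_semantic_search_results` suggests, a per-query filter can be scoped with the `temp_update` context manager from `langroid/utils/pydantic_utils.py` instead of threading a `filter` argument through every caller. A sketch (the filter syntax depends on the vector-store backend; the query string is made up):

```python
from langroid.utils.pydantic_utils import temp_update

with temp_update(agent.config, {"filter": "metadata.source=='source1'"}):
    docs_scores = agent.get_semantic_search_results("quarterly revenue", k=5)
```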
@@ -695,21 +1079,21 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
 
-        with
+        with status("[cyan]Searching VecDB for relevant doc passages..."):
             docs_and_scores: List[Tuple[Document, float]] = []
             for q in [query] + query_proxies:
-                docs_and_scores += self.
+                docs_and_scores += self.get_semantic_search_results(
                     q,
                     k=self.config.parsing.n_similar_docs * retrieval_multiple,
                 )
         # keep only docs with unique d.id()
         id2doc_score = {d.id(): (d, s) for d, s in docs_and_scores}
         docs_and_scores = list(id2doc_score.values())
-
-        passages = [
-
-
-        ]
+        passages = [d for (d, _) in docs_and_scores]
+        # passages = [
+        #     Document(content=d.content, metadata=d.metadata)
+        #     for (d, _) in docs_and_scores
+        # ]
 
         if self.config.use_bm25_search:
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
@@ -771,7 +1155,7 @@ class DocChatAgent(ChatAgent):
         # Regardless of whether we are in conversation mode or not,
         # for relevant doc/chunk extraction, we must convert the query
         # to a standalone query to get more relevant results.
-        with
+        with status("[cyan]Converting to stand-alone query...[/cyan]"):
             with StreamingIfAllowed(self.llm, False):
                 query = self.llm.followup_to_standalone(self.dialog, query)
         print(f"[orange2]New query: {query}")
@@ -790,7 +1174,7 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return query, []
 
-        with
+        with status("[cyan]LLM Extracting verbatim passages..."):
             with StreamingIfAllowed(self.llm, False):
                 # these are async calls, one per passage; turn off streaming
                 extracts = self.get_verbatim_extracts(query, passages)
@@ -814,8 +1198,15 @@ class DocChatAgent(ChatAgent):
             List[Document]: list of Documents containing extracts and metadata.
         """
         agent_cfg = self.config.relevance_extractor_config
+        if agent_cfg is None:
+            # no relevance extraction: simply return passages
+            return passages
+        if agent_cfg.llm is None:
+            # Use main DocChatAgent's LLM if not provided explicitly:
+            # this reduces setup burden on the user
+            agent_cfg.llm = self.config.llm
         agent_cfg.query = query
-        agent_cfg.segment_length =
+        agent_cfg.segment_length = self.config.extraction_granularity
         agent_cfg.llm.stream = False  # disable streaming for concurrent calls
 
         agent = RelevanceExtractorAgent(agent_cfg)
@@ -831,16 +1222,21 @@ class DocChatAgent(ChatAgent):
             input_map=lambda msg: msg.content,
             output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
         )
-        metadatas = [P.metadata for P in passages]
-        # return with metadata so we can use it downstream, e.g. to cite sources
-        return [
-            Document(content=e, metadata=m)
-            for e, m in zip(extracts, metadatas)
-            if (e != NO_ANSWER and len(e) > 0)
-        ]
 
-
-
+        # Caution: Retain ALL other fields in the Documents (which could be
+        # other than just `content` and `metadata`), while simply replacing
+        # `content` with the extracted portions
+        passage_extracts = []
+        for p, e in zip(passages, extracts):
+            if e == NO_ANSWER or len(e) == 0:
+                continue
+            p_copy = p.copy()
+            p_copy.content = e
+            passage_extracts.append(p_copy)
+
+        return passage_extracts
+
+    def answer_from_docs(self, query: str) -> ChatDocument:
         """
         Answer query based on relevant docs from the VecDB
@@ -850,24 +1246,38 @@ class DocChatAgent(ChatAgent):
         Returns:
             Document: answer
         """
-        response =
+        response = ChatDocument(
             content=NO_ANSWER,
-            metadata=
+            metadata=ChatDocMetaData(
                 source="None",
+                sender=Entity.LLM,
             ),
         )
         # query may be updated to a stand-alone version
         query, extracts = self.get_relevant_extracts(query)
         if len(extracts) == 0:
             return response
+        if self.llm is None:
+            raise ValueError("LLM not set")
+        if self.config.retrieve_only:
+            # only return extracts, skip LLM-based summary answer
+            meta = dict(
+                sender=Entity.LLM,
+            )
+            # copy metadata from first doc, unclear what to do here.
+            meta.update(extracts[0].metadata)
+            return ChatDocument(
+                content="\n\n".join([e.content for e in extracts]),
+                metadata=ChatDocMetaData(**meta),
+            )
         with ExitStack() as stack:
             # conditionally use Streaming or rich console context
             cm = (
                 StreamingIfAllowed(self.llm)
                 if settings.stream
-                else (
+                else (status("LLM Generating final answer..."))
             )
-            stack.enter_context(cm)
+            stack.enter_context(cm)  # type: ignore
             response = self.get_summary_answer(query, extracts)
 
         self.update_dialog(query, response.content)
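
With the new `retrieve_only` short-circuit, `answer_from_docs` returns the verbatim extracts joined by blank lines instead of an LLM-generated summary. A sketch, assuming `agent` already has documents ingested:

```python
agent.config.retrieve_only = True
doc = agent.answer_from_docs("What were the key findings?")
print(doc.content)          # concatenated extracts, no summary generation
print(doc.metadata.sender)  # Entity.LLM
```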
@@ -881,7 +1291,7 @@ class DocChatAgent(ChatAgent):
         """Summarize all docs"""
         if self.llm is None:
             raise ValueError("LLM not set")
-        if self.original_docs
+        if len(self.original_docs) == 0:
             logger.warning(
                 """
                 No docs to summarize! Perhaps you are re-using a previously
@@ -910,19 +1320,22 @@ class DocChatAgent(ChatAgent):
         )
         prompt = f"""
         {instruction}
+
+        FULL TEXT:
         {full_text}
         """.strip()
         with StreamingIfAllowed(self.llm):
-            summary =
-        return summary
+            summary = ChatAgent.llm_response(self, prompt)
+            return summary
 
-    def justify_response(self) -> None:
+    def justify_response(self) -> ChatDocument | None:
         """Show evidence for last response"""
         if self.response is None:
             print("[magenta]No response yet")
-            return
+            return None
         source = self.response.metadata.source
         if len(source) > 0:
             print("[magenta]" + source)
         else:
             print("[magenta]No source found")
+        return None