langroid 0.31.2__py3-none-any.whl → 0.33.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
- langroid-0.33.3.dist-info/RECORD +7 -0
- {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
- langroid-0.33.3.dist-info/entry_points.txt +4 -0
- pyproject.toml +317 -212
- langroid/__init__.py +0 -106
- langroid/agent/.chainlit/config.toml +0 -121
- langroid/agent/.chainlit/translations/bn.json +0 -231
- langroid/agent/.chainlit/translations/en-US.json +0 -229
- langroid/agent/.chainlit/translations/gu.json +0 -231
- langroid/agent/.chainlit/translations/he-IL.json +0 -231
- langroid/agent/.chainlit/translations/hi.json +0 -231
- langroid/agent/.chainlit/translations/kn.json +0 -231
- langroid/agent/.chainlit/translations/ml.json +0 -231
- langroid/agent/.chainlit/translations/mr.json +0 -231
- langroid/agent/.chainlit/translations/ta.json +0 -231
- langroid/agent/.chainlit/translations/te.json +0 -231
- langroid/agent/.chainlit/translations/zh-CN.json +0 -229
- langroid/agent/__init__.py +0 -41
- langroid/agent/base.py +0 -1981
- langroid/agent/batch.py +0 -398
- langroid/agent/callbacks/__init__.py +0 -0
- langroid/agent/callbacks/chainlit.py +0 -598
- langroid/agent/chat_agent.py +0 -1899
- langroid/agent/chat_document.py +0 -454
- langroid/agent/helpers.py +0 -0
- langroid/agent/junk +0 -13
- langroid/agent/openai_assistant.py +0 -882
- langroid/agent/special/__init__.py +0 -59
- langroid/agent/special/arangodb/__init__.py +0 -0
- langroid/agent/special/arangodb/arangodb_agent.py +0 -656
- langroid/agent/special/arangodb/system_messages.py +0 -186
- langroid/agent/special/arangodb/tools.py +0 -107
- langroid/agent/special/arangodb/utils.py +0 -36
- langroid/agent/special/doc_chat_agent.py +0 -1466
- langroid/agent/special/lance_doc_chat_agent.py +0 -262
- langroid/agent/special/lance_rag/__init__.py +0 -9
- langroid/agent/special/lance_rag/critic_agent.py +0 -198
- langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
- langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
- langroid/agent/special/lance_tools.py +0 -61
- langroid/agent/special/neo4j/__init__.py +0 -0
- langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
- langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
- langroid/agent/special/neo4j/system_messages.py +0 -120
- langroid/agent/special/neo4j/tools.py +0 -32
- langroid/agent/special/relevance_extractor_agent.py +0 -127
- langroid/agent/special/retriever_agent.py +0 -56
- langroid/agent/special/sql/__init__.py +0 -17
- langroid/agent/special/sql/sql_chat_agent.py +0 -654
- langroid/agent/special/sql/utils/__init__.py +0 -21
- langroid/agent/special/sql/utils/description_extractors.py +0 -190
- langroid/agent/special/sql/utils/populate_metadata.py +0 -85
- langroid/agent/special/sql/utils/system_message.py +0 -35
- langroid/agent/special/sql/utils/tools.py +0 -64
- langroid/agent/special/table_chat_agent.py +0 -263
- langroid/agent/structured_message.py +0 -9
- langroid/agent/task.py +0 -2093
- langroid/agent/tool_message.py +0 -393
- langroid/agent/tools/__init__.py +0 -38
- langroid/agent/tools/duckduckgo_search_tool.py +0 -50
- langroid/agent/tools/file_tools.py +0 -234
- langroid/agent/tools/google_search_tool.py +0 -39
- langroid/agent/tools/metaphor_search_tool.py +0 -67
- langroid/agent/tools/orchestration.py +0 -303
- langroid/agent/tools/recipient_tool.py +0 -235
- langroid/agent/tools/retrieval_tool.py +0 -32
- langroid/agent/tools/rewind_tool.py +0 -137
- langroid/agent/tools/segment_extract_tool.py +0 -41
- langroid/agent/typed_task.py +0 -19
- langroid/agent/xml_tool_message.py +0 -382
- langroid/agent_config.py +0 -0
- langroid/cachedb/__init__.py +0 -17
- langroid/cachedb/base.py +0 -58
- langroid/cachedb/momento_cachedb.py +0 -108
- langroid/cachedb/redis_cachedb.py +0 -153
- langroid/embedding_models/__init__.py +0 -39
- langroid/embedding_models/base.py +0 -74
- langroid/embedding_models/clustering.py +0 -189
- langroid/embedding_models/models.py +0 -461
- langroid/embedding_models/protoc/__init__.py +0 -0
- langroid/embedding_models/protoc/embeddings.proto +0 -19
- langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
- langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
- langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
- langroid/embedding_models/remote_embeds.py +0 -153
- langroid/exceptions.py +0 -65
- langroid/experimental/team-save.py +0 -391
- langroid/language_models/.chainlit/config.toml +0 -121
- langroid/language_models/.chainlit/translations/en-US.json +0 -231
- langroid/language_models/__init__.py +0 -53
- langroid/language_models/azure_openai.py +0 -153
- langroid/language_models/base.py +0 -678
- langroid/language_models/config.py +0 -18
- langroid/language_models/mock_lm.py +0 -124
- langroid/language_models/openai_gpt.py +0 -1923
- langroid/language_models/prompt_formatter/__init__.py +0 -16
- langroid/language_models/prompt_formatter/base.py +0 -40
- langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
- langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
- langroid/language_models/utils.py +0 -147
- langroid/mytypes.py +0 -84
- langroid/parsing/__init__.py +0 -52
- langroid/parsing/agent_chats.py +0 -38
- langroid/parsing/code-parsing.md +0 -86
- langroid/parsing/code_parser.py +0 -121
- langroid/parsing/config.py +0 -0
- langroid/parsing/document_parser.py +0 -718
- langroid/parsing/image_text.py +0 -32
- langroid/parsing/para_sentence_split.py +0 -62
- langroid/parsing/parse_json.py +0 -155
- langroid/parsing/parser.py +0 -313
- langroid/parsing/repo_loader.py +0 -790
- langroid/parsing/routing.py +0 -36
- langroid/parsing/search.py +0 -275
- langroid/parsing/spider.py +0 -102
- langroid/parsing/table_loader.py +0 -94
- langroid/parsing/url_loader.py +0 -111
- langroid/parsing/url_loader_cookies.py +0 -73
- langroid/parsing/urls.py +0 -273
- langroid/parsing/utils.py +0 -373
- langroid/parsing/web_search.py +0 -155
- langroid/prompts/__init__.py +0 -9
- langroid/prompts/chat-gpt4-system-prompt.md +0 -68
- langroid/prompts/dialog.py +0 -17
- langroid/prompts/prompts_config.py +0 -5
- langroid/prompts/templates.py +0 -141
- langroid/pydantic_v1/__init__.py +0 -10
- langroid/pydantic_v1/main.py +0 -4
- langroid/utils/.chainlit/config.toml +0 -121
- langroid/utils/.chainlit/translations/en-US.json +0 -231
- langroid/utils/__init__.py +0 -19
- langroid/utils/algorithms/__init__.py +0 -3
- langroid/utils/algorithms/graph.py +0 -103
- langroid/utils/configuration.py +0 -98
- langroid/utils/constants.py +0 -30
- langroid/utils/docker.py +0 -37
- langroid/utils/git_utils.py +0 -252
- langroid/utils/globals.py +0 -49
- langroid/utils/llms/__init__.py +0 -0
- langroid/utils/llms/strings.py +0 -8
- langroid/utils/logging.py +0 -135
- langroid/utils/object_registry.py +0 -66
- langroid/utils/output/__init__.py +0 -20
- langroid/utils/output/citations.py +0 -41
- langroid/utils/output/printing.py +0 -99
- langroid/utils/output/status.py +0 -40
- langroid/utils/pandas_utils.py +0 -30
- langroid/utils/pydantic_utils.py +0 -602
- langroid/utils/system.py +0 -286
- langroid/utils/types.py +0 -93
- langroid/utils/web/__init__.py +0 -0
- langroid/utils/web/login.py +0 -83
- langroid/vector_store/__init__.py +0 -50
- langroid/vector_store/base.py +0 -357
- langroid/vector_store/chromadb.py +0 -214
- langroid/vector_store/lancedb.py +0 -401
- langroid/vector_store/meilisearch.py +0 -299
- langroid/vector_store/momento.py +0 -278
- langroid/vector_store/qdrant_cloud.py +0 -6
- langroid/vector_store/qdrantdb.py +0 -468
- langroid-0.31.2.dist-info/RECORD +0 -162
- {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py
@@ -1,1466 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
Agent that supports asking queries about a set of documents, using
|
3
|
-
retrieval-augmented generation (RAG).
|
4
|
-
|
5
|
-
Functionality includes:
|
6
|
-
- summarizing a document, with a custom instruction; see `summarize_docs`
|
7
|
-
- asking a question about a document; see `answer_from_docs`
|
8
|
-
|
9
|
-
Note: to use the sentence-transformer embeddings, you must install
|
10
|
-
langroid with the [hf-embeddings] extra, e.g.:
|
11
|
-
|
12
|
-
pip install "langroid[hf-embeddings]"
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
16
|
-
import logging
|
17
|
-
from collections import OrderedDict
|
18
|
-
from functools import cache
|
19
|
-
from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
|
20
|
-
|
21
|
-
import nest_asyncio
|
22
|
-
import numpy as np
|
23
|
-
import pandas as pd
|
24
|
-
from rich.prompt import Prompt
|
25
|
-
|
26
|
-
from langroid.agent.batch import run_batch_tasks
|
27
|
-
from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
|
28
|
-
from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
|
29
|
-
from langroid.agent.special.relevance_extractor_agent import (
|
30
|
-
RelevanceExtractorAgent,
|
31
|
-
RelevanceExtractorAgentConfig,
|
32
|
-
)
|
33
|
-
from langroid.agent.task import Task
|
34
|
-
from langroid.agent.tools.retrieval_tool import RetrievalTool
|
35
|
-
from langroid.embedding_models.models import (
|
36
|
-
OpenAIEmbeddingsConfig,
|
37
|
-
SentenceTransformerEmbeddingsConfig,
|
38
|
-
)
|
39
|
-
from langroid.language_models.base import StreamingIfAllowed
|
40
|
-
from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
|
41
|
-
from langroid.mytypes import DocMetaData, Document, Entity
|
42
|
-
from langroid.parsing.document_parser import DocumentType
|
43
|
-
from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
|
44
|
-
from langroid.parsing.repo_loader import RepoLoader
|
45
|
-
from langroid.parsing.search import (
|
46
|
-
find_closest_matches_with_bm25,
|
47
|
-
find_fuzzy_matches_in_docs,
|
48
|
-
preprocess_text,
|
49
|
-
)
|
50
|
-
from langroid.parsing.table_loader import describe_dataframe
|
51
|
-
from langroid.parsing.url_loader import URLLoader
|
52
|
-
from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
|
53
|
-
from langroid.prompts.prompts_config import PromptsConfig
|
54
|
-
from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
|
55
|
-
from langroid.utils.constants import NO_ANSWER
|
56
|
-
from langroid.utils.object_registry import ObjectRegistry
|
57
|
-
from langroid.utils.output import show_if_debug, status
|
58
|
-
from langroid.utils.output.citations import (
|
59
|
-
extract_markdown_references,
|
60
|
-
format_footnote_text,
|
61
|
-
)
|
62
|
-
from langroid.utils.pydantic_utils import dataframe_to_documents, extract_fields
|
63
|
-
from langroid.vector_store.base import VectorStore, VectorStoreConfig
|
64
|
-
from langroid.vector_store.qdrantdb import QdrantDBConfig
|
65
|
-
|
66
|
-
|
67
|
-
@cache
|
68
|
-
def apply_nest_asyncio() -> None:
|
69
|
-
nest_asyncio.apply()
|
70
|
-
|
71
|
-
|
72
|
-
logger = logging.getLogger(__name__)
|
73
|
-
|
74
|
-
DEFAULT_DOC_CHAT_INSTRUCTIONS = """
|
75
|
-
Your task is to answer questions about various documents.
|
76
|
-
You will be given various passages from these documents, and asked to answer questions
|
77
|
-
about them, or summarize them into coherent answers.
|
78
|
-
"""
|
79
|
-
|
80
|
-
DEFAULT_DOC_CHAT_SYSTEM_MESSAGE = """
|
81
|
-
You are a helpful assistant, helping me understand a collection of documents.
|
82
|
-
"""
|
83
|
-
|
84
|
-
has_sentence_transformers = False
|
85
|
-
try:
|
86
|
-
from sentence_transformers import SentenceTransformer # noqa: F401
|
87
|
-
|
88
|
-
has_sentence_transformers = True
|
89
|
-
except ImportError:
|
90
|
-
pass
|
91
|
-
|
92
|
-
|
93
|
-
hf_embed_config = SentenceTransformerEmbeddingsConfig(
|
94
|
-
model_type="sentence-transformer",
|
95
|
-
model_name="BAAI/bge-large-en-v1.5",
|
96
|
-
)
|
97
|
-
|
98
|
-
oai_embed_config = OpenAIEmbeddingsConfig(
|
99
|
-
model_type="openai",
|
100
|
-
model_name="text-embedding-ada-002",
|
101
|
-
dims=1536,
|
102
|
-
)
|
103
|
-
|
104
|
-
|
105
|
-
class DocChatAgentConfig(ChatAgentConfig):
|
106
|
-
system_message: str = DEFAULT_DOC_CHAT_SYSTEM_MESSAGE
|
107
|
-
user_message: str = DEFAULT_DOC_CHAT_INSTRUCTIONS
|
108
|
-
summarize_prompt: str = SUMMARY_ANSWER_PROMPT_GPT4
|
109
|
-
# extra fields to include in content as key=value pairs
|
110
|
-
# (helps retrieval for table-like data)
|
111
|
-
add_fields_to_content: List[str] = []
|
112
|
-
filter_fields: List[str] = [] # fields usable in filter
|
113
|
-
retrieve_only: bool = False # only retr relevant extracts, don't gen summary answer
|
114
|
-
extraction_granularity: int = 1 # granularity (in sentences) for relev extraction
|
115
|
-
filter: str | None = (
|
116
|
-
None # filter condition for various lexical/semantic search fns
|
117
|
-
)
|
118
|
-
conversation_mode: bool = True # accumulate message history?
|
119
|
-
# In assistant mode, DocChatAgent receives questions from another Agent,
|
120
|
-
# and those will already be in stand-alone form, so in this mode
|
121
|
-
# there is no need to convert them to stand-alone form.
|
122
|
-
assistant_mode: bool = False
|
123
|
-
# Use LLM to generate hypothetical answer A to the query Q,
|
124
|
-
# and use the embed(A) to find similar chunks in vecdb.
|
125
|
-
# Referred to as HyDE in the paper:
|
126
|
-
# https://arxiv.org/pdf/2212.10496.pdf
|
127
|
-
# It is False by default; its benefits depends on the context.
|
128
|
-
hypothetical_answer: bool = False
|
129
|
-
n_query_rephrases: int = 0
|
130
|
-
n_neighbor_chunks: int = 0 # how many neighbors on either side of match to retrieve
|
131
|
-
n_fuzzy_neighbor_words: int = 100 # num neighbor words to retrieve for fuzzy match
|
132
|
-
use_fuzzy_match: bool = True
|
133
|
-
use_bm25_search: bool = True
|
134
|
-
use_reciprocal_rank_fusion: bool = True # ignored if using cross-encoder reranking
|
135
|
-
cross_encoder_reranking_model: str = (
|
136
|
-
"cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
|
137
|
-
)
|
138
|
-
rerank_diversity: bool = True # rerank to maximize diversity?
|
139
|
-
rerank_periphery: bool = True # rerank to avoid Lost In the Middle effect?
|
140
|
-
rerank_after_adding_context: bool = True # rerank after adding context window?
|
141
|
-
# RRF (Reciprocal Rank Fusion) score = 1/(rank + reciprocal_rank_fusion_constant)
|
142
|
-
# see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works
|
143
|
-
reciprocal_rank_fusion_constant: float = 60.0
|
144
|
-
cache: bool = True # cache results
|
145
|
-
debug: bool = False
|
146
|
-
stream: bool = True # allow streaming where needed
|
147
|
-
split: bool = True # use chunking
|
148
|
-
relevance_extractor_config: None | RelevanceExtractorAgentConfig = (
|
149
|
-
RelevanceExtractorAgentConfig(
|
150
|
-
llm=None # use the parent's llm unless explicitly set here
|
151
|
-
)
|
152
|
-
)
|
153
|
-
doc_paths: List[str | bytes] = []
|
154
|
-
default_paths: List[str] = [
|
155
|
-
"https://news.ycombinator.com/item?id=35629033",
|
156
|
-
"https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
|
157
|
-
"https://www.wired.com/1995/04/maes/",
|
158
|
-
"https://cthiriet.com/articles/scaling-laws",
|
159
|
-
"https://www.jasonwei.net/blog/emergence",
|
160
|
-
"https://www.quantamagazine.org/the-unpredictable-abilities-emerging-from-large-ai-models-20230316/",
|
161
|
-
"https://ai.googleblog.com/2022/11/characterizing-emergent-phenomena-in.html",
|
162
|
-
]
|
163
|
-
parsing: ParsingConfig = ParsingConfig( # modify as needed
|
164
|
-
splitter=Splitter.TOKENS,
|
165
|
-
chunk_size=1000, # aim for this many tokens per chunk
|
166
|
-
overlap=100, # overlap between chunks
|
167
|
-
max_chunks=10_000,
|
168
|
-
# aim to have at least this many chars per chunk when
|
169
|
-
# truncating due to punctuation
|
170
|
-
min_chunk_chars=200,
|
171
|
-
discard_chunk_chars=5, # discard chunks with fewer than this many chars
|
172
|
-
n_similar_docs=3,
|
173
|
-
n_neighbor_ids=0, # num chunk IDs to store on either side of each chunk
|
174
|
-
pdf=PdfParsingConfig(
|
175
|
-
# NOTE: PDF parsing is extremely challenging, and each library
|
176
|
-
# has its own strengths and weaknesses.
|
177
|
-
# Try one that works for your use case.
|
178
|
-
# or "unstructured", "pdfplumber", "fitz", "pypdf"
|
179
|
-
library="pdfplumber",
|
180
|
-
),
|
181
|
-
)
|
182
|
-
|
183
|
-
# Allow vecdb to be None in case we want to explicitly set it later
|
184
|
-
vecdb: Optional[VectorStoreConfig] = QdrantDBConfig(
|
185
|
-
collection_name="doc-chat-qdrantdb",
|
186
|
-
replace_collection=True,
|
187
|
-
storage_path=".qdrantdb/data/",
|
188
|
-
embedding=hf_embed_config if has_sentence_transformers else oai_embed_config,
|
189
|
-
)
|
190
|
-
|
191
|
-
llm: OpenAIGPTConfig = OpenAIGPTConfig(
|
192
|
-
type="openai",
|
193
|
-
chat_model=OpenAIChatModel.GPT4,
|
194
|
-
completion_model=OpenAIChatModel.GPT4,
|
195
|
-
timeout=40,
|
196
|
-
)
|
197
|
-
prompts: PromptsConfig = PromptsConfig(
|
198
|
-
max_tokens=1000,
|
199
|
-
)
|
200
|
-
|
201
|
-
|
202
|
-
class DocChatAgent(ChatAgent):
|
203
|
-
"""
|
204
|
-
Agent for chatting with a collection of documents.
|
205
|
-
"""
|
206
|
-
|
207
|
-
def __init__(
|
208
|
-
self,
|
209
|
-
config: DocChatAgentConfig,
|
210
|
-
):
|
211
|
-
super().__init__(config)
|
212
|
-
self.config: DocChatAgentConfig = config
|
213
|
-
self.original_docs: List[Document] = []
|
214
|
-
self.original_docs_length = 0
|
215
|
-
self.from_dataframe = False
|
216
|
-
self.df_description = ""
|
217
|
-
self.chunked_docs: List[Document] = []
|
218
|
-
self.chunked_docs_clean: List[Document] = []
|
219
|
-
self.response: None | Document = None
|
220
|
-
if len(config.doc_paths) > 0:
|
221
|
-
self.ingest()
|
222
|
-
|
223
|
-
def clear(self) -> None:
|
224
|
-
"""Clear the document collection and the specific collection in vecdb"""
|
225
|
-
self.original_docs = []
|
226
|
-
self.original_docs_length = 0
|
227
|
-
self.chunked_docs = []
|
228
|
-
self.chunked_docs_clean = []
|
229
|
-
if self.vecdb is None:
|
230
|
-
logger.warning("Attempting to clear VecDB, but VecDB not set.")
|
231
|
-
return
|
232
|
-
collection_name = self.vecdb.config.collection_name
|
233
|
-
if collection_name is None:
|
234
|
-
return
|
235
|
-
try:
|
236
|
-
# Note we may have used a vecdb with a config.collection_name
|
237
|
-
# different from the agent's config.vecdb.collection_name!!
|
238
|
-
self.vecdb.delete_collection(collection_name)
|
239
|
-
self.vecdb = VectorStore.create(self.vecdb.config)
|
240
|
-
except Exception as e:
|
241
|
-
logger.warning(
|
242
|
-
f"""
|
243
|
-
Error while deleting collection {collection_name}:
|
244
|
-
{e}
|
245
|
-
"""
|
246
|
-
)
|
247
|
-
|
248
|
-
def ingest(self) -> None:
|
249
|
-
"""
|
250
|
-
Chunk + embed + store docs specified by self.config.doc_paths
|
251
|
-
"""
|
252
|
-
if len(self.config.doc_paths) == 0:
|
253
|
-
# we must be using a previously defined collection
|
254
|
-
# But let's get all the chunked docs so we can
|
255
|
-
# do keyword and other non-vector searches
|
256
|
-
if self.vecdb is None:
|
257
|
-
raise ValueError("VecDB not set")
|
258
|
-
self.setup_documents(filter=self.config.filter)
|
259
|
-
return
|
260
|
-
self.ingest_doc_paths(self.config.doc_paths) # type: ignore
|
261
|
-
|
262
|
-
def ingest_doc_paths(
|
263
|
-
self,
|
264
|
-
paths: str | bytes | List[str | bytes],
|
265
|
-
metadata: (
|
266
|
-
List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
|
267
|
-
) = [],
|
268
|
-
doc_type: str | DocumentType | None = None,
|
269
|
-
) -> List[Document]:
|
270
|
-
"""Split, ingest docs from specified paths,
|
271
|
-
do not add these to config.doc_paths.
|
272
|
-
|
273
|
-
Args:
|
274
|
-
paths: document paths, urls or byte-content of docs.
|
275
|
-
The bytes option is intended to support cases where a document
|
276
|
-
has already been read in as bytes (e.g. from an API or a database),
|
277
|
-
and we want to avoid having to write it to a temporary file
|
278
|
-
just to read it back in.
|
279
|
-
metadata: List of metadata dicts, one for each path.
|
280
|
-
If a single dict is passed in, it is used for all paths.
|
281
|
-
doc_type: DocumentType to use for parsing, if known.
|
282
|
-
MUST apply to all docs if specified.
|
283
|
-
This is especially useful when the `paths` are of bytes type,
|
284
|
-
to help with document type detection.
|
285
|
-
Returns:
|
286
|
-
List of Document objects
|
287
|
-
"""
|
288
|
-
if isinstance(paths, str) or isinstance(paths, bytes):
|
289
|
-
paths = [paths]
|
290
|
-
all_paths = paths
|
291
|
-
paths_meta: Dict[int, Any] = {}
|
292
|
-
urls_meta: Dict[int, Any] = {}
|
293
|
-
idxs = range(len(all_paths))
|
294
|
-
url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
|
295
|
-
urls = [all_paths[i] for i in url_idxs]
|
296
|
-
paths = [all_paths[i] for i in path_idxs]
|
297
|
-
bytes_list = [all_paths[i] for i in bytes_idxs]
|
298
|
-
path_idxs.extend(bytes_idxs)
|
299
|
-
paths.extend(bytes_list)
|
300
|
-
if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
|
301
|
-
metadata, list
|
302
|
-
):
|
303
|
-
if isinstance(metadata, list):
|
304
|
-
idx2meta = {
|
305
|
-
p: (
|
306
|
-
m
|
307
|
-
if isinstance(m, dict)
|
308
|
-
else (isinstance(m, DocMetaData) and m.dict())
|
309
|
-
) # appease mypy
|
310
|
-
for p, m in zip(idxs, metadata)
|
311
|
-
}
|
312
|
-
elif isinstance(metadata, dict):
|
313
|
-
idx2meta = {p: metadata for p in idxs}
|
314
|
-
else:
|
315
|
-
idx2meta = {p: metadata.dict() for p in idxs}
|
316
|
-
urls_meta = {u: idx2meta[u] for u in url_idxs}
|
317
|
-
paths_meta = {p: idx2meta[p] for p in path_idxs}
|
318
|
-
docs: List[Document] = []
|
319
|
-
parser = Parser(self.config.parsing)
|
320
|
-
if len(urls) > 0:
|
321
|
-
for ui in url_idxs:
|
322
|
-
meta = urls_meta.get(ui, {})
|
323
|
-
loader = URLLoader(urls=[all_paths[ui]], parser=parser) # type: ignore
|
324
|
-
url_docs = loader.load()
|
325
|
-
# update metadata of each doc with meta
|
326
|
-
for d in url_docs:
|
327
|
-
d.metadata = d.metadata.copy(update=meta)
|
328
|
-
docs.extend(url_docs)
|
329
|
-
if len(paths) > 0: # paths OR bytes are handled similarly
|
330
|
-
for pi in path_idxs:
|
331
|
-
meta = paths_meta.get(pi, {})
|
332
|
-
p = all_paths[pi]
|
333
|
-
path_docs = RepoLoader.get_documents(
|
334
|
-
p,
|
335
|
-
parser=parser,
|
336
|
-
doc_type=doc_type,
|
337
|
-
)
|
338
|
-
# update metadata of each doc with meta
|
339
|
-
for d in path_docs:
|
340
|
-
d.metadata = d.metadata.copy(update=meta)
|
341
|
-
docs.extend(path_docs)
|
342
|
-
n_docs = len(docs)
|
343
|
-
n_splits = self.ingest_docs(docs, split=self.config.split)
|
344
|
-
if n_docs == 0:
|
345
|
-
return []
|
346
|
-
n_urls = len(urls)
|
347
|
-
n_paths = len(paths)
|
348
|
-
print(
|
349
|
-
f"""
|
350
|
-
[green]I have processed the following {n_urls} URLs
|
351
|
-
and {n_paths} docs into {n_splits} parts:
|
352
|
-
""".strip()
|
353
|
-
)
|
354
|
-
path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
|
355
|
-
print("\n".join([u for u in urls if isinstance(u, str)])) # appease mypy
|
356
|
-
print("\n".join(path_reps))
|
357
|
-
return docs
|
358
|
-
|
359
|
-
def ingest_docs(
|
360
|
-
self,
|
361
|
-
docs: List[Document],
|
362
|
-
split: bool = True,
|
363
|
-
metadata: (
|
364
|
-
List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
|
365
|
-
) = [],
|
366
|
-
) -> int:
|
367
|
-
"""
|
368
|
-
Chunk docs into pieces, map each chunk to vec-embedding, store in vec-db
|
369
|
-
|
370
|
-
Args:
|
371
|
-
docs: List of Document objects
|
372
|
-
split: Whether to split docs into chunks. Default is True.
|
373
|
-
If False, docs are treated as "chunks" and are not split.
|
374
|
-
metadata: List of metadata dicts, one for each doc, to augment
|
375
|
-
whatever metadata is already in the doc.
|
376
|
-
[ASSUME no conflicting keys between the two metadata dicts.]
|
377
|
-
If a single dict is passed in, it is used for all docs.
|
378
|
-
"""
|
379
|
-
if isinstance(metadata, list) and len(metadata) > 0:
|
380
|
-
for d, m in zip(docs, metadata):
|
381
|
-
d.metadata = d.metadata.copy(
|
382
|
-
update=m if isinstance(m, dict) else m.dict() # type: ignore
|
383
|
-
)
|
384
|
-
elif isinstance(metadata, dict):
|
385
|
-
for d in docs:
|
386
|
-
d.metadata = d.metadata.copy(update=metadata)
|
387
|
-
elif isinstance(metadata, DocMetaData):
|
388
|
-
for d in docs:
|
389
|
-
d.metadata = d.metadata.copy(update=metadata.dict())
|
390
|
-
|
391
|
-
self.original_docs.extend(docs)
|
392
|
-
if self.parser is None:
|
393
|
-
raise ValueError("Parser not set")
|
394
|
-
for d in docs:
|
395
|
-
if d.metadata.id in [None, ""]:
|
396
|
-
d.metadata.id = ObjectRegistry.new_id()
|
397
|
-
if split:
|
398
|
-
docs = self.parser.split(docs)
|
399
|
-
else:
|
400
|
-
if self.config.n_neighbor_chunks > 0:
|
401
|
-
self.parser.add_window_ids(docs)
|
402
|
-
# we're not splitting, so we mark each doc as a chunk
|
403
|
-
for d in docs:
|
404
|
-
d.metadata.is_chunk = True
|
405
|
-
if self.vecdb is None:
|
406
|
-
raise ValueError("VecDB not set")
|
407
|
-
|
408
|
-
# If any additional fields need to be added to content,
|
409
|
-
# add them as key=value pairs for all docs, before batching.
|
410
|
-
# This helps retrieval for table-like data.
|
411
|
-
# Note we need to do this at stage so that the embeddings
|
412
|
-
# are computed on the full content with these additional fields.
|
413
|
-
if len(self.config.add_fields_to_content) > 0:
|
414
|
-
fields = [
|
415
|
-
f for f in extract_fields(docs[0], self.config.add_fields_to_content)
|
416
|
-
]
|
417
|
-
if len(fields) > 0:
|
418
|
-
for d in docs:
|
419
|
-
key_vals = extract_fields(d, fields)
|
420
|
-
d.content = (
|
421
|
-
",".join(f"{k}={v}" for k, v in key_vals.items())
|
422
|
-
+ ",content="
|
423
|
-
+ d.content
|
424
|
-
)
|
425
|
-
docs = docs[: self.config.parsing.max_chunks]
|
426
|
-
# vecdb should take care of adding docs in batches;
|
427
|
-
# batching can be controlled via vecdb.config.batch_size
|
428
|
-
self.vecdb.add_documents(docs)
|
429
|
-
self.original_docs_length = self.doc_length(docs)
|
430
|
-
self.setup_documents(docs, filter=self.config.filter)
|
431
|
-
return len(docs)
|
432
|
-
|
433
|
-
def retrieval_tool(self, msg: RetrievalTool) -> str:
|
434
|
-
"""Handle the RetrievalTool message"""
|
435
|
-
self.config.retrieve_only = True
|
436
|
-
self.config.parsing.n_similar_docs = msg.num_results
|
437
|
-
content_doc = self.answer_from_docs(msg.query)
|
438
|
-
return content_doc.content
|
439
|
-
|
440
|
-
@staticmethod
|
441
|
-
def document_compatible_dataframe(
|
442
|
-
df: pd.DataFrame,
|
443
|
-
content: str = "content",
|
444
|
-
metadata: List[str] = [],
|
445
|
-
) -> Tuple[pd.DataFrame, List[str]]:
|
446
|
-
"""
|
447
|
-
Convert dataframe so it is compatible with Document class:
|
448
|
-
- has "content" column
|
449
|
-
- has an "id" column to be used as Document.metadata.id
|
450
|
-
|
451
|
-
Args:
|
452
|
-
df: dataframe to convert
|
453
|
-
content: name of content column
|
454
|
-
metadata: list of metadata column names
|
455
|
-
|
456
|
-
Returns:
|
457
|
-
Tuple[pd.DataFrame, List[str]]: dataframe, metadata
|
458
|
-
- dataframe: dataframe with "content" column and "id" column
|
459
|
-
- metadata: list of metadata column names, including "id"
|
460
|
-
"""
|
461
|
-
if content not in df.columns:
|
462
|
-
raise ValueError(
|
463
|
-
f"""
|
464
|
-
Content column {content} not in dataframe,
|
465
|
-
so we cannot ingest into the DocChatAgent.
|
466
|
-
Please specify the `content` parameter as a suitable
|
467
|
-
text-based column in the dataframe.
|
468
|
-
"""
|
469
|
-
)
|
470
|
-
if content != "content":
|
471
|
-
# rename content column to "content", leave existing column intact
|
472
|
-
df = df.rename(columns={content: "content"}, inplace=False)
|
473
|
-
|
474
|
-
actual_metadata = metadata.copy()
|
475
|
-
if "id" not in df.columns:
|
476
|
-
docs = dataframe_to_documents(df, content="content", metadata=metadata)
|
477
|
-
ids = [str(d.id()) for d in docs]
|
478
|
-
df["id"] = ids
|
479
|
-
|
480
|
-
if "id" not in actual_metadata:
|
481
|
-
actual_metadata += ["id"]
|
482
|
-
|
483
|
-
return df, actual_metadata
|
484
|
-
|
485
|
-
def ingest_dataframe(
|
486
|
-
self,
|
487
|
-
df: pd.DataFrame,
|
488
|
-
content: str = "content",
|
489
|
-
metadata: List[str] = [],
|
490
|
-
) -> int:
|
491
|
-
"""
|
492
|
-
Ingest a dataframe into vecdb.
|
493
|
-
"""
|
494
|
-
self.from_dataframe = True
|
495
|
-
self.df_description = describe_dataframe(
|
496
|
-
df, filter_fields=self.config.filter_fields, n_vals=5
|
497
|
-
)
|
498
|
-
df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
|
499
|
-
docs = dataframe_to_documents(df, content="content", metadata=metadata)
|
500
|
-
# When ingesting a dataframe we will no longer do any chunking,
|
501
|
-
# so we mark each doc as a chunk.
|
502
|
-
# TODO - revisit this since we may still want to chunk large text columns
|
503
|
-
for d in docs:
|
504
|
-
d.metadata.is_chunk = True
|
505
|
-
return self.ingest_docs(docs)
|
506
|
-
|
507
|
-
def set_filter(self, filter: str) -> None:
|
508
|
-
self.config.filter = filter
|
509
|
-
self.setup_documents(filter=filter)
|
510
|
-
|
511
|
-
def setup_documents(
|
512
|
-
self,
|
513
|
-
docs: List[Document] = [],
|
514
|
-
filter: str | None = None,
|
515
|
-
) -> None:
|
516
|
-
"""
|
517
|
-
Setup `self.chunked_docs` and `self.chunked_docs_clean`
|
518
|
-
based on possible filter.
|
519
|
-
These will be used in various non-vector-based search functions,
|
520
|
-
e.g. self.get_similar_chunks_bm25(), self.get_fuzzy_matches(), etc.
|
521
|
-
|
522
|
-
Args:
|
523
|
-
docs: List of Document objects. This is empty when we are calling this
|
524
|
-
method after initial doc ingestion.
|
525
|
-
filter: Filter condition for various lexical/semantic search fns.
|
526
|
-
"""
|
527
|
-
if filter is None and len(docs) > 0:
|
528
|
-
# no filter, so just use the docs passed in
|
529
|
-
self.chunked_docs.extend(docs)
|
530
|
-
else:
|
531
|
-
if self.vecdb is None:
|
532
|
-
raise ValueError("VecDB not set")
|
533
|
-
self.chunked_docs = self.vecdb.get_all_documents(where=filter or "")
|
534
|
-
|
535
|
-
self.chunked_docs_clean = [
|
536
|
-
Document(content=preprocess_text(d.content), metadata=d.metadata)
|
537
|
-
for d in self.chunked_docs
|
538
|
-
]
|
539
|
-
|
540
|
-
def get_field_values(self, fields: list[str]) -> Dict[str, str]:
|
541
|
-
"""Get string-listing of possible values of each field,
|
542
|
-
e.g.
|
543
|
-
{
|
544
|
-
"genre": "crime, drama, mystery, ... (10 more)",
|
545
|
-
"certificate": "R, PG-13, PG, R",
|
546
|
-
}
|
547
|
-
The field names may have "metadata." prefix, e.g. "metadata.genre".
|
548
|
-
"""
|
549
|
-
field_values: Dict[str, Set[str]] = {}
|
550
|
-
# make empty set for each field
|
551
|
-
for f in fields:
|
552
|
-
field_values[f] = set()
|
553
|
-
if self.vecdb is None:
|
554
|
-
raise ValueError("VecDB not set")
|
555
|
-
# get all documents and accumulate possible values of each field until 10
|
556
|
-
docs = self.vecdb.get_all_documents() # only works for vecdbs that support this
|
557
|
-
for d in docs:
|
558
|
-
# extract fields from d
|
559
|
-
doc_field_vals = extract_fields(d, fields)
|
560
|
-
# the `field` returned by extract_fields may contain only the last
|
561
|
-
# part of the field name, e.g. "genre" instead of "metadata.genre",
|
562
|
-
# so we use the orig_field name to fill in the values
|
563
|
-
for (field, val), orig_field in zip(doc_field_vals.items(), fields):
|
564
|
-
field_values[orig_field].add(val)
|
565
|
-
# For each field make a string showing list of possible values,
|
566
|
-
# truncate to 20 values, and if there are more, indicate how many
|
567
|
-
# more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
|
568
|
-
field_values_list = {}
|
569
|
-
for f in fields:
|
570
|
-
vals = list(field_values[f])
|
571
|
-
n = len(vals)
|
572
|
-
remaining = n - 20
|
573
|
-
vals = vals[:20]
|
574
|
-
if n > 20:
|
575
|
-
vals.append(f"(...{remaining} more)")
|
576
|
-
# make a string of the values, ensure they are strings
|
577
|
-
field_values_list[f] = ", ".join(str(v) for v in vals)
|
578
|
-
return field_values_list
|
579
|
-
|
580
|
-
def doc_length(self, docs: List[Document]) -> int:
|
581
|
-
"""
|
582
|
-
Calc token-length of a list of docs
|
583
|
-
Args:
|
584
|
-
docs: list of Document objects
|
585
|
-
Returns:
|
586
|
-
int: number of tokens
|
587
|
-
"""
|
588
|
-
if self.parser is None:
|
589
|
-
raise ValueError("Parser not set")
|
590
|
-
return self.parser.num_tokens(self.doc_string(docs))
|
591
|
-
|
592
|
-
def user_docs_ingest_dialog(self) -> None:
|
593
|
-
"""
|
594
|
-
Ask user to select doc-collection, enter filenames/urls, and ingest into vecdb.
|
595
|
-
"""
|
596
|
-
if self.vecdb is None:
|
597
|
-
raise ValueError("VecDB not set")
|
598
|
-
n_deletes = self.vecdb.clear_empty_collections()
|
599
|
-
collections = self.vecdb.list_collections()
|
600
|
-
collection_name = "NEW"
|
601
|
-
is_new_collection = False
|
602
|
-
replace_collection = False
|
603
|
-
if len(collections) > 0:
|
604
|
-
n = len(collections)
|
605
|
-
delete_str = (
|
606
|
-
f"(deleted {n_deletes} empty collections)" if n_deletes > 0 else ""
|
607
|
-
)
|
608
|
-
print(f"Found {n} collections: {delete_str}")
|
609
|
-
for i, option in enumerate(collections, start=1):
|
610
|
-
print(f"{i}. {option}")
|
611
|
-
while True:
|
612
|
-
choice = Prompt.ask(
|
613
|
-
f"Enter 1-{n} to select a collection, "
|
614
|
-
"or hit ENTER to create a NEW collection, "
|
615
|
-
"or -1 to DELETE ALL COLLECTIONS",
|
616
|
-
default="0",
|
617
|
-
)
|
618
|
-
try:
|
619
|
-
if -1 <= int(choice) <= n:
|
620
|
-
break
|
621
|
-
except Exception:
|
622
|
-
pass
|
623
|
-
|
624
|
-
if choice == "-1":
|
625
|
-
confirm = Prompt.ask(
|
626
|
-
"Are you sure you want to delete all collections?",
|
627
|
-
choices=["y", "n"],
|
628
|
-
default="n",
|
629
|
-
)
|
630
|
-
if confirm == "y":
|
631
|
-
self.vecdb.clear_all_collections(really=True)
|
632
|
-
collection_name = "NEW"
|
633
|
-
|
634
|
-
if int(choice) > 0:
|
635
|
-
collection_name = collections[int(choice) - 1]
|
636
|
-
print(f"Using collection {collection_name}")
|
637
|
-
choice = Prompt.ask(
|
638
|
-
"Would you like to replace this collection?",
|
639
|
-
choices=["y", "n"],
|
640
|
-
default="n",
|
641
|
-
)
|
642
|
-
replace_collection = choice == "y"
|
643
|
-
|
644
|
-
if collection_name == "NEW":
|
645
|
-
is_new_collection = True
|
646
|
-
collection_name = Prompt.ask(
|
647
|
-
"What would you like to name the NEW collection?",
|
648
|
-
default="doc-chat",
|
649
|
-
)
|
650
|
-
|
651
|
-
self.vecdb.set_collection(collection_name, replace=replace_collection)
|
652
|
-
|
653
|
-
default_urls_str = (
|
654
|
-
" (or leave empty for default URLs)" if is_new_collection else ""
|
655
|
-
)
|
656
|
-
print(f"[blue]Enter some URLs or file/dir paths below {default_urls_str}")
|
657
|
-
inputs = get_list_from_user()
|
658
|
-
if len(inputs) == 0:
|
659
|
-
if is_new_collection:
|
660
|
-
inputs = self.config.default_paths
|
661
|
-
self.config.doc_paths = inputs # type: ignore
|
662
|
-
self.ingest()
|
663
|
-
|
664
|
-
def llm_response(
|
665
|
-
self,
|
666
|
-
message: None | str | ChatDocument = None,
|
667
|
-
) -> Optional[ChatDocument]:
|
668
|
-
if not self.llm_can_respond(message):
|
669
|
-
return None
|
670
|
-
query_str: str | None
|
671
|
-
if isinstance(message, ChatDocument):
|
672
|
-
query_str = message.content
|
673
|
-
else:
|
674
|
-
query_str = message
|
675
|
-
if query_str is None or query_str.startswith("!"):
|
676
|
-
# direct query to LLM
|
677
|
-
query_str = query_str[1:] if query_str is not None else None
|
678
|
-
if self.llm is None:
|
679
|
-
raise ValueError("LLM not set")
|
680
|
-
response = super().llm_response(query_str)
|
681
|
-
if query_str is not None:
|
682
|
-
self.update_dialog(
|
683
|
-
query_str, "" if response is None else response.content
|
684
|
-
)
|
685
|
-
return response
|
686
|
-
if query_str == "":
|
687
|
-
return ChatDocument(
|
688
|
-
content=NO_ANSWER + " since query was empty",
|
689
|
-
metadata=ChatDocMetaData(
|
690
|
-
source="No query provided",
|
691
|
-
sender=Entity.LLM,
|
692
|
-
),
|
693
|
-
)
|
694
|
-
elif query_str == "?" and self.response is not None:
|
695
|
-
return self.justify_response()
|
696
|
-
elif (query_str.startswith(("summar", "?")) and self.response is None) or (
|
697
|
-
query_str == "??"
|
698
|
-
):
|
699
|
-
return self.summarize_docs()
|
700
|
-
else:
|
701
|
-
self.callbacks.show_start_response(entity="llm")
|
702
|
-
response = self.answer_from_docs(query_str)
|
703
|
-
# Citation details (if any) are NOT generated by LLM
|
704
|
-
# (We extract these from LLM's numerical citations),
|
705
|
-
# so render them here
|
706
|
-
self._render_llm_response(response, citation_only=True)
|
707
|
-
return ChatDocument(
|
708
|
-
content=response.content,
|
709
|
-
metadata=ChatDocMetaData(
|
710
|
-
source=response.metadata.source,
|
711
|
-
sender=Entity.LLM,
|
712
|
-
),
|
713
|
-
)
|
714
|
-
|
715
|
-
async def llm_response_async(
|
716
|
-
self,
|
717
|
-
message: None | str | ChatDocument = None,
|
718
|
-
) -> Optional[ChatDocument]:
|
719
|
-
apply_nest_asyncio()
|
720
|
-
if not self.llm_can_respond(message):
|
721
|
-
return None
|
722
|
-
query_str: str | None
|
723
|
-
if isinstance(message, ChatDocument):
|
724
|
-
query_str = message.content
|
725
|
-
else:
|
726
|
-
query_str = message
|
727
|
-
if query_str is None or query_str.startswith("!"):
|
728
|
-
# direct query to LLM
|
729
|
-
query_str = query_str[1:] if query_str is not None else None
|
730
|
-
if self.llm is None:
|
731
|
-
raise ValueError("LLM not set")
|
732
|
-
response = await super().llm_response_async(query_str)
|
733
|
-
if query_str is not None:
|
734
|
-
self.update_dialog(
|
735
|
-
query_str, "" if response is None else response.content
|
736
|
-
)
|
737
|
-
return response
|
738
|
-
if query_str == "":
|
739
|
-
return None
|
740
|
-
elif query_str == "?" and self.response is not None:
|
741
|
-
return self.justify_response()
|
742
|
-
elif (query_str.startswith(("summar", "?")) and self.response is None) or (
|
743
|
-
query_str == "??"
|
744
|
-
):
|
745
|
-
return self.summarize_docs()
|
746
|
-
else:
|
747
|
-
self.callbacks.show_start_response(entity="llm")
|
748
|
-
response = self.answer_from_docs(query_str)
|
749
|
-
self._render_llm_response(response, citation_only=True)
|
750
|
-
return ChatDocument(
|
751
|
-
content=response.content,
|
752
|
-
metadata=ChatDocMetaData(
|
753
|
-
source=response.metadata.source,
|
754
|
-
sender=Entity.LLM,
|
755
|
-
),
|
756
|
-
)
|
757
|
-
|
758
|
-
@staticmethod
|
759
|
-
def doc_string(docs: List[Document]) -> str:
|
760
|
-
"""
|
761
|
-
Generate a string representation of a list of docs.
|
762
|
-
Args:
|
763
|
-
docs: list of Document objects
|
764
|
-
Returns:
|
765
|
-
str: string representation
|
766
|
-
"""
|
767
|
-
contents = [f"Extract: {d.content}" for d in docs]
|
768
|
-
sources = [d.metadata.source for d in docs]
|
769
|
-
sources = [f"Source: {s}" if s is not None else "" for s in sources]
|
770
|
-
return "\n".join(
|
771
|
-
[
|
772
|
-
f"""
|
773
|
-
[{i+1}]
|
774
|
-
{content}
|
775
|
-
{source}
|
776
|
-
"""
|
777
|
-
for i, (content, source) in enumerate(zip(contents, sources))
|
778
|
-
]
|
779
|
-
)
|
780
|
-
|
781
|
-
def get_summary_answer(
|
782
|
-
self, question: str, passages: List[Document]
|
783
|
-
) -> ChatDocument:
|
784
|
-
"""
|
785
|
-
Given a question and a list of (possibly) doc snippets,
|
786
|
-
generate an answer if possible
|
787
|
-
Args:
|
788
|
-
question: question to answer
|
789
|
-
passages: list of `Document` objects each containing a possibly relevant
|
790
|
-
snippet, and metadata
|
791
|
-
Returns:
|
792
|
-
a `Document` object containing the answer,
|
793
|
-
and metadata containing source citations
|
794
|
-
|
795
|
-
"""
|
796
|
-
|
797
|
-
passages_str = self.doc_string(passages)
|
798
|
-
# Substitute Q and P into the templatized prompt
|
799
|
-
|
800
|
-
final_prompt = self.config.summarize_prompt.format(
|
801
|
-
question=question, extracts=passages_str
|
802
|
-
)
|
803
|
-
show_if_debug(final_prompt, "SUMMARIZE_PROMPT= ")
|
804
|
-
|
805
|
-
# Generate the final verbatim extract based on the final prompt.
|
806
|
-
# Note this will send entire message history, plus this final_prompt
|
807
|
-
# to the LLM, and self.message_history will be updated to include
|
808
|
-
# 2 new LLMMessage objects:
|
809
|
-
# one for `final_prompt`, and one for the LLM response
|
810
|
-
|
811
|
-
if self.config.conversation_mode:
|
812
|
-
# respond with temporary context
|
813
|
-
answer_doc = super()._llm_response_temp_context(question, final_prompt)
|
814
|
-
else:
|
815
|
-
answer_doc = super().llm_response_forget(final_prompt)
|
816
|
-
|
817
|
-
final_answer = answer_doc.content.strip()
|
818
|
-
show_if_debug(final_answer, "SUMMARIZE_RESPONSE= ")
|
819
|
-
|
820
|
-
citations = extract_markdown_references(final_answer)
|
821
|
-
|
822
|
-
citations_str = ""
|
823
|
-
if len(citations) > 0:
|
824
|
-
# append [i] source, content for each citation
|
825
|
-
citations_str = "\n".join(
|
826
|
-
[
|
827
|
-
f"[^{c}] {passages[c-1].metadata.source}"
|
828
|
-
f"\n{format_footnote_text(passages[c-1].content)}"
|
829
|
-
for c in citations
|
830
|
-
]
|
831
|
-
)
|
832
|
-
|
833
|
-
return ChatDocument(
|
834
|
-
content=final_answer, # does not contain citations
|
835
|
-
metadata=ChatDocMetaData(
|
836
|
-
source=citations_str, # only the citations
|
837
|
-
sender=Entity.LLM,
|
838
|
-
has_citation=len(citations) > 0,
|
839
|
-
cached=getattr(answer_doc.metadata, "cached", False),
|
840
|
-
),
|
841
|
-
)
|
842
|
-
|
843
|
-
def llm_hypothetical_answer(self, query: str) -> str:
|
844
|
-
if self.llm is None:
|
845
|
-
raise ValueError("LLM not set")
|
846
|
-
with status("[cyan]LLM generating hypothetical answer..."):
|
847
|
-
with StreamingIfAllowed(self.llm, False):
|
848
|
-
# TODO: provide an easy way to
|
849
|
-
# Adjust this prompt depending on context.
|
850
|
-
answer = self.llm_response_forget(
|
851
|
-
f"""
|
852
|
-
Give an ideal answer to the following query,
|
853
|
-
in up to 3 sentences. Do not explain yourself,
|
854
|
-
and do not apologize, just show
|
855
|
-
a good possible answer, even if you do not have any information.
|
856
|
-
Preface your answer with "HYPOTHETICAL ANSWER: "
|
857
|
-
|
858
|
-
QUERY: {query}
|
859
|
-
"""
|
860
|
-
).content
|
861
|
-
return answer
|
862
|
-
|
863
|
-
def llm_rephrase_query(self, query: str) -> List[str]:
|
864
|
-
if self.llm is None:
|
865
|
-
raise ValueError("LLM not set")
|
866
|
-
with status("[cyan]LLM generating rephrases of query..."):
|
867
|
-
with StreamingIfAllowed(self.llm, False):
|
868
|
-
rephrases = self.llm_response_forget(
|
869
|
-
f"""
|
870
|
-
Rephrase the following query in {self.config.n_query_rephrases}
|
871
|
-
different equivalent ways, separate them with 2 newlines.
|
872
|
-
QUERY: {query}
|
873
|
-
"""
|
874
|
-
).content.split("\n\n")
|
875
|
-
return rephrases
|
876
|
-
|
877
|
-
def get_similar_chunks_bm25(
|
878
|
-
self, query: str, multiple: int
|
879
|
-
) -> List[Tuple[Document, float]]:
|
880
|
-
# find similar docs using bm25 similarity:
|
881
|
-
# these may sometimes be more likely to contain a relevant verbatim extract
|
882
|
-
with status("[cyan]Searching for similar chunks using bm25..."):
|
883
|
-
if self.chunked_docs is None or len(self.chunked_docs) == 0:
|
884
|
-
logger.warning("No chunked docs; cannot use bm25-similarity")
|
885
|
-
return []
|
886
|
-
if self.chunked_docs_clean is None or len(self.chunked_docs_clean) == 0:
|
887
|
-
logger.warning("No cleaned chunked docs; cannot use bm25-similarity")
|
888
|
-
return []
|
889
|
-
docs_scores = find_closest_matches_with_bm25(
|
890
|
-
self.chunked_docs,
|
891
|
-
self.chunked_docs_clean, # already pre-processed!
|
892
|
-
query,
|
893
|
-
k=self.config.parsing.n_similar_docs * multiple,
|
894
|
-
)
|
895
|
-
return docs_scores
|
896
|
-
|
897
|
-
def get_fuzzy_matches(
|
898
|
-
self, query: str, multiple: int
|
899
|
-
) -> List[Tuple[Document, float]]:
|
900
|
-
# find similar docs using fuzzy matching:
|
901
|
-
# these may sometimes be more likely to contain a relevant verbatim extract
|
902
|
-
with status("[cyan]Finding fuzzy matches in chunks..."):
|
903
|
-
if self.chunked_docs is None:
|
904
|
-
logger.warning("No chunked docs; cannot use fuzzy matching")
|
905
|
-
return []
|
906
|
-
if self.chunked_docs_clean is None:
|
907
|
-
logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
|
908
|
-
return []
|
909
|
-
fuzzy_match_docs = find_fuzzy_matches_in_docs(
|
910
|
-
query,
|
911
|
-
self.chunked_docs,
|
912
|
-
self.chunked_docs_clean,
|
913
|
-
k=self.config.parsing.n_similar_docs * multiple,
|
914
|
-
words_before=self.config.n_fuzzy_neighbor_words or None,
|
915
|
-
words_after=self.config.n_fuzzy_neighbor_words or None,
|
916
|
-
)
|
917
|
-
return fuzzy_match_docs
|
918
|
-
|
919
|
-
def rerank_with_cross_encoder(
|
920
|
-
self, query: str, passages: List[Document]
|
921
|
-
) -> List[Document]:
|
922
|
-
with status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
|
923
|
-
try:
|
924
|
-
from sentence_transformers import CrossEncoder
|
925
|
-
except ImportError:
|
926
|
-
raise ImportError(
|
927
|
-
"""
|
928
|
-
To use cross-encoder re-ranking, you must install
|
929
|
-
langroid with the [hf-embeddings] extra, e.g.:
|
930
|
-
pip install "langroid[hf-embeddings]"
|
931
|
-
"""
|
932
|
-
)
|
933
|
-
|
934
|
-
model = CrossEncoder(self.config.cross_encoder_reranking_model)
|
935
|
-
scores = model.predict([(query, p.content) for p in passages])
|
936
|
-
# Convert to [0,1] so we might could use a cutoff later.
|
937
|
-
scores = 1.0 / (1 + np.exp(-np.array(scores)))
|
938
|
-
# get top k scoring passages
|
939
|
-
sorted_pairs = sorted(
|
940
|
-
zip(scores, passages),
|
941
|
-
key=lambda x: x[0],
|
942
|
-
reverse=True,
|
943
|
-
)
|
944
|
-
passages = [
|
945
|
-
d for _, d in sorted_pairs[: self.config.parsing.n_similar_docs]
|
946
|
-
]
|
947
|
-
return passages
|
948
|
-
|
949
|
-
def rerank_with_diversity(self, passages: List[Document]) -> List[Document]:
|
950
|
-
"""
|
951
|
-
Rerank a list of items in such a way that each successive item is least similar
|
952
|
-
(on average) to the earlier items.
|
953
|
-
|
954
|
-
Args:
|
955
|
-
query (str): The query for which the passages are relevant.
|
956
|
-
passages (List[Document]): A list of Documents to be reranked.
|
957
|
-
|
958
|
-
Returns:
|
959
|
-
List[Documents]: A reranked list of Documents.
|
960
|
-
"""
|
961
|
-
|
962
|
-
if self.vecdb is None:
|
963
|
-
logger.warning("No vecdb; cannot use rerank_with_diversity")
|
964
|
-
return passages
|
965
|
-
emb_model = self.vecdb.embedding_model
|
966
|
-
emb_fn = emb_model.embedding_fn()
|
967
|
-
embs = emb_fn([p.content for p in passages])
|
968
|
-
embs_arr = [np.array(e) for e in embs]
|
969
|
-
indices = list(range(len(passages)))
|
970
|
-
|
971
|
-
# Helper function to compute average similarity to
|
972
|
-
# items in the current result list.
|
973
|
-
def avg_similarity_to_result(i: int, result: List[int]) -> float:
|
974
|
-
return sum( # type: ignore
|
975
|
-
(embs_arr[i] @ embs_arr[j])
|
976
|
-
/ (np.linalg.norm(embs_arr[i]) * np.linalg.norm(embs_arr[j]))
|
977
|
-
for j in result
|
978
|
-
) / len(result)
|
979
|
-
|
980
|
-
# copy passages to items
|
981
|
-
result = [indices.pop(0)] # Start with the first item.
|
982
|
-
|
983
|
-
while indices:
|
984
|
-
# Find the item that has the least average similarity
|
985
|
-
# to items in the result list.
|
986
|
-
least_similar_item = min(
|
987
|
-
indices, key=lambda i: avg_similarity_to_result(i, result)
|
988
|
-
)
|
989
|
-
result.append(least_similar_item)
|
990
|
-
indices.remove(least_similar_item)
|
991
|
-
|
992
|
-
# return passages in order of result list
|
993
|
-
return [passages[i] for i in result]
|
994
|
-
|
995
|
-
def rerank_to_periphery(self, passages: List[Document]) -> List[Document]:
|
996
|
-
"""
|
997
|
-
Rerank to avoid Lost In the Middle (LIM) problem,
|
998
|
-
where LLMs pay more attention to items at the ends of a list,
|
999
|
-
rather than the middle. So we re-rank to make the best passages
|
1000
|
-
appear at the periphery of the list.
|
1001
|
-
https://arxiv.org/abs/2307.03172
|
1002
|
-
|
1003
|
-
Example reranking:
|
1004
|
-
1 2 3 4 5 6 7 8 9 ==> 1 3 5 7 9 8 6 4 2
|
1005
|
-
|
1006
|
-
Args:
|
1007
|
-
passages (List[Document]): A list of Documents to be reranked.
|
1008
|
-
|
1009
|
-
Returns:
|
1010
|
-
List[Documents]: A reranked list of Documents.
|
1011
|
-
|
1012
|
-
"""
|
1013
|
-
# Splitting items into odds and evens based on index, not value
|
1014
|
-
odds = passages[::2]
|
1015
|
-
evens = passages[1::2][::-1]
|
1016
|
-
|
1017
|
-
# Merging them back together
|
1018
|
-
return odds + evens
|
1019
|
-
|
1020
|
-
def add_context_window(
|
1021
|
-
self,
|
1022
|
-
docs_scores: List[Tuple[Document, float]],
|
1023
|
-
) -> List[Tuple[Document, float]]:
|
1024
|
-
"""
|
1025
|
-
In each doc's metadata, there may be a window_ids field indicating
|
1026
|
-
the ids of the chunks around the current chunk. We use these stored
|
1027
|
-
window_ids to retrieve the desired number
|
1028
|
-
(self.config.n_neighbor_chunks) of neighbors
|
1029
|
-
on either side of the current chunk.
|
1030
|
-
|
1031
|
-
Args:
|
1032
|
-
docs_scores (List[Tuple[Document, float]]): List of pairs of documents
|
1033
|
-
to add context windows to together with their match scores.
|
1034
|
-
|
1035
|
-
Returns:
|
1036
|
-
List[Tuple[Document, float]]: List of (Document, score) tuples.
|
1037
|
-
"""
|
1038
|
-
if self.vecdb is None or self.config.n_neighbor_chunks == 0:
|
1039
|
-
return docs_scores
|
1040
|
-
if len(docs_scores) == 0:
|
1041
|
-
return []
|
1042
|
-
if set(docs_scores[0][0].__fields__) != {"content", "metadata"}:
|
1043
|
-
# Do not add context window when there are other fields besides just
|
1044
|
-
# content and metadata, since we do not know how to set those other fields
|
1045
|
-
# for newly created docs with combined content.
|
1046
|
-
return docs_scores
|
1047
|
-
return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
|
1048
|
-
|
1049
|
-
def get_semantic_search_results(
|
1050
|
-
self,
|
1051
|
-
query: str,
|
1052
|
-
k: int = 10,
|
1053
|
-
) -> List[Tuple[Document, float]]:
|
1054
|
-
"""
|
1055
|
-
Get semantic search results from vecdb.
|
1056
|
-
Args:
|
1057
|
-
query (str): query to search for
|
1058
|
-
k (int): number of results to return
|
1059
|
-
Returns:
|
1060
|
-
List[Tuple[Document, float]]: List of (Document, score) tuples.
|
1061
|
-
"""
|
1062
|
-
if self.vecdb is None:
|
1063
|
-
raise ValueError("VecDB not set")
|
1064
|
-
# Note: for dynamic filtering based on a query, users can
|
1065
|
-
# use the `temp_update` context-manager to pass in a `filter` to self.config,
|
1066
|
-
# e.g.:
|
1067
|
-
# with temp_update(self.config, {"filter": "metadata.source=='source1'"}):
|
1068
|
-
# docs_scores = self.get_semantic_search_results(query, k=k)
|
1069
|
-
# This avoids having pass the `filter` argument to every function call
|
1070
|
-
# upstream of this one.
|
1071
|
-
# The `temp_update` context manager is defined in
|
1072
|
-
# `langroid/utils/pydantic_utils.py`
|
1073
|
-
return self.vecdb.similar_texts_with_scores(
|
1074
|
-
query,
|
1075
|
-
k=k,
|
1076
|
-
where=self.config.filter,
|
1077
|
-
)
|
1078
|
-
|
1079
|
-
def get_relevant_chunks(
|
1080
|
-
self, query: str, query_proxies: List[str] = []
|
1081
|
-
) -> List[Document]:
|
1082
|
-
"""
|
1083
|
-
The retrieval stage in RAG: get doc-chunks that are most "relevant"
|
1084
|
-
to the query (and possibly any proxy queries), from the document-store,
|
1085
|
-
which currently is the vector store,
|
1086
|
-
but in theory could be any document store, or even web-search.
|
1087
|
-
This stage does NOT involve an LLM, and the retrieved chunks
|
1088
|
-
could either be pre-chunked text (from the initial pre-processing stage
|
1089
|
-
where chunks were stored in the vector store), or they could be
|
1090
|
-
dynamically retrieved based on a window around a lexical match.
|
1091
|
-
|
1092
|
-
These are the steps (some optional based on config):
|
1093
|
-
- semantic search based on vector-embedding distance, from vecdb
|
1094
|
-
- lexical search using bm25-ranking (keyword similarity)
|
1095
|
-
- fuzzy matching (keyword similarity)
|
1096
|
-
- re-ranking of doc-chunks by relevance to query, using cross-encoder,
|
1097
|
-
and pick top k
|
1098
|
-
|
1099
|
-
Args:
|
1100
|
-
query: original query (assumed to be in stand-alone form)
|
1101
|
-
query_proxies: possible rephrases, or hypothetical answer to query
|
1102
|
-
(e.g. for HyDE-type retrieval)
|
1103
|
-
|
1104
|
-
Returns:
|
1105
|
-
|
1106
|
-
"""
|
1107
|
-
|
1108
|
-
if (
|
1109
|
-
self.vecdb is None
|
1110
|
-
or self.vecdb.config.collection_name
|
1111
|
-
not in self.vecdb.list_collections(empty=False)
|
1112
|
-
):
|
1113
|
-
return []
|
1114
|
-
|
1115
|
-
# if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
|
1116
|
-
# we can retrieve more docs during retrieval, and leave it to the cross-encoder
|
1117
|
-
# or RRF reranking to whittle down to self.config.parsing.n_similar_docs
|
1118
|
-
retrieval_multiple = (
|
1119
|
-
1
|
1120
|
-
if (
|
1121
|
-
self.config.cross_encoder_reranking_model == ""
|
1122
|
-
and not self.config.use_reciprocal_rank_fusion
|
1123
|
-
)
|
1124
|
-
else 3
|
1125
|
-
)
|
1126
|
-
|
1127
|
-
if self.vecdb is None:
|
1128
|
-
raise ValueError("VecDB not set")
|
1129
|
-
|
1130
|
-
with status("[cyan]Searching VecDB for relevant doc passages..."):
|
1131
|
-
docs_and_scores: List[Tuple[Document, float]] = []
|
1132
|
-
for q in [query] + query_proxies:
|
1133
|
-
docs_and_scores += self.get_semantic_search_results(
|
1134
|
-
q,
|
1135
|
-
k=self.config.parsing.n_similar_docs * retrieval_multiple,
|
1136
|
-
)
|
1137
|
-
# sort by score descending
|
1138
|
-
docs_and_scores = sorted(
|
1139
|
-
docs_and_scores, key=lambda x: x[1], reverse=True
|
1140
|
-
)
|
1141
|
-
|
1142
|
-
# keep only docs with unique d.id()
|
1143
|
-
id2_rank_semantic = {d.id(): i for i, (d, _) in enumerate(docs_and_scores)}
|
1144
|
-
id2doc = {d.id(): d for d, _ in docs_and_scores}
|
1145
|
-
# make sure we get unique docs
|
1146
|
-
passages = [id2doc[id] for id, _ in id2_rank_semantic.items()]
|
1147
|
-
|
1148
|
-
id2_rank_bm25 = {}
|
1149
|
-
if self.config.use_bm25_search:
|
1150
|
-
# TODO: Add score threshold in config
|
1151
|
-
docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
|
1152
|
-
if self.config.cross_encoder_reranking_model == "":
|
1153
|
-
# only if we're not re-ranking with a cross-encoder,
|
1154
|
-
# we collect these ranks for Reciprocal Rank Fusion down below.
|
1155
|
-
docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
|
1156
|
-
id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
|
1157
|
-
id2doc.update({d.id(): d for d, _ in docs_scores})
|
1158
|
-
else:
|
1159
|
-
passages += [d for (d, _) in docs_scores]
|
1160
|
-
|
1161
|
-
id2_rank_fuzzy = {}
|
1162
|
-
if self.config.use_fuzzy_match:
|
1163
|
-
# TODO: Add score threshold in config
|
1164
|
-
fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
|
1165
|
-
if self.config.cross_encoder_reranking_model == "":
|
1166
|
-
# only if we're not re-ranking with a cross-encoder,
|
1167
|
-
# we collect these ranks for Reciprocal Rank Fusion down below.
|
1168
|
-
fuzzy_match_doc_scores = sorted(
|
1169
|
-
fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
|
1170
|
-
)
|
1171
|
-
id2_rank_fuzzy = {
|
1172
|
-
d.id(): i for i, (d, _) in enumerate(fuzzy_match_doc_scores)
|
1173
|
-
}
|
1174
|
-
id2doc.update({d.id(): d for d, _ in fuzzy_match_doc_scores})
|
1175
|
-
else:
|
1176
|
-
passages += [d for (d, _) in fuzzy_match_doc_scores]
|
1177
|
-
|
1178
|
-
        if (
            self.config.cross_encoder_reranking_model == ""
            and self.config.use_reciprocal_rank_fusion
            and (self.config.use_bm25_search or self.config.use_fuzzy_match)
        ):
            # Since we're not using cross-encoder re-ranking,
            # we need to re-order the retrieved chunks from potentially three
            # different retrieval methods (semantic, bm25, fuzzy), where the
            # similarity scores are on different scales.
            # We order the retrieved chunks using Reciprocal Rank Fusion (RRF) score.
            # Combine the ranks from each id2doc_rank_* dict into a single dict,
            # where the reciprocal rank score is the sum of
            # 1/(rank + self.config.reciprocal_rank_fusion_constant).
            # See https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
            #
            # Note: diversity/periphery-reranking below may modify the final ranking.
            id2_reciprocal_score = {}
            for id_ in (
                set(id2_rank_semantic.keys())
                | set(id2_rank_bm25.keys())
                | set(id2_rank_fuzzy.keys())
            ):
                rank_semantic = id2_rank_semantic.get(id_, float("inf"))
                rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
                rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
                c = self.config.reciprocal_rank_fusion_constant
                reciprocal_fusion_score = (
                    1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
                )
                id2_reciprocal_score[id_] = reciprocal_fusion_score

            # sort the docs by the reciprocal score, in descending order
            id2_reciprocal_score = OrderedDict(
                sorted(
                    id2_reciprocal_score.items(),
                    key=lambda x: x[1],
                    reverse=True,
                )
            )
            # each method retrieved up to retrieval_multiple * n_similar_docs,
            # so we need to take the top n_similar_docs from the combined list
            passages = [
                id2doc[id]
                for i, (id, _) in enumerate(id2_reciprocal_score.items())
                if i < self.config.parsing.n_similar_docs
            ]
            # passages must have distinct ids
            assert len(passages) == len(set([d.id() for d in passages])), (
                f"Duplicate passages in retrieved docs: {len(passages)} != "
                f"{len(set([d.id() for d in passages]))}"
            )

        if len(passages) == 0:
            return []

        if self.config.rerank_after_adding_context:
            passages_scores = [(p, 0.0) for p in passages]
            passages_scores = self.add_context_window(passages_scores)
            passages = [p for p, _ in passages_scores]
        # now passages can potentially have a lot of doc chunks,
        # so we re-rank them using a cross-encoder scoring model,
        # and pick top k where k = config.parsing.n_similar_docs
        # https://www.sbert.net/examples/applications/retrieve_rerank
        if self.config.cross_encoder_reranking_model != "":
            passages = self.rerank_with_cross_encoder(query, passages)

        if self.config.rerank_diversity:
            # reorder to increase diversity among top docs
            passages = self.rerank_with_diversity(passages)

        if self.config.rerank_periphery:
            # reorder so most important docs are at periphery
            # (see Lost In the Middle issue).
            passages = self.rerank_to_periphery(passages)

        if not self.config.rerank_after_adding_context:
            passages_scores = [(p, 0.0) for p in passages]
            passages_scores = self.add_context_window(passages_scores)
            passages = [p for p, _ in passages_scores]

        return passages[: self.config.parsing.n_similar_docs]

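When no cross-encoder model is configured, the method above merges semantic, BM25, and fuzzy-match results with Reciprocal Rank Fusion: each retriever contributes 1/(rank + c) per document, and documents missing from a retriever contribute nothing (rank treated as infinity). The following is a minimal standalone sketch of that scoring step, not the library's own helper; the doc ids, ranks, and the constant c = 60 are illustrative assumptions.

from typing import Dict, List

def reciprocal_rank_fusion(
    rank_lists: List[Dict[str, int]], c: int = 60
) -> Dict[str, float]:
    """Combine per-retriever ranks (0 = best) into one RRF score per doc id.
    A doc absent from a retriever simply adds nothing to its score."""
    scores: Dict[str, float] = {}
    for ranks in rank_lists:
        for doc_id, rank in ranks.items():
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (rank + c)
    # highest fused score first
    return dict(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))

# Illustrative ranks from three retrievers (semantic, BM25, fuzzy):
semantic = {"doc1": 0, "doc2": 1, "doc3": 2}
bm25 = {"doc2": 0, "doc3": 1}
fuzzy = {"doc3": 0}
print(reciprocal_rank_fusion([semantic, bm25, fuzzy]))
# doc3: 1/62 + 1/61 + 1/60  >  doc2: 1/61 + 1/60  >  doc1: 1/60

When a cross-encoder model is configured instead, the method skips RRF entirely and defers the final ordering to rerank_with_cross_encoder, followed by the optional diversity and periphery reorderings shown above.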
    @no_type_check
    def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:
        """
        Get list of (verbatim) extracts from doc-chunks relevant to answering a query.

        These are the stages (some optional based on config):
        - use LLM to convert query to stand-alone query
        - optionally use LLM to rephrase query to use below
        - optionally use LLM to generate hypothetical answer (HyDE) to use below.
        - get_relevant_chunks(): get doc-chunks relevant to query and proxies
        - use LLM to get relevant extracts from doc-chunks

        Args:
            query (str): query to search for

        Returns:
            query (str): stand-alone version of input query
            List[Document]: list of relevant extracts

        """
        if (
            self.vecdb is None
            or self.vecdb.config.collection_name
            not in self.vecdb.list_collections(empty=False)
        ):
            return query, []

        if len(self.dialog) > 0 and not self.config.assistant_mode:
            # Regardless of whether we are in conversation mode or not,
            # for relevant doc/chunk extraction, we must convert the query
            # to a standalone query to get more relevant results.
            with status("[cyan]Converting to stand-alone query...[/cyan]"):
                with StreamingIfAllowed(self.llm, False):
                    query = self.llm.followup_to_standalone(self.dialog, query)
            print(f"[orange2]New query: {query}")

        proxies = []
        if self.config.hypothetical_answer:
            answer = self.llm_hypothetical_answer(query)
            proxies = [answer]

        if self.config.n_query_rephrases > 0:
            rephrases = self.llm_rephrase_query(query)
            proxies += rephrases

        passages = self.get_relevant_chunks(query, proxies)  # no LLM involved

        if len(passages) == 0:
            return query, []

        with status("[cyan]LLM Extracting verbatim passages..."):
            with StreamingIfAllowed(self.llm, False):
                # these are async calls, one per passage; turn off streaming
                extracts = self.get_verbatim_extracts(query, passages)
                extracts = [e for e in extracts if e.content != NO_ANSWER]

        return query, extracts

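The "proxies" built above (a hypothetical answer for HyDE, plus optional rephrasings) are searched alongside the stand-alone query, on the idea that a drafted answer often lands closer in embedding space to relevant passages than the bare question does. As a rough sketch of generating such a proxy, independent of langroid's llm_hypothetical_answer (the prompt wording, model name, and use of the openai client here are assumptions for illustration):

from openai import OpenAI  # assumes openai>=1.x and OPENAI_API_KEY set

client = OpenAI()

def hypothetical_answer(query: str, model: str = "gpt-4o-mini") -> str:
    """Draft a short, plausible (possibly wrong) answer to embed alongside the query."""
    resp = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "Write a short, plausible answer to the question."},
            {"role": "user", "content": query},
        ],
    )
    return resp.choices[0].message.content or ""

query = "What does reciprocal rank fusion do?"
proxies = [hypothetical_answer(query)]
# query and each proxy are then embedded and searched separately,
# and the results merged, as in get_relevant_chunks(query, proxies)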
    def get_verbatim_extracts(
        self,
        query: str,
        passages: List[Document],
    ) -> List[Document]:
        """
        Run RelevanceExtractorAgent in async/concurrent mode on passages,
        to extract portions relevant to answering query, from each passage.
        Args:
            query (str): query to answer
            passages (List[Document]): list of passages to extract from

        Returns:
            List[Document]: list of Documents containing extracts and metadata.
        """
        agent_cfg = self.config.relevance_extractor_config
        if agent_cfg is None:
            # no relevance extraction: simply return passages
            return passages
        if agent_cfg.llm is None:
            # Use main DocChatAgent's LLM if not provided explicitly:
            # this reduces setup burden on the user
            agent_cfg.llm = self.config.llm
        agent_cfg.query = query
        agent_cfg.segment_length = self.config.extraction_granularity
        agent_cfg.llm.stream = False  # disable streaming for concurrent calls

        agent = RelevanceExtractorAgent(agent_cfg)
        task = Task(
            agent,
            name="Relevance-Extractor",
            interactive=False,
        )

        extracts: list[str] = run_batch_tasks(
            task,
            passages,
            input_map=lambda msg: msg.content,
            output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
        )  # type: ignore

        # Caution: Retain ALL other fields in the Documents (which could be
        # other than just `content` and `metadata`), while simply replacing
        # `content` with the extracted portions
        passage_extracts = []
        for p, e in zip(passages, extracts):
            if e == NO_ANSWER or len(e) == 0:
                continue
            p_copy = p.copy()
            p_copy.content = e
            passage_extracts.append(p_copy)

        return passage_extracts

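The method above fans one extraction task out over all passages concurrently via run_batch_tasks, then drops passages whose extract came back as NO_ANSWER while keeping every other Document field intact. A simplified sketch of that fan-out/filter pattern using plain asyncio follows; the Doc dataclass, the sentinel value, and extract_fn are stand-ins (assumptions), not langroid's own types or its RelevanceExtractorAgent.

import asyncio
from dataclasses import dataclass, replace
from typing import Awaitable, Callable, List

NO_ANSWER = "NO_ANSWER"  # placeholder sentinel; langroid defines its own constant

@dataclass
class Doc:
    content: str
    metadata: dict

async def extract_all(
    docs: List[Doc],
    extract_fn: Callable[[str], Awaitable[str]],  # per-passage extractor (stand-in)
) -> List[Doc]:
    # run one extraction per passage concurrently
    extracts = await asyncio.gather(*(extract_fn(d.content) for d in docs))
    # keep only passages with a real extract, replacing content but keeping metadata
    return [
        replace(d, content=e)
        for d, e in zip(docs, extracts)
        if e and e != NO_ANSWER
    ]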
    def answer_from_docs(self, query: str) -> ChatDocument:
        """
        Answer query based on relevant docs from the VecDB

        Args:
            query (str): query to answer

        Returns:
            Document: answer
        """
        response = ChatDocument(
            content=NO_ANSWER,
            metadata=ChatDocMetaData(
                source="None",
                sender=Entity.LLM,
            ),
        )
        # query may be updated to a stand-alone version
        query, extracts = self.get_relevant_extracts(query)
        if len(extracts) == 0:
            return response
        if self.llm is None:
            raise ValueError("LLM not set")
        if self.config.retrieve_only:
            # only return extracts, skip LLM-based summary answer
            meta = dict(
                sender=Entity.LLM,
            )
            # copy metadata from first doc, unclear what to do here.
            meta.update(extracts[0].metadata)
            return ChatDocument(
                content="\n\n".join([e.content for e in extracts]),
                metadata=ChatDocMetaData(**meta),  # type: ignore
            )
        response = self.get_summary_answer(query, extracts)

        self.update_dialog(query, response.content)
        self.response = response  # save last response
        return response

    def summarize_docs(
        self,
        instruction: str = "Give a concise summary of the following text:",
    ) -> None | ChatDocument:
        """Summarize all docs"""
        if self.llm is None:
            raise ValueError("LLM not set")
        if len(self.original_docs) == 0:
            logger.warning(
                """
                No docs to summarize! Perhaps you are re-using a previously
                defined collection?
                In that case, we don't have access to the original docs.
                To create a summary, use a new collection, and specify a list of docs.
                """
            )
            return None
        full_text = "\n\n".join([d.content for d in self.original_docs])
        if self.parser is None:
            raise ValueError("No parser defined")
        tot_tokens = self.parser.num_tokens(full_text)
        MAX_INPUT_TOKENS = (
            self.llm.completion_context_length()
            - self.config.llm.max_output_tokens
            - 100
        )
        if tot_tokens > MAX_INPUT_TOKENS:
            # truncate
            full_text = self.parser.tokenizer.decode(
                self.parser.tokenizer.encode(full_text)[:MAX_INPUT_TOKENS]
            )
            logger.warning(
                f"Summarizing after truncating text to {MAX_INPUT_TOKENS} tokens"
            )
        prompt = f"""
        {instruction}

        FULL TEXT:
        {full_text}
        """.strip()
        with StreamingIfAllowed(self.llm):
            summary = ChatAgent.llm_response(self, prompt)
            return summary

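summarize_docs budgets its input as MAX_INPUT_TOKENS = context_length - max_output_tokens - 100, reserving room for the model's reply plus a small margin, and truncates the concatenated docs at the token level rather than by characters. A small sketch of the same budgeting with tiktoken; the 8192-token context window, the 1024-token output budget, and the cl100k_base encoding are assumptions for illustration.

import tiktoken

def truncate_to_budget(
    text: str,
    context_length: int = 8192,    # assumed model context window
    max_output_tokens: int = 1024,  # room reserved for the model's reply
    margin: int = 100,
) -> str:
    enc = tiktoken.get_encoding("cl100k_base")
    budget = context_length - max_output_tokens - margin
    tokens = enc.encode(text)
    if len(tokens) <= budget:
        return text
    # drop tokens from the end so the prompt plus reply fits the context window
    return enc.decode(tokens[:budget])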
    def justify_response(self) -> ChatDocument | None:
        """Show evidence for last response"""
        if self.response is None:
            print("[magenta]No response yet")
            return None
        source = self.response.metadata.source
        if len(source) > 0:
            print("[magenta]" + source)
        else:
            print("[magenta]No source found")
            return None