langroid 0.31.2__py3-none-any.whl → 0.33.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (163)
  1. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/METADATA +150 -124
  2. langroid-0.33.3.dist-info/RECORD +7 -0
  3. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info}/WHEEL +1 -1
  4. langroid-0.33.3.dist-info/entry_points.txt +4 -0
  5. pyproject.toml +317 -212
  6. langroid/__init__.py +0 -106
  7. langroid/agent/.chainlit/config.toml +0 -121
  8. langroid/agent/.chainlit/translations/bn.json +0 -231
  9. langroid/agent/.chainlit/translations/en-US.json +0 -229
  10. langroid/agent/.chainlit/translations/gu.json +0 -231
  11. langroid/agent/.chainlit/translations/he-IL.json +0 -231
  12. langroid/agent/.chainlit/translations/hi.json +0 -231
  13. langroid/agent/.chainlit/translations/kn.json +0 -231
  14. langroid/agent/.chainlit/translations/ml.json +0 -231
  15. langroid/agent/.chainlit/translations/mr.json +0 -231
  16. langroid/agent/.chainlit/translations/ta.json +0 -231
  17. langroid/agent/.chainlit/translations/te.json +0 -231
  18. langroid/agent/.chainlit/translations/zh-CN.json +0 -229
  19. langroid/agent/__init__.py +0 -41
  20. langroid/agent/base.py +0 -1981
  21. langroid/agent/batch.py +0 -398
  22. langroid/agent/callbacks/__init__.py +0 -0
  23. langroid/agent/callbacks/chainlit.py +0 -598
  24. langroid/agent/chat_agent.py +0 -1899
  25. langroid/agent/chat_document.py +0 -454
  26. langroid/agent/helpers.py +0 -0
  27. langroid/agent/junk +0 -13
  28. langroid/agent/openai_assistant.py +0 -882
  29. langroid/agent/special/__init__.py +0 -59
  30. langroid/agent/special/arangodb/__init__.py +0 -0
  31. langroid/agent/special/arangodb/arangodb_agent.py +0 -656
  32. langroid/agent/special/arangodb/system_messages.py +0 -186
  33. langroid/agent/special/arangodb/tools.py +0 -107
  34. langroid/agent/special/arangodb/utils.py +0 -36
  35. langroid/agent/special/doc_chat_agent.py +0 -1466
  36. langroid/agent/special/lance_doc_chat_agent.py +0 -262
  37. langroid/agent/special/lance_rag/__init__.py +0 -9
  38. langroid/agent/special/lance_rag/critic_agent.py +0 -198
  39. langroid/agent/special/lance_rag/lance_rag_task.py +0 -82
  40. langroid/agent/special/lance_rag/query_planner_agent.py +0 -260
  41. langroid/agent/special/lance_tools.py +0 -61
  42. langroid/agent/special/neo4j/__init__.py +0 -0
  43. langroid/agent/special/neo4j/csv_kg_chat.py +0 -174
  44. langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -433
  45. langroid/agent/special/neo4j/system_messages.py +0 -120
  46. langroid/agent/special/neo4j/tools.py +0 -32
  47. langroid/agent/special/relevance_extractor_agent.py +0 -127
  48. langroid/agent/special/retriever_agent.py +0 -56
  49. langroid/agent/special/sql/__init__.py +0 -17
  50. langroid/agent/special/sql/sql_chat_agent.py +0 -654
  51. langroid/agent/special/sql/utils/__init__.py +0 -21
  52. langroid/agent/special/sql/utils/description_extractors.py +0 -190
  53. langroid/agent/special/sql/utils/populate_metadata.py +0 -85
  54. langroid/agent/special/sql/utils/system_message.py +0 -35
  55. langroid/agent/special/sql/utils/tools.py +0 -64
  56. langroid/agent/special/table_chat_agent.py +0 -263
  57. langroid/agent/structured_message.py +0 -9
  58. langroid/agent/task.py +0 -2093
  59. langroid/agent/tool_message.py +0 -393
  60. langroid/agent/tools/__init__.py +0 -38
  61. langroid/agent/tools/duckduckgo_search_tool.py +0 -50
  62. langroid/agent/tools/file_tools.py +0 -234
  63. langroid/agent/tools/google_search_tool.py +0 -39
  64. langroid/agent/tools/metaphor_search_tool.py +0 -67
  65. langroid/agent/tools/orchestration.py +0 -303
  66. langroid/agent/tools/recipient_tool.py +0 -235
  67. langroid/agent/tools/retrieval_tool.py +0 -32
  68. langroid/agent/tools/rewind_tool.py +0 -137
  69. langroid/agent/tools/segment_extract_tool.py +0 -41
  70. langroid/agent/typed_task.py +0 -19
  71. langroid/agent/xml_tool_message.py +0 -382
  72. langroid/agent_config.py +0 -0
  73. langroid/cachedb/__init__.py +0 -17
  74. langroid/cachedb/base.py +0 -58
  75. langroid/cachedb/momento_cachedb.py +0 -108
  76. langroid/cachedb/redis_cachedb.py +0 -153
  77. langroid/embedding_models/__init__.py +0 -39
  78. langroid/embedding_models/base.py +0 -74
  79. langroid/embedding_models/clustering.py +0 -189
  80. langroid/embedding_models/models.py +0 -461
  81. langroid/embedding_models/protoc/__init__.py +0 -0
  82. langroid/embedding_models/protoc/embeddings.proto +0 -19
  83. langroid/embedding_models/protoc/embeddings_pb2.py +0 -33
  84. langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -50
  85. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -79
  86. langroid/embedding_models/remote_embeds.py +0 -153
  87. langroid/exceptions.py +0 -65
  88. langroid/experimental/team-save.py +0 -391
  89. langroid/language_models/.chainlit/config.toml +0 -121
  90. langroid/language_models/.chainlit/translations/en-US.json +0 -231
  91. langroid/language_models/__init__.py +0 -53
  92. langroid/language_models/azure_openai.py +0 -153
  93. langroid/language_models/base.py +0 -678
  94. langroid/language_models/config.py +0 -18
  95. langroid/language_models/mock_lm.py +0 -124
  96. langroid/language_models/openai_gpt.py +0 -1923
  97. langroid/language_models/prompt_formatter/__init__.py +0 -16
  98. langroid/language_models/prompt_formatter/base.py +0 -40
  99. langroid/language_models/prompt_formatter/hf_formatter.py +0 -132
  100. langroid/language_models/prompt_formatter/llama2_formatter.py +0 -75
  101. langroid/language_models/utils.py +0 -147
  102. langroid/mytypes.py +0 -84
  103. langroid/parsing/__init__.py +0 -52
  104. langroid/parsing/agent_chats.py +0 -38
  105. langroid/parsing/code-parsing.md +0 -86
  106. langroid/parsing/code_parser.py +0 -121
  107. langroid/parsing/config.py +0 -0
  108. langroid/parsing/document_parser.py +0 -718
  109. langroid/parsing/image_text.py +0 -32
  110. langroid/parsing/para_sentence_split.py +0 -62
  111. langroid/parsing/parse_json.py +0 -155
  112. langroid/parsing/parser.py +0 -313
  113. langroid/parsing/repo_loader.py +0 -790
  114. langroid/parsing/routing.py +0 -36
  115. langroid/parsing/search.py +0 -275
  116. langroid/parsing/spider.py +0 -102
  117. langroid/parsing/table_loader.py +0 -94
  118. langroid/parsing/url_loader.py +0 -111
  119. langroid/parsing/url_loader_cookies.py +0 -73
  120. langroid/parsing/urls.py +0 -273
  121. langroid/parsing/utils.py +0 -373
  122. langroid/parsing/web_search.py +0 -155
  123. langroid/prompts/__init__.py +0 -9
  124. langroid/prompts/chat-gpt4-system-prompt.md +0 -68
  125. langroid/prompts/dialog.py +0 -17
  126. langroid/prompts/prompts_config.py +0 -5
  127. langroid/prompts/templates.py +0 -141
  128. langroid/pydantic_v1/__init__.py +0 -10
  129. langroid/pydantic_v1/main.py +0 -4
  130. langroid/utils/.chainlit/config.toml +0 -121
  131. langroid/utils/.chainlit/translations/en-US.json +0 -231
  132. langroid/utils/__init__.py +0 -19
  133. langroid/utils/algorithms/__init__.py +0 -3
  134. langroid/utils/algorithms/graph.py +0 -103
  135. langroid/utils/configuration.py +0 -98
  136. langroid/utils/constants.py +0 -30
  137. langroid/utils/docker.py +0 -37
  138. langroid/utils/git_utils.py +0 -252
  139. langroid/utils/globals.py +0 -49
  140. langroid/utils/llms/__init__.py +0 -0
  141. langroid/utils/llms/strings.py +0 -8
  142. langroid/utils/logging.py +0 -135
  143. langroid/utils/object_registry.py +0 -66
  144. langroid/utils/output/__init__.py +0 -20
  145. langroid/utils/output/citations.py +0 -41
  146. langroid/utils/output/printing.py +0 -99
  147. langroid/utils/output/status.py +0 -40
  148. langroid/utils/pandas_utils.py +0 -30
  149. langroid/utils/pydantic_utils.py +0 -602
  150. langroid/utils/system.py +0 -286
  151. langroid/utils/types.py +0 -93
  152. langroid/utils/web/__init__.py +0 -0
  153. langroid/utils/web/login.py +0 -83
  154. langroid/vector_store/__init__.py +0 -50
  155. langroid/vector_store/base.py +0 -357
  156. langroid/vector_store/chromadb.py +0 -214
  157. langroid/vector_store/lancedb.py +0 -401
  158. langroid/vector_store/meilisearch.py +0 -299
  159. langroid/vector_store/momento.py +0 -278
  160. langroid/vector_store/qdrant_cloud.py +0 -6
  161. langroid/vector_store/qdrantdb.py +0 -468
  162. langroid-0.31.2.dist-info/RECORD +0 -162
  163. {langroid-0.31.2.dist-info → langroid-0.33.3.dist-info/licenses}/LICENSE +0 -0
langroid/agent/special/doc_chat_agent.py
@@ -1,1466 +0,0 @@
- """
- Agent that supports asking queries about a set of documents, using
- retrieval-augmented generation (RAG).
-
- Functionality includes:
- - summarizing a document, with a custom instruction; see `summarize_docs`
- - asking a question about a document; see `answer_from_docs`
-
- Note: to use the sentence-transformer embeddings, you must install
- langroid with the [hf-embeddings] extra, e.g.:
-
- pip install "langroid[hf-embeddings]"
-
- """
-
- import logging
- from collections import OrderedDict
- from functools import cache
- from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
-
- import nest_asyncio
- import numpy as np
- import pandas as pd
- from rich.prompt import Prompt
-
- from langroid.agent.batch import run_batch_tasks
- from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
- from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
- from langroid.agent.special.relevance_extractor_agent import (
-     RelevanceExtractorAgent,
-     RelevanceExtractorAgentConfig,
- )
- from langroid.agent.task import Task
- from langroid.agent.tools.retrieval_tool import RetrievalTool
- from langroid.embedding_models.models import (
-     OpenAIEmbeddingsConfig,
-     SentenceTransformerEmbeddingsConfig,
- )
- from langroid.language_models.base import StreamingIfAllowed
- from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
- from langroid.mytypes import DocMetaData, Document, Entity
- from langroid.parsing.document_parser import DocumentType
- from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
- from langroid.parsing.repo_loader import RepoLoader
- from langroid.parsing.search import (
-     find_closest_matches_with_bm25,
-     find_fuzzy_matches_in_docs,
-     preprocess_text,
- )
- from langroid.parsing.table_loader import describe_dataframe
- from langroid.parsing.url_loader import URLLoader
- from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
- from langroid.prompts.prompts_config import PromptsConfig
- from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
- from langroid.utils.constants import NO_ANSWER
- from langroid.utils.object_registry import ObjectRegistry
- from langroid.utils.output import show_if_debug, status
- from langroid.utils.output.citations import (
-     extract_markdown_references,
-     format_footnote_text,
- )
- from langroid.utils.pydantic_utils import dataframe_to_documents, extract_fields
- from langroid.vector_store.base import VectorStore, VectorStoreConfig
- from langroid.vector_store.qdrantdb import QdrantDBConfig
-
-
- @cache
- def apply_nest_asyncio() -> None:
-     nest_asyncio.apply()
-
-
- logger = logging.getLogger(__name__)
-
- DEFAULT_DOC_CHAT_INSTRUCTIONS = """
- Your task is to answer questions about various documents.
- You will be given various passages from these documents, and asked to answer questions
- about them, or summarize them into coherent answers.
- """
-
- DEFAULT_DOC_CHAT_SYSTEM_MESSAGE = """
- You are a helpful assistant, helping me understand a collection of documents.
- """
-
- has_sentence_transformers = False
- try:
-     from sentence_transformers import SentenceTransformer  # noqa: F401
-
-     has_sentence_transformers = True
- except ImportError:
-     pass
-
-
- hf_embed_config = SentenceTransformerEmbeddingsConfig(
-     model_type="sentence-transformer",
-     model_name="BAAI/bge-large-en-v1.5",
- )
-
- oai_embed_config = OpenAIEmbeddingsConfig(
-     model_type="openai",
-     model_name="text-embedding-ada-002",
-     dims=1536,
- )
-
-
- class DocChatAgentConfig(ChatAgentConfig):
-     system_message: str = DEFAULT_DOC_CHAT_SYSTEM_MESSAGE
-     user_message: str = DEFAULT_DOC_CHAT_INSTRUCTIONS
-     summarize_prompt: str = SUMMARY_ANSWER_PROMPT_GPT4
-     # extra fields to include in content as key=value pairs
-     # (helps retrieval for table-like data)
-     add_fields_to_content: List[str] = []
-     filter_fields: List[str] = []  # fields usable in filter
-     retrieve_only: bool = False  # only retrieve relevant extracts, no summary answer
-     extraction_granularity: int = 1  # granularity (in sentences) for relev extraction
-     filter: str | None = (
-         None  # filter condition for various lexical/semantic search fns
-     )
-     conversation_mode: bool = True  # accumulate message history?
-     # In assistant mode, DocChatAgent receives questions from another Agent,
-     # and those will already be in stand-alone form, so in this mode
-     # there is no need to convert them to stand-alone form.
-     assistant_mode: bool = False
-     # Use LLM to generate hypothetical answer A to the query Q,
-     # and use embed(A) to find similar chunks in vecdb.
-     # Referred to as HyDE in the paper:
-     # https://arxiv.org/pdf/2212.10496.pdf
-     # It is False by default; its benefits depend on the context.
-     hypothetical_answer: bool = False
-     n_query_rephrases: int = 0
-     n_neighbor_chunks: int = 0  # how many neighbors on either side of match to retrieve
-     n_fuzzy_neighbor_words: int = 100  # num neighbor words to retrieve for fuzzy match
-     use_fuzzy_match: bool = True
-     use_bm25_search: bool = True
-     use_reciprocal_rank_fusion: bool = True  # ignored if using cross-encoder reranking
-     cross_encoder_reranking_model: str = (
-         "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
-     )
-     rerank_diversity: bool = True  # rerank to maximize diversity?
-     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
-     rerank_after_adding_context: bool = True  # rerank after adding context window?
-     # RRF (Reciprocal Rank Fusion) score = 1/(rank + reciprocal_rank_fusion_constant)
-     # see https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking#how-rrf-ranking-works
-     reciprocal_rank_fusion_constant: float = 60.0
-     cache: bool = True  # cache results
-     debug: bool = False
-     stream: bool = True  # allow streaming where needed
-     split: bool = True  # use chunking
-     relevance_extractor_config: None | RelevanceExtractorAgentConfig = (
-         RelevanceExtractorAgentConfig(
-             llm=None  # use the parent's llm unless explicitly set here
-         )
-     )
-     doc_paths: List[str | bytes] = []
-     default_paths: List[str] = [
-         "https://news.ycombinator.com/item?id=35629033",
-         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
-         "https://www.wired.com/1995/04/maes/",
-         "https://cthiriet.com/articles/scaling-laws",
-         "https://www.jasonwei.net/blog/emergence",
-         "https://www.quantamagazine.org/the-unpredictable-abilities-emerging-from-large-ai-models-20230316/",
-         "https://ai.googleblog.com/2022/11/characterizing-emergent-phenomena-in.html",
-     ]
-     parsing: ParsingConfig = ParsingConfig(  # modify as needed
-         splitter=Splitter.TOKENS,
-         chunk_size=1000,  # aim for this many tokens per chunk
-         overlap=100,  # overlap between chunks
-         max_chunks=10_000,
-         # aim to have at least this many chars per chunk when
-         # truncating due to punctuation
-         min_chunk_chars=200,
-         discard_chunk_chars=5,  # discard chunks with fewer than this many chars
-         n_similar_docs=3,
-         n_neighbor_ids=0,  # num chunk IDs to store on either side of each chunk
-         pdf=PdfParsingConfig(
-             # NOTE: PDF parsing is extremely challenging, and each library
-             # has its own strengths and weaknesses.
-             # Try one that works for your use case.
-             # or "unstructured", "pdfplumber", "fitz", "pypdf"
-             library="pdfplumber",
-         ),
-     )
-
-     # Allow vecdb to be None in case we want to explicitly set it later
-     vecdb: Optional[VectorStoreConfig] = QdrantDBConfig(
-         collection_name="doc-chat-qdrantdb",
-         replace_collection=True,
-         storage_path=".qdrantdb/data/",
-         embedding=hf_embed_config if has_sentence_transformers else oai_embed_config,
-     )
-
-     llm: OpenAIGPTConfig = OpenAIGPTConfig(
-         type="openai",
-         chat_model=OpenAIChatModel.GPT4,
-         completion_model=OpenAIChatModel.GPT4,
-         timeout=40,
-     )
-     prompts: PromptsConfig = PromptsConfig(
-         max_tokens=1000,
-     )
-
-
- class DocChatAgent(ChatAgent):
-     """
-     Agent for chatting with a collection of documents.
-     """
-
-     def __init__(
-         self,
-         config: DocChatAgentConfig,
-     ):
-         super().__init__(config)
-         self.config: DocChatAgentConfig = config
-         self.original_docs: List[Document] = []
-         self.original_docs_length = 0
-         self.from_dataframe = False
-         self.df_description = ""
-         self.chunked_docs: List[Document] = []
-         self.chunked_docs_clean: List[Document] = []
-         self.response: None | Document = None
-         if len(config.doc_paths) > 0:
-             self.ingest()
-
-     def clear(self) -> None:
-         """Clear the document collection and the specific collection in vecdb"""
-         self.original_docs = []
-         self.original_docs_length = 0
-         self.chunked_docs = []
-         self.chunked_docs_clean = []
-         if self.vecdb is None:
-             logger.warning("Attempting to clear VecDB, but VecDB not set.")
-             return
-         collection_name = self.vecdb.config.collection_name
-         if collection_name is None:
-             return
-         try:
-             # Note we may have used a vecdb with a config.collection_name
-             # different from the agent's config.vecdb.collection_name!!
-             self.vecdb.delete_collection(collection_name)
-             self.vecdb = VectorStore.create(self.vecdb.config)
-         except Exception as e:
-             logger.warning(
-                 f"""
-                 Error while deleting collection {collection_name}:
-                 {e}
-                 """
-             )
-
-     def ingest(self) -> None:
-         """
-         Chunk + embed + store docs specified by self.config.doc_paths
-         """
-         if len(self.config.doc_paths) == 0:
-             # we must be using a previously defined collection
-             # But let's get all the chunked docs so we can
-             # do keyword and other non-vector searches
-             if self.vecdb is None:
-                 raise ValueError("VecDB not set")
-             self.setup_documents(filter=self.config.filter)
-             return
-         self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
-
-     def ingest_doc_paths(
-         self,
-         paths: str | bytes | List[str | bytes],
-         metadata: (
-             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
-         ) = [],
-         doc_type: str | DocumentType | None = None,
-     ) -> List[Document]:
-         """Split, ingest docs from specified paths,
-         do not add these to config.doc_paths.
-
-         Args:
-             paths: document paths, urls or byte-content of docs.
-                 The bytes option is intended to support cases where a document
-                 has already been read in as bytes (e.g. from an API or a database),
-                 and we want to avoid having to write it to a temporary file
-                 just to read it back in.
-             metadata: List of metadata dicts, one for each path.
-                 If a single dict is passed in, it is used for all paths.
-             doc_type: DocumentType to use for parsing, if known.
-                 MUST apply to all docs if specified.
-                 This is especially useful when the `paths` are of bytes type,
-                 to help with document type detection.
-         Returns:
-             List of Document objects
-         """
-         if isinstance(paths, str) or isinstance(paths, bytes):
-             paths = [paths]
-         all_paths = paths
-         paths_meta: Dict[int, Any] = {}
-         urls_meta: Dict[int, Any] = {}
-         idxs = range(len(all_paths))
-         url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
-         urls = [all_paths[i] for i in url_idxs]
-         paths = [all_paths[i] for i in path_idxs]
-         bytes_list = [all_paths[i] for i in bytes_idxs]
-         path_idxs.extend(bytes_idxs)
-         paths.extend(bytes_list)
-         if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
-             metadata, list
-         ):
-             if isinstance(metadata, list):
-                 idx2meta = {
-                     p: (
-                         m
-                         if isinstance(m, dict)
-                         else (isinstance(m, DocMetaData) and m.dict())
-                     )  # appease mypy
-                     for p, m in zip(idxs, metadata)
-                 }
-             elif isinstance(metadata, dict):
-                 idx2meta = {p: metadata for p in idxs}
-             else:
-                 idx2meta = {p: metadata.dict() for p in idxs}
-             urls_meta = {u: idx2meta[u] for u in url_idxs}
-             paths_meta = {p: idx2meta[p] for p in path_idxs}
-         docs: List[Document] = []
-         parser = Parser(self.config.parsing)
-         if len(urls) > 0:
-             for ui in url_idxs:
-                 meta = urls_meta.get(ui, {})
-                 loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
-                 url_docs = loader.load()
-                 # update metadata of each doc with meta
-                 for d in url_docs:
-                     d.metadata = d.metadata.copy(update=meta)
-                 docs.extend(url_docs)
-         if len(paths) > 0:  # paths OR bytes are handled similarly
-             for pi in path_idxs:
-                 meta = paths_meta.get(pi, {})
-                 p = all_paths[pi]
-                 path_docs = RepoLoader.get_documents(
-                     p,
-                     parser=parser,
-                     doc_type=doc_type,
-                 )
-                 # update metadata of each doc with meta
-                 for d in path_docs:
-                     d.metadata = d.metadata.copy(update=meta)
-                 docs.extend(path_docs)
-         n_docs = len(docs)
-         n_splits = self.ingest_docs(docs, split=self.config.split)
-         if n_docs == 0:
-             return []
-         n_urls = len(urls)
-         n_paths = len(paths)
-         print(
-             f"""
-         [green]I have processed the following {n_urls} URLs
-         and {n_paths} docs into {n_splits} parts:
-         """.strip()
-         )
-         path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
-         print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
-         print("\n".join(path_reps))
-         return docs
-
-     def ingest_docs(
-         self,
-         docs: List[Document],
-         split: bool = True,
-         metadata: (
-             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
-         ) = [],
-     ) -> int:
-         """
-         Chunk docs into pieces, map each chunk to vec-embedding, store in vec-db
-
-         Args:
-             docs: List of Document objects
-             split: Whether to split docs into chunks. Default is True.
-                 If False, docs are treated as "chunks" and are not split.
-             metadata: List of metadata dicts, one for each doc, to augment
-                 whatever metadata is already in the doc.
-                 [ASSUME no conflicting keys between the two metadata dicts.]
-                 If a single dict is passed in, it is used for all docs.
-         """
-         if isinstance(metadata, list) and len(metadata) > 0:
-             for d, m in zip(docs, metadata):
-                 d.metadata = d.metadata.copy(
-                     update=m if isinstance(m, dict) else m.dict()  # type: ignore
-                 )
-         elif isinstance(metadata, dict):
-             for d in docs:
-                 d.metadata = d.metadata.copy(update=metadata)
-         elif isinstance(metadata, DocMetaData):
-             for d in docs:
-                 d.metadata = d.metadata.copy(update=metadata.dict())
-
-         self.original_docs.extend(docs)
-         if self.parser is None:
-             raise ValueError("Parser not set")
-         for d in docs:
-             if d.metadata.id in [None, ""]:
-                 d.metadata.id = ObjectRegistry.new_id()
-         if split:
-             docs = self.parser.split(docs)
-         else:
-             if self.config.n_neighbor_chunks > 0:
-                 self.parser.add_window_ids(docs)
-             # we're not splitting, so we mark each doc as a chunk
-             for d in docs:
-                 d.metadata.is_chunk = True
-         if self.vecdb is None:
-             raise ValueError("VecDB not set")
-
-         # If any additional fields need to be added to content,
-         # add them as key=value pairs for all docs, before batching.
-         # This helps retrieval for table-like data.
-         # Note we need to do this at this stage so that the embeddings
-         # are computed on the full content with these additional fields.
-         if len(self.config.add_fields_to_content) > 0:
-             fields = [
-                 f for f in extract_fields(docs[0], self.config.add_fields_to_content)
-             ]
-             if len(fields) > 0:
-                 for d in docs:
-                     key_vals = extract_fields(d, fields)
-                     d.content = (
-                         ",".join(f"{k}={v}" for k, v in key_vals.items())
-                         + ",content="
-                         + d.content
-                     )
-         docs = docs[: self.config.parsing.max_chunks]
-         # vecdb should take care of adding docs in batches;
-         # batching can be controlled via vecdb.config.batch_size
-         self.vecdb.add_documents(docs)
-         self.original_docs_length = self.doc_length(docs)
-         self.setup_documents(docs, filter=self.config.filter)
-         return len(docs)
-
-     def retrieval_tool(self, msg: RetrievalTool) -> str:
-         """Handle the RetrievalTool message"""
-         self.config.retrieve_only = True
-         self.config.parsing.n_similar_docs = msg.num_results
-         content_doc = self.answer_from_docs(msg.query)
-         return content_doc.content
-
-     @staticmethod
-     def document_compatible_dataframe(
-         df: pd.DataFrame,
-         content: str = "content",
-         metadata: List[str] = [],
-     ) -> Tuple[pd.DataFrame, List[str]]:
-         """
-         Convert dataframe so it is compatible with Document class:
-         - has "content" column
-         - has an "id" column to be used as Document.metadata.id
-
-         Args:
-             df: dataframe to convert
-             content: name of content column
-             metadata: list of metadata column names
-
-         Returns:
-             Tuple[pd.DataFrame, List[str]]: dataframe, metadata
-                 - dataframe: dataframe with "content" column and "id" column
-                 - metadata: list of metadata column names, including "id"
-         """
-         if content not in df.columns:
-             raise ValueError(
-                 f"""
-                 Content column {content} not in dataframe,
-                 so we cannot ingest into the DocChatAgent.
-                 Please specify the `content` parameter as a suitable
-                 text-based column in the dataframe.
-                 """
-             )
-         if content != "content":
-             # rename content column to "content", leave existing column intact
-             df = df.rename(columns={content: "content"}, inplace=False)
-
-         actual_metadata = metadata.copy()
-         if "id" not in df.columns:
-             docs = dataframe_to_documents(df, content="content", metadata=metadata)
-             ids = [str(d.id()) for d in docs]
-             df["id"] = ids
-
-         if "id" not in actual_metadata:
-             actual_metadata += ["id"]
-
-         return df, actual_metadata
-
-     def ingest_dataframe(
-         self,
-         df: pd.DataFrame,
-         content: str = "content",
-         metadata: List[str] = [],
-     ) -> int:
-         """
-         Ingest a dataframe into vecdb.
-         """
-         self.from_dataframe = True
-         self.df_description = describe_dataframe(
-             df, filter_fields=self.config.filter_fields, n_vals=5
-         )
-         df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
-         docs = dataframe_to_documents(df, content="content", metadata=metadata)
-         # When ingesting a dataframe we will no longer do any chunking,
-         # so we mark each doc as a chunk.
-         # TODO - revisit this since we may still want to chunk large text columns
-         for d in docs:
-             d.metadata.is_chunk = True
-         return self.ingest_docs(docs)
-
-     def set_filter(self, filter: str) -> None:
-         self.config.filter = filter
-         self.setup_documents(filter=filter)
-
-     def setup_documents(
-         self,
-         docs: List[Document] = [],
-         filter: str | None = None,
-     ) -> None:
-         """
-         Set up `self.chunked_docs` and `self.chunked_docs_clean`
-         based on possible filter.
-         These will be used in various non-vector-based search functions,
-         e.g. self.get_similar_chunks_bm25(), self.get_fuzzy_matches(), etc.
-
-         Args:
-             docs: List of Document objects. This is empty when we are calling this
-                 method after initial doc ingestion.
-             filter: Filter condition for various lexical/semantic search fns.
-         """
-         if filter is None and len(docs) > 0:
-             # no filter, so just use the docs passed in
-             self.chunked_docs.extend(docs)
-         else:
-             if self.vecdb is None:
-                 raise ValueError("VecDB not set")
-             self.chunked_docs = self.vecdb.get_all_documents(where=filter or "")
-
-         self.chunked_docs_clean = [
-             Document(content=preprocess_text(d.content), metadata=d.metadata)
-             for d in self.chunked_docs
-         ]
-
-     def get_field_values(self, fields: list[str]) -> Dict[str, str]:
-         """Get string-listing of possible values of each field,
-         e.g.
-         {
-             "genre": "crime, drama, mystery, ... (10 more)",
-             "certificate": "R, PG-13, PG, R",
-         }
-         The field names may have "metadata." prefix, e.g. "metadata.genre".
-         """
-         field_values: Dict[str, Set[str]] = {}
-         # make empty set for each field
-         for f in fields:
-             field_values[f] = set()
-         if self.vecdb is None:
-             raise ValueError("VecDB not set")
-         # get all documents and accumulate possible values of each field
-         docs = self.vecdb.get_all_documents()  # only works for vecdbs that support this
-         for d in docs:
-             # extract fields from d
-             doc_field_vals = extract_fields(d, fields)
-             # the `field` returned by extract_fields may contain only the last
-             # part of the field name, e.g. "genre" instead of "metadata.genre",
-             # so we use the orig_field name to fill in the values
-             for (field, val), orig_field in zip(doc_field_vals.items(), fields):
-                 field_values[orig_field].add(val)
-         # For each field make a string showing list of possible values,
-         # truncate to 20 values, and if there are more, indicate how many
-         # more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
-         field_values_list = {}
-         for f in fields:
-             vals = list(field_values[f])
-             n = len(vals)
-             remaining = n - 20
-             vals = vals[:20]
-             if n > 20:
-                 vals.append(f"(...{remaining} more)")
-             # make a string of the values, ensure they are strings
-             field_values_list[f] = ", ".join(str(v) for v in vals)
-         return field_values_list
-
-     def doc_length(self, docs: List[Document]) -> int:
-         """
-         Calc token-length of a list of docs
-         Args:
-             docs: list of Document objects
-         Returns:
-             int: number of tokens
-         """
-         if self.parser is None:
-             raise ValueError("Parser not set")
-         return self.parser.num_tokens(self.doc_string(docs))
-
-     def user_docs_ingest_dialog(self) -> None:
-         """
-         Ask user to select doc-collection, enter filenames/urls, and ingest into vecdb.
-         """
-         if self.vecdb is None:
-             raise ValueError("VecDB not set")
-         n_deletes = self.vecdb.clear_empty_collections()
-         collections = self.vecdb.list_collections()
-         collection_name = "NEW"
-         is_new_collection = False
-         replace_collection = False
-         if len(collections) > 0:
-             n = len(collections)
-             delete_str = (
-                 f"(deleted {n_deletes} empty collections)" if n_deletes > 0 else ""
-             )
-             print(f"Found {n} collections: {delete_str}")
-             for i, option in enumerate(collections, start=1):
-                 print(f"{i}. {option}")
-             while True:
-                 choice = Prompt.ask(
-                     f"Enter 1-{n} to select a collection, "
-                     "or hit ENTER to create a NEW collection, "
-                     "or -1 to DELETE ALL COLLECTIONS",
-                     default="0",
-                 )
-                 try:
-                     if -1 <= int(choice) <= n:
-                         break
-                 except Exception:
-                     pass
-
-             if choice == "-1":
-                 confirm = Prompt.ask(
-                     "Are you sure you want to delete all collections?",
-                     choices=["y", "n"],
-                     default="n",
-                 )
-                 if confirm == "y":
-                     self.vecdb.clear_all_collections(really=True)
-                     collection_name = "NEW"
-
-             if int(choice) > 0:
-                 collection_name = collections[int(choice) - 1]
-                 print(f"Using collection {collection_name}")
-                 choice = Prompt.ask(
-                     "Would you like to replace this collection?",
-                     choices=["y", "n"],
-                     default="n",
-                 )
-                 replace_collection = choice == "y"
-
-         if collection_name == "NEW":
-             is_new_collection = True
-             collection_name = Prompt.ask(
-                 "What would you like to name the NEW collection?",
-                 default="doc-chat",
-             )
-
-         self.vecdb.set_collection(collection_name, replace=replace_collection)
-
-         default_urls_str = (
-             " (or leave empty for default URLs)" if is_new_collection else ""
-         )
-         print(f"[blue]Enter some URLs or file/dir paths below {default_urls_str}")
-         inputs = get_list_from_user()
-         if len(inputs) == 0:
-             if is_new_collection:
-                 inputs = self.config.default_paths
-         self.config.doc_paths = inputs  # type: ignore
-         self.ingest()
-
-     def llm_response(
-         self,
-         message: None | str | ChatDocument = None,
-     ) -> Optional[ChatDocument]:
-         if not self.llm_can_respond(message):
-             return None
-         query_str: str | None
-         if isinstance(message, ChatDocument):
-             query_str = message.content
-         else:
-             query_str = message
-         if query_str is None or query_str.startswith("!"):
-             # direct query to LLM
-             query_str = query_str[1:] if query_str is not None else None
-             if self.llm is None:
-                 raise ValueError("LLM not set")
-             response = super().llm_response(query_str)
-             if query_str is not None:
-                 self.update_dialog(
-                     query_str, "" if response is None else response.content
-                 )
-             return response
-         if query_str == "":
-             return ChatDocument(
-                 content=NO_ANSWER + " since query was empty",
-                 metadata=ChatDocMetaData(
-                     source="No query provided",
-                     sender=Entity.LLM,
-                 ),
-             )
-         elif query_str == "?" and self.response is not None:
-             return self.justify_response()
-         elif (query_str.startswith(("summar", "?")) and self.response is None) or (
-             query_str == "??"
-         ):
-             return self.summarize_docs()
-         else:
-             self.callbacks.show_start_response(entity="llm")
-             response = self.answer_from_docs(query_str)
-             # Citation details (if any) are NOT generated by LLM
-             # (We extract these from LLM's numerical citations),
-             # so render them here
-             self._render_llm_response(response, citation_only=True)
-             return ChatDocument(
-                 content=response.content,
-                 metadata=ChatDocMetaData(
-                     source=response.metadata.source,
-                     sender=Entity.LLM,
-                 ),
-             )
-
-     async def llm_response_async(
-         self,
-         message: None | str | ChatDocument = None,
-     ) -> Optional[ChatDocument]:
-         apply_nest_asyncio()
-         if not self.llm_can_respond(message):
-             return None
-         query_str: str | None
-         if isinstance(message, ChatDocument):
-             query_str = message.content
-         else:
-             query_str = message
-         if query_str is None or query_str.startswith("!"):
-             # direct query to LLM
-             query_str = query_str[1:] if query_str is not None else None
-             if self.llm is None:
-                 raise ValueError("LLM not set")
-             response = await super().llm_response_async(query_str)
-             if query_str is not None:
-                 self.update_dialog(
-                     query_str, "" if response is None else response.content
-                 )
-             return response
-         if query_str == "":
-             return None
-         elif query_str == "?" and self.response is not None:
-             return self.justify_response()
-         elif (query_str.startswith(("summar", "?")) and self.response is None) or (
-             query_str == "??"
-         ):
-             return self.summarize_docs()
-         else:
-             self.callbacks.show_start_response(entity="llm")
-             response = self.answer_from_docs(query_str)
-             self._render_llm_response(response, citation_only=True)
-             return ChatDocument(
-                 content=response.content,
-                 metadata=ChatDocMetaData(
-                     source=response.metadata.source,
-                     sender=Entity.LLM,
-                 ),
-             )
-
-     @staticmethod
-     def doc_string(docs: List[Document]) -> str:
-         """
-         Generate a string representation of a list of docs.
-         Args:
-             docs: list of Document objects
-         Returns:
-             str: string representation
-         """
-         contents = [f"Extract: {d.content}" for d in docs]
-         sources = [d.metadata.source for d in docs]
-         sources = [f"Source: {s}" if s is not None else "" for s in sources]
-         return "\n".join(
-             [
-                 f"""
-                 [{i+1}]
-                 {content}
-                 {source}
-                 """
-                 for i, (content, source) in enumerate(zip(contents, sources))
-             ]
-         )
-
-     def get_summary_answer(
-         self, question: str, passages: List[Document]
-     ) -> ChatDocument:
-         """
-         Given a question and a list of (possibly relevant) doc snippets,
-         generate an answer if possible
-         Args:
-             question: question to answer
-             passages: list of `Document` objects each containing a possibly relevant
-                 snippet, and metadata
-         Returns:
-             a `Document` object containing the answer,
-             and metadata containing source citations
-         """
-
-         passages_str = self.doc_string(passages)
-         # Substitute Q and P into the templatized prompt
-
-         final_prompt = self.config.summarize_prompt.format(
-             question=question, extracts=passages_str
-         )
-         show_if_debug(final_prompt, "SUMMARIZE_PROMPT= ")
-
-         # Generate the final verbatim extract based on the final prompt.
-         # Note this will send entire message history, plus this final_prompt
-         # to the LLM, and self.message_history will be updated to include
-         # 2 new LLMMessage objects:
-         # one for `final_prompt`, and one for the LLM response
-
-         if self.config.conversation_mode:
-             # respond with temporary context
-             answer_doc = super()._llm_response_temp_context(question, final_prompt)
-         else:
-             answer_doc = super().llm_response_forget(final_prompt)
-
-         final_answer = answer_doc.content.strip()
-         show_if_debug(final_answer, "SUMMARIZE_RESPONSE= ")
-
-         citations = extract_markdown_references(final_answer)
-
-         citations_str = ""
-         if len(citations) > 0:
-             # append [i] source, content for each citation
-             citations_str = "\n".join(
-                 [
-                     f"[^{c}] {passages[c-1].metadata.source}"
-                     f"\n{format_footnote_text(passages[c-1].content)}"
-                     for c in citations
-                 ]
-             )
-
-         return ChatDocument(
-             content=final_answer,  # does not contain citations
-             metadata=ChatDocMetaData(
-                 source=citations_str,  # only the citations
-                 sender=Entity.LLM,
-                 has_citation=len(citations) > 0,
-                 cached=getattr(answer_doc.metadata, "cached", False),
-             ),
-         )
-
-     def llm_hypothetical_answer(self, query: str) -> str:
-         if self.llm is None:
-             raise ValueError("LLM not set")
-         with status("[cyan]LLM generating hypothetical answer..."):
-             with StreamingIfAllowed(self.llm, False):
-                 # TODO: provide an easy way to
-                 # adjust this prompt depending on context.
-                 answer = self.llm_response_forget(
-                     f"""
-                     Give an ideal answer to the following query,
-                     in up to 3 sentences. Do not explain yourself,
-                     and do not apologize, just show
-                     a good possible answer, even if you do not have any information.
-                     Preface your answer with "HYPOTHETICAL ANSWER: "
-
-                     QUERY: {query}
-                     """
-                 ).content
-         return answer
-
-     def llm_rephrase_query(self, query: str) -> List[str]:
-         if self.llm is None:
-             raise ValueError("LLM not set")
-         with status("[cyan]LLM generating rephrases of query..."):
-             with StreamingIfAllowed(self.llm, False):
-                 rephrases = self.llm_response_forget(
-                     f"""
-                     Rephrase the following query in {self.config.n_query_rephrases}
-                     different equivalent ways, separate them with 2 newlines.
-                     QUERY: {query}
-                     """
-                 ).content.split("\n\n")
-         return rephrases
-
-     def get_similar_chunks_bm25(
-         self, query: str, multiple: int
-     ) -> List[Tuple[Document, float]]:
-         # find similar docs using bm25 similarity:
-         # these may sometimes be more likely to contain a relevant verbatim extract
-         with status("[cyan]Searching for similar chunks using bm25..."):
-             if self.chunked_docs is None or len(self.chunked_docs) == 0:
-                 logger.warning("No chunked docs; cannot use bm25-similarity")
-                 return []
-             if self.chunked_docs_clean is None or len(self.chunked_docs_clean) == 0:
-                 logger.warning("No cleaned chunked docs; cannot use bm25-similarity")
-                 return []
-             docs_scores = find_closest_matches_with_bm25(
-                 self.chunked_docs,
-                 self.chunked_docs_clean,  # already pre-processed!
-                 query,
-                 k=self.config.parsing.n_similar_docs * multiple,
-             )
-         return docs_scores
-
-     def get_fuzzy_matches(
-         self, query: str, multiple: int
-     ) -> List[Tuple[Document, float]]:
-         # find similar docs using fuzzy matching:
-         # these may sometimes be more likely to contain a relevant verbatim extract
-         with status("[cyan]Finding fuzzy matches in chunks..."):
-             if self.chunked_docs is None:
-                 logger.warning("No chunked docs; cannot use fuzzy matching")
-                 return []
-             if self.chunked_docs_clean is None:
-                 logger.warning("No cleaned chunked docs; cannot use fuzzy-search")
-                 return []
-             fuzzy_match_docs = find_fuzzy_matches_in_docs(
-                 query,
-                 self.chunked_docs,
-                 self.chunked_docs_clean,
-                 k=self.config.parsing.n_similar_docs * multiple,
-                 words_before=self.config.n_fuzzy_neighbor_words or None,
-                 words_after=self.config.n_fuzzy_neighbor_words or None,
-             )
-         return fuzzy_match_docs
-
-     def rerank_with_cross_encoder(
-         self, query: str, passages: List[Document]
-     ) -> List[Document]:
-         with status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
-             try:
-                 from sentence_transformers import CrossEncoder
-             except ImportError:
-                 raise ImportError(
-                     """
-                     To use cross-encoder re-ranking, you must install
-                     langroid with the [hf-embeddings] extra, e.g.:
-                     pip install "langroid[hf-embeddings]"
-                     """
-                 )
-
-             model = CrossEncoder(self.config.cross_encoder_reranking_model)
-             scores = model.predict([(query, p.content) for p in passages])
-             # Convert to [0,1] so we could use a cutoff later.
-             scores = 1.0 / (1 + np.exp(-np.array(scores)))
-             # get top k scoring passages
-             sorted_pairs = sorted(
-                 zip(scores, passages),
-                 key=lambda x: x[0],
-                 reverse=True,
-             )
-             passages = [
-                 d for _, d in sorted_pairs[: self.config.parsing.n_similar_docs]
-             ]
-         return passages
-
-     def rerank_with_diversity(self, passages: List[Document]) -> List[Document]:
-         """
-         Rerank a list of items in such a way that each successive item is least similar
-         (on average) to the earlier items.
-
-         Args:
-             passages (List[Document]): A list of Documents to be reranked.
-
-         Returns:
-             List[Document]: A reranked list of Documents.
-         """
-
-         if self.vecdb is None:
-             logger.warning("No vecdb; cannot use rerank_with_diversity")
-             return passages
-         emb_model = self.vecdb.embedding_model
-         emb_fn = emb_model.embedding_fn()
-         embs = emb_fn([p.content for p in passages])
-         embs_arr = [np.array(e) for e in embs]
-         indices = list(range(len(passages)))
-
-         # Helper function to compute average similarity to
-         # items in the current result list.
-         def avg_similarity_to_result(i: int, result: List[int]) -> float:
-             return sum(  # type: ignore
-                 (embs_arr[i] @ embs_arr[j])
-                 / (np.linalg.norm(embs_arr[i]) * np.linalg.norm(embs_arr[j]))
-                 for j in result
-             ) / len(result)
-
-         result = [indices.pop(0)]  # Start with the first item.
-
-         while indices:
-             # Find the item that has the least average similarity
-             # to items in the result list.
-             least_similar_item = min(
-                 indices, key=lambda i: avg_similarity_to_result(i, result)
-             )
-             result.append(least_similar_item)
-             indices.remove(least_similar_item)
-
-         # return passages in order of result list
-         return [passages[i] for i in result]
-
-     def rerank_to_periphery(self, passages: List[Document]) -> List[Document]:
-         """
-         Rerank to avoid the Lost In the Middle (LIM) problem,
-         where LLMs pay more attention to items at the ends of a list
-         than to those in the middle. So we re-rank to make the best passages
-         appear at the periphery of the list.
-         https://arxiv.org/abs/2307.03172
-
-         Example reranking:
-         1 2 3 4 5 6 7 8 9 ==> 1 3 5 7 9 8 6 4 2
-
-         Args:
-             passages (List[Document]): A list of Documents to be reranked.
-
-         Returns:
-             List[Document]: A reranked list of Documents.
-         """
-         # Splitting items into odds and evens based on index, not value
-         odds = passages[::2]
-         evens = passages[1::2][::-1]
-
-         # Merging them back together
-         return odds + evens
-
-     def add_context_window(
-         self,
-         docs_scores: List[Tuple[Document, float]],
-     ) -> List[Tuple[Document, float]]:
-         """
-         In each doc's metadata, there may be a window_ids field indicating
-         the ids of the chunks around the current chunk. We use these stored
-         window_ids to retrieve the desired number
-         (self.config.n_neighbor_chunks) of neighbors
-         on either side of the current chunk.
-
-         Args:
-             docs_scores (List[Tuple[Document, float]]): List of pairs of documents
-                 to add context windows to, together with their match scores.
-
-         Returns:
-             List[Tuple[Document, float]]: List of (Document, score) tuples.
-         """
-         if self.vecdb is None or self.config.n_neighbor_chunks == 0:
-             return docs_scores
-         if len(docs_scores) == 0:
-             return []
-         if set(docs_scores[0][0].__fields__) != {"content", "metadata"}:
-             # Do not add context window when there are other fields besides just
-             # content and metadata, since we do not know how to set those other fields
-             # for newly created docs with combined content.
-             return docs_scores
-         return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
-
-     def get_semantic_search_results(
-         self,
-         query: str,
-         k: int = 10,
-     ) -> List[Tuple[Document, float]]:
-         """
-         Get semantic search results from vecdb.
-         Args:
-             query (str): query to search for
-             k (int): number of results to return
-         Returns:
-             List[Tuple[Document, float]]: List of (Document, score) tuples.
-         """
-         if self.vecdb is None:
-             raise ValueError("VecDB not set")
-         # Note: for dynamic filtering based on a query, users can
-         # use the `temp_update` context-manager to pass in a `filter` to self.config,
-         # e.g.:
-         # with temp_update(self.config, {"filter": "metadata.source=='source1'"}):
-         #     docs_scores = self.get_semantic_search_results(query, k=k)
-         # This avoids having to pass the `filter` argument to every function call
-         # upstream of this one.
-         # The `temp_update` context manager is defined in
-         # `langroid/utils/pydantic_utils.py`
-         return self.vecdb.similar_texts_with_scores(
-             query,
-             k=k,
-             where=self.config.filter,
-         )
-
-     def get_relevant_chunks(
-         self, query: str, query_proxies: List[str] = []
-     ) -> List[Document]:
-         """
-         The retrieval stage in RAG: get doc-chunks that are most "relevant"
-         to the query (and possibly any proxy queries), from the document-store,
-         which currently is the vector store,
-         but in theory could be any document store, or even web-search.
-         This stage does NOT involve an LLM, and the retrieved chunks
-         could either be pre-chunked text (from the initial pre-processing stage
-         where chunks were stored in the vector store), or they could be
-         dynamically retrieved based on a window around a lexical match.
-
-         These are the steps (some optional based on config):
-         - semantic search based on vector-embedding distance, from vecdb
-         - lexical search using bm25-ranking (keyword similarity)
-         - fuzzy matching (keyword similarity)
-         - re-ranking of doc-chunks by relevance to query, using cross-encoder,
-           and pick top k
-
-         Args:
-             query: original query (assumed to be in stand-alone form)
-             query_proxies: possible rephrases, or hypothetical answer to query
-                 (e.g. for HyDE-type retrieval)
-
-         Returns:
-             List[Document]: list of relevant doc-chunks
-         """
-
-         if (
-             self.vecdb is None
-             or self.vecdb.config.collection_name
-             not in self.vecdb.list_collections(empty=False)
-         ):
-             return []
-
-         # if we are using cross-encoder reranking or reciprocal rank fusion (RRF),
-         # we can retrieve more docs during retrieval, and leave it to the cross-encoder
-         # or RRF reranking to whittle down to self.config.parsing.n_similar_docs
-         retrieval_multiple = (
-             1
-             if (
-                 self.config.cross_encoder_reranking_model == ""
-                 and not self.config.use_reciprocal_rank_fusion
-             )
-             else 3
-         )
-
-         if self.vecdb is None:
-             raise ValueError("VecDB not set")
-
-         with status("[cyan]Searching VecDB for relevant doc passages..."):
-             docs_and_scores: List[Tuple[Document, float]] = []
-             for q in [query] + query_proxies:
-                 docs_and_scores += self.get_semantic_search_results(
-                     q,
-                     k=self.config.parsing.n_similar_docs * retrieval_multiple,
-                 )
-             # sort by score descending
-             docs_and_scores = sorted(
-                 docs_and_scores, key=lambda x: x[1], reverse=True
-             )
-
-         # keep only docs with unique d.id()
-         id2_rank_semantic = {d.id(): i for i, (d, _) in enumerate(docs_and_scores)}
-         id2doc = {d.id(): d for d, _ in docs_and_scores}
-         # make sure we get unique docs
-         passages = [id2doc[id] for id, _ in id2_rank_semantic.items()]
-
-         id2_rank_bm25 = {}
-         if self.config.use_bm25_search:
-             # TODO: Add score threshold in config
-             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
-             if self.config.cross_encoder_reranking_model == "":
-                 # only if we're not re-ranking with a cross-encoder,
-                 # we collect these ranks for Reciprocal Rank Fusion down below.
-                 docs_scores = sorted(docs_scores, key=lambda x: x[1], reverse=True)
-                 id2_rank_bm25 = {d.id(): i for i, (d, _) in enumerate(docs_scores)}
-                 id2doc.update({d.id(): d for d, _ in docs_scores})
-             else:
-                 passages += [d for (d, _) in docs_scores]
-
-         id2_rank_fuzzy = {}
-         if self.config.use_fuzzy_match:
-             # TODO: Add score threshold in config
-             fuzzy_match_doc_scores = self.get_fuzzy_matches(query, retrieval_multiple)
-             if self.config.cross_encoder_reranking_model == "":
-                 # only if we're not re-ranking with a cross-encoder,
-                 # we collect these ranks for Reciprocal Rank Fusion down below.
-                 fuzzy_match_doc_scores = sorted(
-                     fuzzy_match_doc_scores, key=lambda x: x[1], reverse=True
-                 )
-                 id2_rank_fuzzy = {
-                     d.id(): i for i, (d, _) in enumerate(fuzzy_match_doc_scores)
-                 }
-                 id2doc.update({d.id(): d for d, _ in fuzzy_match_doc_scores})
-             else:
-                 passages += [d for (d, _) in fuzzy_match_doc_scores]
-
-         if (
-             self.config.cross_encoder_reranking_model == ""
-             and self.config.use_reciprocal_rank_fusion
-             and (self.config.use_bm25_search or self.config.use_fuzzy_match)
-         ):
-             # Since we're not using cross-encoder re-ranking,
-             # we need to re-order the retrieved chunks from potentially three
-             # different retrieval methods (semantic, bm25, fuzzy), where the
-             # similarity scores are on different scales.
-             # We order the retrieved chunks using the Reciprocal Rank Fusion (RRF)
-             # score. Combine the ranks from each id2doc_rank_* dict into a single
-             # dict, where the reciprocal rank score is the sum of
-             # 1/(rank + self.config.reciprocal_rank_fusion_constant).
-             # See https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking
-             #
-             # Note: diversity/periphery-reranking below may modify the final ranking.
-             id2_reciprocal_score = {}
-             for id_ in (
-                 set(id2_rank_semantic.keys())
-                 | set(id2_rank_bm25.keys())
-                 | set(id2_rank_fuzzy.keys())
-             ):
-                 rank_semantic = id2_rank_semantic.get(id_, float("inf"))
-                 rank_bm25 = id2_rank_bm25.get(id_, float("inf"))
-                 rank_fuzzy = id2_rank_fuzzy.get(id_, float("inf"))
-                 c = self.config.reciprocal_rank_fusion_constant
-                 reciprocal_fusion_score = (
-                     1 / (rank_semantic + c) + 1 / (rank_bm25 + c) + 1 / (rank_fuzzy + c)
-                 )
-                 id2_reciprocal_score[id_] = reciprocal_fusion_score
-
-             # sort the docs by the reciprocal score, in descending order
-             id2_reciprocal_score = OrderedDict(
-                 sorted(
-                     id2_reciprocal_score.items(),
-                     key=lambda x: x[1],
-                     reverse=True,
-                 )
-             )
-             # each method retrieved up to retrieval_multiple * n_similar_docs,
-             # so we need to take the top n_similar_docs from the combined list
-             passages = [
-                 id2doc[id]
-                 for i, (id, _) in enumerate(id2_reciprocal_score.items())
-                 if i < self.config.parsing.n_similar_docs
-             ]
-             # passages must have distinct ids
-             assert len(passages) == len(set([d.id() for d in passages])), (
-                 f"Duplicate passages in retrieved docs: {len(passages)} != "
-                 f"{len(set([d.id() for d in passages]))}"
-             )
-
-         if len(passages) == 0:
-             return []
-
-         if self.config.rerank_after_adding_context:
-             passages_scores = [(p, 0.0) for p in passages]
-             passages_scores = self.add_context_window(passages_scores)
-             passages = [p for p, _ in passages_scores]
-         # now passages can potentially have a lot of doc chunks,
-         # so we re-rank them using a cross-encoder scoring model,
-         # and pick top k where k = config.parsing.n_similar_docs
-         # https://www.sbert.net/examples/applications/retrieve_rerank
-         if self.config.cross_encoder_reranking_model != "":
-             passages = self.rerank_with_cross_encoder(query, passages)
-
-         if self.config.rerank_diversity:
-             # reorder to increase diversity among top docs
-             passages = self.rerank_with_diversity(passages)
-
-         if self.config.rerank_periphery:
-             # reorder so most important docs are at periphery
-             # (see Lost In the Middle issue).
-             passages = self.rerank_to_periphery(passages)
-
-         if not self.config.rerank_after_adding_context:
-             passages_scores = [(p, 0.0) for p in passages]
-             passages_scores = self.add_context_window(passages_scores)
-             passages = [p for p, _ in passages_scores]
-
-         return passages[: self.config.parsing.n_similar_docs]
-
-     @no_type_check
-     def get_relevant_extracts(self, query: str) -> Tuple[str, List[Document]]:
-         """
-         Get list of (verbatim) extracts from doc-chunks relevant to answering a query.
-
-         These are the stages (some optional based on config):
-         - use LLM to convert query to stand-alone query
-         - optionally use LLM to rephrase query for use below
-         - optionally use LLM to generate hypothetical answer (HyDE) to use below
-         - get_relevant_chunks(): get doc-chunks relevant to query and proxies
-         - use LLM to get relevant extracts from doc-chunks
-
-         Args:
-             query (str): query to search for
-
-         Returns:
-             query (str): stand-alone version of input query
-             List[Document]: list of relevant extracts
-         """
-         if (
-             self.vecdb is None
-             or self.vecdb.config.collection_name
-             not in self.vecdb.list_collections(empty=False)
-         ):
-             return query, []
-
-         if len(self.dialog) > 0 and not self.config.assistant_mode:
-             # Regardless of whether we are in conversation mode or not,
-             # for relevant doc/chunk extraction, we must convert the query
-             # to a standalone query to get more relevant results.
-             with status("[cyan]Converting to stand-alone query...[/cyan]"):
-                 with StreamingIfAllowed(self.llm, False):
-                     query = self.llm.followup_to_standalone(self.dialog, query)
-             print(f"[orange2]New query: {query}")
-
-         proxies = []
-         if self.config.hypothetical_answer:
-             answer = self.llm_hypothetical_answer(query)
-             proxies = [answer]
-
-         if self.config.n_query_rephrases > 0:
-             rephrases = self.llm_rephrase_query(query)
-             proxies += rephrases
-
-         passages = self.get_relevant_chunks(query, proxies)  # no LLM involved
-
-         if len(passages) == 0:
-             return query, []
-
-         with status("[cyan]LLM Extracting verbatim passages..."):
-             with StreamingIfAllowed(self.llm, False):
-                 # these are async calls, one per passage; turn off streaming
-                 extracts = self.get_verbatim_extracts(query, passages)
-                 extracts = [e for e in extracts if e.content != NO_ANSWER]
-
-         return query, extracts
-
-     def get_verbatim_extracts(
-         self,
-         query: str,
-         passages: List[Document],
-     ) -> List[Document]:
-         """
-         Run RelevanceExtractorAgent in async/concurrent mode on passages,
-         to extract portions relevant to answering query, from each passage.
-         Args:
-             query (str): query to answer
-             passages (List[Document]): list of passages to extract from
-
-         Returns:
-             List[Document]: list of Documents containing extracts and metadata.
-         """
-         agent_cfg = self.config.relevance_extractor_config
-         if agent_cfg is None:
-             # no relevance extraction: simply return passages
-             return passages
-         if agent_cfg.llm is None:
-             # Use main DocChatAgent's LLM if not provided explicitly:
-             # this reduces setup burden on the user
-             agent_cfg.llm = self.config.llm
-         agent_cfg.query = query
-         agent_cfg.segment_length = self.config.extraction_granularity
-         agent_cfg.llm.stream = False  # disable streaming for concurrent calls
-
-         agent = RelevanceExtractorAgent(agent_cfg)
-         task = Task(
-             agent,
-             name="Relevance-Extractor",
-             interactive=False,
-         )
-
-         extracts: list[str] = run_batch_tasks(
-             task,
-             passages,
-             input_map=lambda msg: msg.content,
-             output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
-         )  # type: ignore
-
-         # Caution: Retain ALL other fields in the Documents (which could be
-         # other than just `content` and `metadata`), while simply replacing
-         # `content` with the extracted portions
-         passage_extracts = []
-         for p, e in zip(passages, extracts):
-             if e == NO_ANSWER or len(e) == 0:
-                 continue
-             p_copy = p.copy()
-             p_copy.content = e
-             passage_extracts.append(p_copy)
-
-         return passage_extracts
-
-     def answer_from_docs(self, query: str) -> ChatDocument:
-         """
-         Answer query based on relevant docs from the VecDB
-
-         Args:
-             query (str): query to answer
-
-         Returns:
-             Document: answer
-         """
-         response = ChatDocument(
-             content=NO_ANSWER,
-             metadata=ChatDocMetaData(
-                 source="None",
-                 sender=Entity.LLM,
-             ),
-         )
-         # query may be updated to a stand-alone version
-         query, extracts = self.get_relevant_extracts(query)
-         if len(extracts) == 0:
-             return response
-         if self.llm is None:
-             raise ValueError("LLM not set")
-         if self.config.retrieve_only:
-             # only return extracts, skip LLM-based summary answer
-             meta = dict(
-                 sender=Entity.LLM,
-             )
-             # copy metadata from first doc, unclear what to do here.
-             meta.update(extracts[0].metadata)
-             return ChatDocument(
-                 content="\n\n".join([e.content for e in extracts]),
-                 metadata=ChatDocMetaData(**meta),  # type: ignore
-             )
-         response = self.get_summary_answer(query, extracts)
-
-         self.update_dialog(query, response.content)
-         self.response = response  # save last response
-         return response
-
-     def summarize_docs(
-         self,
-         instruction: str = "Give a concise summary of the following text:",
-     ) -> None | ChatDocument:
-         """Summarize all docs"""
-         if self.llm is None:
-             raise ValueError("LLM not set")
-         if len(self.original_docs) == 0:
-             logger.warning(
-                 """
-                 No docs to summarize! Perhaps you are re-using a previously
-                 defined collection?
-                 In that case, we don't have access to the original docs.
-                 To create a summary, use a new collection, and specify a list of docs.
-                 """
-             )
-             return None
-         full_text = "\n\n".join([d.content for d in self.original_docs])
-         if self.parser is None:
-             raise ValueError("No parser defined")
-         tot_tokens = self.parser.num_tokens(full_text)
-         MAX_INPUT_TOKENS = (
-             self.llm.completion_context_length()
-             - self.config.llm.max_output_tokens
-             - 100
-         )
-         if tot_tokens > MAX_INPUT_TOKENS:
-             # truncate
-             full_text = self.parser.tokenizer.decode(
-                 self.parser.tokenizer.encode(full_text)[:MAX_INPUT_TOKENS]
-             )
-             logger.warning(
-                 f"Summarizing after truncating text to {MAX_INPUT_TOKENS} tokens"
-             )
-         prompt = f"""
-         {instruction}
-
-         FULL TEXT:
-         {full_text}
-         """.strip()
-         with StreamingIfAllowed(self.llm):
-             summary = ChatAgent.llm_response(self, prompt)
-         return summary
-
-     def justify_response(self) -> ChatDocument | None:
-         """Show evidence for last response"""
-         if self.response is None:
-             print("[magenta]No response yet")
-             return None
-         source = self.response.metadata.source
-         if len(source) > 0:
-             print("[magenta]" + source)
-         else:
-             print("[magenta]No source found")
-         return None
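
For context, the file deleted above defined DocChatAgent, langroid's RAG agent. Below is a minimal sketch of driving the pre-0.33.3 API exactly as the deleted code defines it; the question string is an illustrative assumption, and the URL is one of the `default_paths` listed in the config above. It assumes the old module layout shown in this diff (and whatever vector-store and LLM credentials the config defaults require).

    # Minimal sketch of the DocChatAgent API shown in the deleted file above.
    # Assumes langroid < 0.33.3; the question string is an illustrative
    # assumption, and the URL comes from default_paths in DocChatAgentConfig.
    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig

    config = DocChatAgentConfig(
        doc_paths=["https://www.jasonwei.net/blog/emergence"],
        conversation_mode=True,      # accumulate message history
        hypothetical_answer=False,   # set True for HyDE-style retrieval
        n_neighbor_chunks=0,         # neighbors to pull around each matched chunk
    )
    agent = DocChatAgent(config)     # non-empty doc_paths triggers ingest() here

    response = agent.llm_response("What does the author mean by emergence?")
    if response is not None:
        print(response.content)          # the answer, with numeric citations inline
        print(response.metadata.source)  # footnote-style citation details

Note that `llm_response` also supports the special prefixes shown in the code: a leading `!` sends the query directly to the LLM (bypassing retrieval), `?` justifies the last response, and `??` (or a query starting with "summar") summarizes the ingested docs.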