langroid 0.1.139__py3-none-any.whl → 0.1.219__py3-none-any.whl

This diff compares two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
Files changed (97)
  1. langroid/__init__.py +70 -0
  2. langroid/agent/__init__.py +22 -0
  3. langroid/agent/base.py +120 -33
  4. langroid/agent/batch.py +134 -35
  5. langroid/agent/callbacks/__init__.py +0 -0
  6. langroid/agent/callbacks/chainlit.py +608 -0
  7. langroid/agent/chat_agent.py +164 -100
  8. langroid/agent/chat_document.py +19 -2
  9. langroid/agent/openai_assistant.py +20 -10
  10. langroid/agent/special/__init__.py +33 -10
  11. langroid/agent/special/doc_chat_agent.py +521 -108
  12. langroid/agent/special/lance_doc_chat_agent.py +258 -0
  13. langroid/agent/special/lance_rag/__init__.py +9 -0
  14. langroid/agent/special/lance_rag/critic_agent.py +136 -0
  15. langroid/agent/special/lance_rag/lance_rag_task.py +80 -0
  16. langroid/agent/special/lance_rag/query_planner_agent.py +180 -0
  17. langroid/agent/special/lance_tools.py +44 -0
  18. langroid/agent/special/neo4j/__init__.py +0 -0
  19. langroid/agent/special/neo4j/csv_kg_chat.py +174 -0
  20. langroid/agent/special/neo4j/neo4j_chat_agent.py +370 -0
  21. langroid/agent/special/neo4j/utils/__init__.py +0 -0
  22. langroid/agent/special/neo4j/utils/system_message.py +46 -0
  23. langroid/agent/special/relevance_extractor_agent.py +23 -7
  24. langroid/agent/special/retriever_agent.py +29 -174
  25. langroid/agent/special/sql/__init__.py +7 -0
  26. langroid/agent/special/sql/sql_chat_agent.py +47 -23
  27. langroid/agent/special/sql/utils/__init__.py +11 -0
  28. langroid/agent/special/sql/utils/description_extractors.py +95 -46
  29. langroid/agent/special/sql/utils/populate_metadata.py +28 -21
  30. langroid/agent/special/table_chat_agent.py +43 -9
  31. langroid/agent/task.py +423 -114
  32. langroid/agent/tool_message.py +67 -10
  33. langroid/agent/tools/__init__.py +8 -0
  34. langroid/agent/tools/duckduckgo_search_tool.py +66 -0
  35. langroid/agent/tools/google_search_tool.py +11 -0
  36. langroid/agent/tools/metaphor_search_tool.py +67 -0
  37. langroid/agent/tools/recipient_tool.py +6 -24
  38. langroid/agent/tools/sciphi_search_rag_tool.py +79 -0
  39. langroid/cachedb/__init__.py +6 -0
  40. langroid/embedding_models/__init__.py +24 -0
  41. langroid/embedding_models/base.py +9 -1
  42. langroid/embedding_models/models.py +117 -17
  43. langroid/embedding_models/protoc/embeddings.proto +19 -0
  44. langroid/embedding_models/protoc/embeddings_pb2.py +33 -0
  45. langroid/embedding_models/protoc/embeddings_pb2.pyi +50 -0
  46. langroid/embedding_models/protoc/embeddings_pb2_grpc.py +79 -0
  47. langroid/embedding_models/remote_embeds.py +153 -0
  48. langroid/language_models/__init__.py +22 -0
  49. langroid/language_models/azure_openai.py +47 -4
  50. langroid/language_models/base.py +26 -10
  51. langroid/language_models/config.py +5 -0
  52. langroid/language_models/openai_gpt.py +407 -121
  53. langroid/language_models/prompt_formatter/__init__.py +9 -0
  54. langroid/language_models/prompt_formatter/base.py +4 -6
  55. langroid/language_models/prompt_formatter/hf_formatter.py +135 -0
  56. langroid/language_models/utils.py +10 -9
  57. langroid/mytypes.py +10 -4
  58. langroid/parsing/__init__.py +33 -1
  59. langroid/parsing/document_parser.py +259 -63
  60. langroid/parsing/image_text.py +32 -0
  61. langroid/parsing/parse_json.py +143 -0
  62. langroid/parsing/parser.py +20 -7
  63. langroid/parsing/repo_loader.py +108 -46
  64. langroid/parsing/search.py +8 -0
  65. langroid/parsing/table_loader.py +44 -0
  66. langroid/parsing/url_loader.py +59 -13
  67. langroid/parsing/urls.py +18 -9
  68. langroid/parsing/utils.py +130 -9
  69. langroid/parsing/web_search.py +73 -0
  70. langroid/prompts/__init__.py +7 -0
  71. langroid/prompts/chat-gpt4-system-prompt.md +68 -0
  72. langroid/prompts/prompts_config.py +1 -1
  73. langroid/utils/__init__.py +10 -0
  74. langroid/utils/algorithms/__init__.py +3 -0
  75. langroid/utils/configuration.py +0 -1
  76. langroid/utils/constants.py +4 -0
  77. langroid/utils/logging.py +2 -5
  78. langroid/utils/output/__init__.py +15 -2
  79. langroid/utils/output/status.py +33 -0
  80. langroid/utils/pandas_utils.py +30 -0
  81. langroid/utils/pydantic_utils.py +446 -4
  82. langroid/utils/system.py +36 -1
  83. langroid/vector_store/__init__.py +34 -2
  84. langroid/vector_store/base.py +33 -2
  85. langroid/vector_store/chromadb.py +42 -13
  86. langroid/vector_store/lancedb.py +226 -60
  87. langroid/vector_store/meilisearch.py +7 -6
  88. langroid/vector_store/momento.py +3 -2
  89. langroid/vector_store/qdrantdb.py +82 -11
  90. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/METADATA +190 -129
  91. langroid-0.1.219.dist-info/RECORD +127 -0
  92. langroid/agent/special/recipient_validator_agent.py +0 -157
  93. langroid/parsing/json.py +0 -64
  94. langroid/utils/web/selenium_login.py +0 -36
  95. langroid-0.1.139.dist-info/RECORD +0 -103
  96. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/LICENSE +0 -0
  97. {langroid-0.1.139.dist-info → langroid-0.1.219.dist-info}/WHEEL +0 -0
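The hunks below are from langroid/agent/special/doc_chat_agent.py (+521 -108):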
@@ -12,16 +12,17 @@ langroid with the [hf-embeddings] extra, e.g.:
 pip install "langroid[hf-embeddings]"
 
 """
+
 import logging
 from contextlib import ExitStack
-from typing import List, Optional, Tuple, no_type_check
+from functools import cache
+from typing import Any, Dict, List, Optional, Set, Tuple, no_type_check
 
+import nest_asyncio
 import numpy as np
-from rich import print
-from rich.console import Console
+import pandas as pd
 from rich.prompt import Prompt
 
-from langroid.agent.base import Agent
 from langroid.agent.batch import run_batch_tasks
 from langroid.agent.chat_agent import ChatAgent, ChatAgentConfig
 from langroid.agent.chat_document import ChatDocMetaData, ChatDocument
@@ -34,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -41,20 +43,26 @@ from langroid.parsing.search import (
     find_fuzzy_matches_in_docs,
     preprocess_text,
 )
+from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user, get_urls_and_paths
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
 from langroid.utils.configuration import settings
 from langroid.utils.constants import NO_ANSWER
-from langroid.utils.output.printing import show_if_debug
-from langroid.vector_store.base import VectorStoreConfig
+from langroid.utils.output import show_if_debug, status
+from langroid.utils.pydantic_utils import dataframe_to_documents, extract_fields
+from langroid.vector_store.base import VectorStore, VectorStoreConfig
 from langroid.vector_store.lancedb import LanceDBConfig
 
-logger = logging.getLogger(__name__)
 
-console = Console()
+@cache
+def apply_nest_asyncio() -> None:
+    nest_asyncio.apply()
+
+
+logger = logging.getLogger(__name__)
 
 DEFAULT_DOC_CHAT_INSTRUCTIONS = """
 Your task is to answer questions about various documents.
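Note on the `apply_nest_asyncio` helper introduced above: wrapping `nest_asyncio.apply()` in `functools.cache` makes the event-loop patch idempotent, since a cached zero-argument function executes its body only on the first call. A minimal self-contained sketch of the same pattern (independent of langroid):

    from functools import cache

    import nest_asyncio


    @cache  # zero-arg cached function: the body runs only on the first call
    def apply_nest_asyncio() -> None:
        # allow nested event loops, e.g. asyncio.run() inside Jupyter
        nest_asyncio.apply()


    apply_nest_asyncio()  # patches the running loop
    apply_nest_asyncio()  # cached: no-op on repeat calls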
@@ -66,25 +74,29 @@ DEFAULT_DOC_CHAT_SYSTEM_MESSAGE = """
 You are a helpful assistant, helping me understand a collection of documents.
 """
 
+has_sentence_transformers = False
+try:
+    from sentence_transformer import SentenceTransformer  # noqa: F401
 
-class DocChatAgentConfig(ChatAgentConfig):
-    """
-    Attributes:
-        max_context_tokens (int): threshold to use for various steps, e.g.
-            if we are able to fit the current stage of doc processing into
-            this many tokens, we skip additional compression steps, and
-            use the current docs as-is in the context
-        conversation_mode (bool): if True, we will accumulate message history,
-            and pass entire history to LLM at each round.
-            If False, each request to LLM will consist only of the
-            initial task messages plus the current query.
-    """
+    has_sentence_transformers = True
+except ImportError:
+    pass
 
+
+class DocChatAgentConfig(ChatAgentConfig):
     system_message: str = DEFAULT_DOC_CHAT_SYSTEM_MESSAGE
     user_message: str = DEFAULT_DOC_CHAT_INSTRUCTIONS
     summarize_prompt: str = SUMMARY_ANSWER_PROMPT_GPT4
-    max_context_tokens: int = 1000
-    conversation_mode: bool = True
+    # extra fields to include in content as key=value pairs
+    # (helps retrieval for table-like data)
+    add_fields_to_content: List[str] = []
+    filter_fields: List[str] = []  # fields usable in filter
+    retrieve_only: bool = False  # only retr relevant extracts, don't gen summary answer
+    extraction_granularity: int = 1  # granularity (in sentences) for relev extraction
+    filter: str | None = (
+        None  # filter condition for various lexical/semantic search fns
+    )
+    conversation_mode: bool = True  # accumulate message history?
     # In assistant mode, DocChatAgent receives questions from another Agent,
     # and those will already be in stand-alone form, so in this mode
     # there is no need to convert them to stand-alone form.
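The `has_sentence_transformers` probe above is the usual optional-dependency pattern: attempt the import once at module load, record success in a flag, and let defaults degrade gracefully when the extra is missing (the next hunk uses the flag to blank out `cross_encoder_reranking_model`). A generic sketch, with `some_extra_pkg` as a hypothetical package name:

    has_some_extra = False
    try:
        import some_extra_pkg  # noqa: F401  # hypothetical optional extra

        has_some_extra = True
    except ImportError:
        pass  # extra not installed; dependent features fall back below

    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_some_extra else ""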
@@ -100,17 +112,22 @@ class DocChatAgentConfig(ChatAgentConfig):
     n_fuzzy_neighbor_words: int = 100  # num neighbor words to retrieve for fuzzy match
     use_fuzzy_match: bool = True
     use_bm25_search: bool = True
-    cross_encoder_reranking_model: str = "cross-encoder/ms-marco-MiniLM-L-6-v2"
+    cross_encoder_reranking_model: str = (
+        "cross-encoder/ms-marco-MiniLM-L-6-v2" if has_sentence_transformers else ""
+    )
     rerank_diversity: bool = True  # rerank to maximize diversity?
     rerank_periphery: bool = True  # rerank to avoid Lost In the Middle effect?
     embed_batch_size: int = 500  # get embedding of at most this many at a time
     cache: bool = True  # cache results
     debug: bool = False
     stream: bool = True  # allow streaming where needed
-    relevance_extractor_config: RelevanceExtractorAgentConfig = (
-        RelevanceExtractorAgentConfig()
+    split: bool = True  # use chunking
+    relevance_extractor_config: None | RelevanceExtractorAgentConfig = (
+        RelevanceExtractorAgentConfig(
+            llm=None  # use the parent's llm unless explicitly set here
+        )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -135,7 +152,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             # NOTE: PDF parsing is extremely challenging, and each library
             # has its own strengths and weaknesses.
             # Try one that works for your use case.
-            # or "haystack", "unstructured", "pdfplumber", "fitz", "pypdf"
+            # or "unstructured", "pdfplumber", "fitz", "pypdf"
             library="pdfplumber",
         ),
     )
@@ -156,7 +173,7 @@ class DocChatAgentConfig(ChatAgentConfig):
         collection_name="doc-chat-lancedb",
         replace_collection=True,
         storage_path=".lancedb/data/",
-        embedding=hf_embed_config,
+        embedding=hf_embed_config if has_sentence_transformers else oai_embed_config,
     )
     llm: OpenAIGPTConfig = OpenAIGPTConfig(
         type="openai",
@@ -180,14 +197,40 @@ class DocChatAgent(ChatAgent):
     ):
         super().__init__(config)
         self.config: DocChatAgentConfig = config
-        self.original_docs: None | List[Document] = None
+        self.original_docs: List[Document] = []
         self.original_docs_length = 0
-        self.chunked_docs: None | List[Document] = None
-        self.chunked_docs_clean: None | List[Document] = None
+        self.from_dataframe = False
+        self.df_description = ""
+        self.chunked_docs: List[Document] = []
+        self.chunked_docs_clean: List[Document] = []
         self.response: None | Document = None
         if len(config.doc_paths) > 0:
             self.ingest()
 
+    def clear(self) -> None:
+        """Clear the document collection and the specific collection in vecdb"""
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        self.original_docs = []
+        self.original_docs_length = 0
+        self.chunked_docs = []
+        self.chunked_docs_clean = []
+        collection_name = self.vecdb.config.collection_name
+        if collection_name is None:
+            return
+        try:
+            # Note we may have used a vecdb with a config.collection_name
+            # different from the agent's config.vecdb.collection_name!!
+            self.vecdb.delete_collection(collection_name)
+            self.vecdb = VectorStore.create(self.vecdb.config)
+        except Exception as e:
+            logger.warning(
+                f"""
                Error while deleting collection {collection_name}:
                {e}
                """
+            )
+
     def ingest(self) -> None:
         """
         Chunk + embed + store docs specified by self.config.doc_paths
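The new `clear()` method resets the agent's in-memory docs and drops the backing vector-store collection, recreating an empty store via `VectorStore.create` so the same agent can be reused; deletion failures are only logged, not raised. A sketch (the URL is hypothetical):

    agent.clear()  # wipe ingested docs and the vecdb collection
    agent.ingest_doc_paths(["https://example.com/fresh.pdf"])  # start over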
@@ -204,63 +247,316 @@ class DocChatAgent(ChatAgent):
             # do keyword and other non-vector searches
             if self.vecdb is None:
                 raise ValueError("VecDB not set")
-            self.chunked_docs = self.vecdb.get_all_documents()
-            # used for lexical similarity e.g. keyword search (bm25 etc)
-            self.chunked_docs_clean = [
-                Document(content=preprocess_text(d.content), metadata=d.metadata)
-                for d in self.chunked_docs
-            ]
+            self.setup_documents(filter=self.config.filter)
             return
-        urls, paths = get_urls_and_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
+
+    def ingest_doc_paths(
+        self,
+        paths: str | bytes | List[str | bytes],
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+        doc_type: str | DocumentType | None = None,
+    ) -> List[Document]:
+        """Split, ingest docs from specified paths,
+        do not add these to config.doc_paths.
+
+        Args:
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
+            metadata: List of metadata dicts, one for each path.
+                If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
+        Returns:
+            List of Document objects
+        """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
+        all_paths = paths
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
+        if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
+            metadata, list
+        ):
+            if isinstance(metadata, list):
+                idx2meta = {
+                    p: (
+                        m
+                        if isinstance(m, dict)
+                        else (isinstance(m, DocMetaData) and m.dict())
+                    )  # appease mypy
+                    for p, m in zip(idxs, metadata)
+                }
+            elif isinstance(metadata, dict):
+                idx2meta = {p: metadata for p in idxs}
+            else:
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            loader = URLLoader(urls=urls, parser=parser)
-            docs = loader.load()
-        if len(paths) > 0:
-            for p in paths:
-                path_docs = RepoLoader.get_documents(p, parser=parser)
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
+                url_docs = loader.load()
+                # update metadata of each doc with meta
+                for d in url_docs:
+                    d.metadata = d.metadata.copy(update=meta)
+                docs.extend(url_docs)
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
+                # update metadata of each doc with meta
+                for d in path_docs:
+                    d.metadata = d.metadata.copy(update=meta)
                 docs.extend(path_docs)
         n_docs = len(docs)
-        n_splits = self.ingest_docs(docs)
+        n_splits = self.ingest_docs(docs, split=self.config.split)
         if n_docs == 0:
-            return
+            return []
         n_urls = len(urls)
         n_paths = len(paths)
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths} paths into {n_splits} parts:
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-        print("\n".join(urls))
-        print("\n".join(paths))
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
+        return docs
 
-    def ingest_docs(self, docs: List[Document]) -> int:
+    def ingest_docs(
+        self,
+        docs: List[Document],
+        split: bool = True,
+        metadata: (
+            List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
+        ) = [],
+    ) -> int:
         """
         Chunk docs into pieces, map each chunk to vec-embedding, store in vec-db
+
+        Args:
+            docs: List of Document objects
+            split: Whether to split docs into chunks. Default is True.
+                If False, docs are treated as "chunks" and are not split.
+            metadata: List of metadata dicts, one for each doc, to augment
+                whatever metadata is already in the doc.
+                [ASSUME no conflicting keys between the two metadata dicts.]
+                If a single dict is passed in, it is used for all docs.
         """
-        self.original_docs = docs
+        if isinstance(metadata, list) and len(metadata) > 0:
+            for d, m in zip(docs, metadata):
+                d.metadata = d.metadata.copy(
+                    update=m if isinstance(m, dict) else m.dict()  # type: ignore
+                )
+        elif isinstance(metadata, dict):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata)
+        elif isinstance(metadata, DocMetaData):
+            for d in docs:
+                d.metadata = d.metadata.copy(update=metadata.dict())
+
+        self.original_docs.extend(docs)
         if self.parser is None:
             raise ValueError("Parser not set")
         for d in docs:
             if d.metadata.id in [None, ""]:
                 d.metadata.id = d._unique_hash_id()
-        docs = self.parser.split(docs)
-        self.chunked_docs = docs
-        self.chunked_docs_clean = [
-            Document(content=preprocess_text(d.content), metadata=d.metadata)
-            for d in self.chunked_docs
-        ]
+        if split:
+            docs = self.parser.split(docs)
+        else:
+            # treat each doc as a chunk
+            for d in docs:
+                d.metadata.is_chunk = True
         if self.vecdb is None:
             raise ValueError("VecDB not set")
+
+        # If any additional fields need to be added to content,
+        # add them as key=value pairs for all docs, before batching.
+        # This helps retrieval for table-like data.
+        # Note we need to do this at stage so that the embeddings
+        # are computed on the full content with these additional fields.
+        if len(self.config.add_fields_to_content) > 0:
+            fields = [
+                f for f in extract_fields(docs[0], self.config.add_fields_to_content)
+            ]
+            if len(fields) > 0:
+                for d in docs:
+                    key_vals = extract_fields(d, fields)
+                    d.content = (
+                        ",".join(f"{k}={v}" for k, v in key_vals.items())
+                        + ",content="
+                        + d.content
+                    )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
             self.vecdb.add_documents(batch)
         self.original_docs_length = self.doc_length(docs)
+        self.setup_documents(docs, filter=self.config.filter)
         return len(docs)
 
+    @staticmethod
+    def document_compatible_dataframe(
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> Tuple[pd.DataFrame, List[str]]:
+        """
+        Convert dataframe so it is compatible with Document class:
+        - has "content" column
+        - has an "id" column to be used as Document.metadata.id
+
+        Args:
+            df: dataframe to convert
+            content: name of content column
+            metadata: list of metadata column names
+
+        Returns:
+            Tuple[pd.DataFrame, List[str]]: dataframe, metadata
+                - dataframe: dataframe with "content" column and "id" column
+                - metadata: list of metadata column names, including "id"
+        """
+        if content not in df.columns:
+            raise ValueError(
+                f"""
                Content column {content} not in dataframe,
                so we cannot ingest into the DocChatAgent.
                Please specify the `content` parameter as a suitable
                text-based column in the dataframe.
                """
+            )
+        if content != "content":
+            # rename content column to "content", leave existing column intact
+            df = df.rename(columns={content: "content"}, inplace=False)
+
+        actual_metadata = metadata.copy()
+        if "id" not in df.columns:
+            docs = dataframe_to_documents(df, content="content", metadata=metadata)
+            ids = [str(d.id()) for d in docs]
+            df["id"] = ids
+
+        if "id" not in actual_metadata:
+            actual_metadata += ["id"]
+
+        return df, actual_metadata
+
+    def ingest_dataframe(
+        self,
+        df: pd.DataFrame,
+        content: str = "content",
+        metadata: List[str] = [],
+    ) -> int:
+        """
+        Ingest a dataframe into vecdb.
+        """
+        self.from_dataframe = True
+        self.df_description = describe_dataframe(
+            df, filter_fields=self.config.filter_fields, n_vals=5
+        )
+        df, metadata = DocChatAgent.document_compatible_dataframe(df, content, metadata)
+        docs = dataframe_to_documents(df, content="content", metadata=metadata)
+        # When ingesting a dataframe we will no longer do any chunking,
+        # so we mark each doc as a chunk.
+        # TODO - revisit this since we may still want to chunk large text columns
+        for d in docs:
+            d.metadata.is_chunk = True
+        return self.ingest_docs(docs)
+
+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
+    def setup_documents(
+        self,
+        docs: List[Document] = [],
+        filter: str | None = None,
+    ) -> None:
+        """
+        Setup `self.chunked_docs` and `self.chunked_docs_clean`
+        based on possible filter.
+        These will be used in various non-vector-based search functions,
+        e.g. self.get_similar_chunks_bm25(), self.get_fuzzy_matches(), etc.
+
+        Args:
+            docs: List of Document objects. This is empty when we are calling this
+                method after initial doc ingestion.
+            filter: Filter condition for various lexical/semantic search fns.
+        """
+        if filter is None and len(docs) > 0:
+            # no filter, so just use the docs passed in
+            self.chunked_docs.extend(docs)
+        else:
+            if self.vecdb is None:
+                raise ValueError("VecDB not set")
+            self.chunked_docs = self.vecdb.get_all_documents(where=filter or "")
+
+        self.chunked_docs_clean = [
+            Document(content=preprocess_text(d.content), metadata=d.metadata)
+            for d in self.chunked_docs
+        ]
+
+    def get_field_values(self, fields: list[str]) -> Dict[str, str]:
+        """Get string-listing of possible values of each filterable field,
+        e.g.
+        {
+            "genre": "crime, drama, mystery, ... (10 more)",
+            "certificate": "R, PG-13, PG, R",
+        }
+        """
+        field_values: Dict[str, Set[str]] = {}
+        # make empty set for each field
+        for f in fields:
+            field_values[f] = set()
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # get all documents and accumulate possible values of each field until 10
+        docs = self.vecdb.get_all_documents()  # only works for vecdbs that support this
+        for d in docs:
+            # extract fields from d
+            doc_field_vals = extract_fields(d, fields)
+            for field, val in doc_field_vals.items():
+                field_values[field].add(val)
+        # For each field make a string showing list of possible values,
+        # truncate to 20 values, and if there are more, indicate how many
+        # more there are, e.g. Genre: crime, drama, mystery, ... (20 more)
+        field_values_list = {}
+        for f in fields:
+            vals = list(field_values[f])
+            n = len(vals)
+            remaining = n - 20
+            vals = vals[:20]
+            if n > 20:
+                vals.append(f"(...{remaining} more)")
+            # make a string of the values, ensure they are strings
+            field_values_list[f] = ", ".join(str(v) for v in vals)
+        return field_values_list
+
     def doc_length(self, docs: List[Document]) -> int:
         """
         Calc token-length of a list of docs
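Taken together, the new ingestion surface accepts URLs, file paths, raw bytes, and pandas DataFrames, with optional per-document metadata. A sketch of how these might be called (file names, columns, and `pdf_bytes` are illustrative):

    import pandas as pd

    # mix file paths, URLs and raw bytes in one call;
    # a single metadata dict applies to every resulting Document
    pdf_bytes = b"..."  # assume: a document already read in as bytes
    agent.ingest_doc_paths(
        ["notes.txt", "https://example.com/report.pdf", pdf_bytes],
        metadata={"project": "demo"},
    )

    # DataFrame rows become Documents: `content` names the text column,
    # `metadata` lists the columns carried into Document.metadata
    df = pd.DataFrame(
        {
            "description": ["a noir thriller", "a screwball comedy"],
            "genre": ["crime", "comedy"],
        }
    )
    agent.ingest_dataframe(df, content="description", metadata=["genre"])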
@@ -342,10 +638,9 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()
 
-    @no_type_check
     def llm_response(
         self,
         query: None | str | ChatDocument = None,
@@ -362,10 +657,55 @@ class DocChatAgent(ChatAgent):
             query_str = query_str[1:] if query_str is not None else None
             if self.llm is None:
                 raise ValueError("LLM not set")
-            with StreamingIfAllowed(self.llm):
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
                 response = super().llm_response(query_str)
             if query_str is not None:
-                self.update_dialog(query_str, response.content)
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
+            return response
+        if query_str == "":
+            return None
+        elif query_str == "?" and self.response is not None:
+            return self.justify_response()
+        elif (query_str.startswith(("summar", "?")) and self.response is None) or (
+            query_str == "??"
+        ):
+            return self.summarize_docs()
+        else:
+            self.callbacks.show_start_response(entity="llm")
+            response = self.answer_from_docs(query_str)
+            return ChatDocument(
+                content=response.content,
+                metadata=ChatDocMetaData(
+                    source=response.metadata.source,
+                    sender=Entity.LLM,
+                ),
+            )
+
+    async def llm_response_async(
+        self,
+        query: None | str | ChatDocument = None,
+    ) -> Optional[ChatDocument]:
+        apply_nest_asyncio()
+        if not self.llm_can_respond(query):
+            return None
+        query_str: str | None
+        if isinstance(query, ChatDocument):
+            query_str = query.content
+        else:
+            query_str = query
+        if query_str is None or query_str.startswith("!"):
+            # direct query to LLM
+            query_str = query_str[1:] if query_str is not None else None
+            if self.llm is None:
+                raise ValueError("LLM not set")
+            with StreamingIfAllowed(self.llm, self.llm.get_stream()):
+                response = await super().llm_response_async(query_str)
+            if query_str is not None:
+                self.update_dialog(
+                    query_str, "" if response is None else response.content
+                )
             return response
         if query_str == "":
             return None
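`llm_response_async` mirrors the synchronous path and calls `apply_nest_asyncio()` up front, so it also works where an event loop is already running (e.g. in a notebook). A usage sketch:

    import asyncio

    async def main() -> None:
        answer = await agent.llm_response_async("What do the docs say about pricing?")
        if answer is not None:
            print(answer.content)

    asyncio.run(main())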
@@ -376,6 +716,7 @@ class DocChatAgent(ChatAgent):
         ):
             return self.summarize_docs()
         else:
+            self.callbacks.show_start_response(entity="llm")
             response = self.answer_from_docs(query_str)
             return ChatDocument(
                 content=response.content,
@@ -407,7 +748,9 @@ class DocChatAgent(ChatAgent):
             ]
         )
 
-    def get_summary_answer(self, question: str, passages: List[Document]) -> Document:
+    def get_summary_answer(
+        self, question: str, passages: List[Document]
+    ) -> ChatDocument:
         """
         Given a question and a list of (possibly) doc snippets,
         generate an answer if possible
@@ -435,9 +778,6 @@ class DocChatAgent(ChatAgent):
         # 2 new LLMMessage objects:
         # one for `final_prompt`, and one for the LLM response
 
-        # TODO need to "forget" last two messages in message_history
-        # if we are not in conversation mode
-
         if self.config.conversation_mode:
             # respond with temporary context
             answer_doc = super()._llm_response_temp_context(question, final_prompt)
@@ -446,16 +786,23 @@ class DocChatAgent(ChatAgent):
 
         final_answer = answer_doc.content.strip()
         show_if_debug(final_answer, "SUMMARIZE_RESPONSE= ")
-        parts = final_answer.split("SOURCE:", maxsplit=1)
-        if len(parts) > 1:
-            content = parts[0].strip()
-            sources = parts[1].strip()
-        else:
+
+        if final_answer.startswith("SOURCE"):
+            # sometimes SOURCE may be shown first,
+            # in this case just use final_answer as-is for both content and source
             content = final_answer
-            sources = ""
-        return Document(
+            sources = final_answer
+        else:
+            parts = final_answer.split("SOURCE:", maxsplit=1)
+            if len(parts) > 1:
+                content = parts[0].strip()
+                sources = parts[1].strip()
+            else:
+                content = final_answer
+                sources = ""
+        return ChatDocument(
             content=content,
-            metadata=DocMetaData(
+            metadata=ChatDocMetaData(
                 source="SOURCE: " + sources,
                 sender=Entity.LLM,
                 cached=getattr(answer_doc.metadata, "cached", False),
@@ -465,7 +812,7 @@ class DocChatAgent(ChatAgent):
     def llm_hypothetical_answer(self, query: str) -> str:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with console.status("[cyan]LLM generating hypothetical answer..."):
+        with status("[cyan]LLM generating hypothetical answer..."):
             with StreamingIfAllowed(self.llm, False):
                 # TODO: provide an easy way to
                 # Adjust this prompt depending on context.
@@ -485,7 +832,7 @@ class DocChatAgent(ChatAgent):
     def llm_rephrase_query(self, query: str) -> List[str]:
         if self.llm is None:
             raise ValueError("LLM not set")
-        with console.status("[cyan]LLM generating rephrases of query..."):
+        with status("[cyan]LLM generating rephrases of query..."):
             with StreamingIfAllowed(self.llm, False):
                 rephrases = self.llm_response_forget(
                     f"""
@@ -501,11 +848,11 @@ class DocChatAgent(ChatAgent):
     ) -> List[Tuple[Document, float]]:
         # find similar docs using bm25 similarity:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with console.status("[cyan]Searching for similar chunks using bm25..."):
-            if self.chunked_docs is None:
+        with status("[cyan]Searching for similar chunks using bm25..."):
+            if self.chunked_docs is None or len(self.chunked_docs) == 0:
                 logger.warning("No chunked docs; cannot use bm25-similarity")
                 return []
-            if self.chunked_docs_clean is None:
+            if self.chunked_docs_clean is None or len(self.chunked_docs_clean) == 0:
                 logger.warning("No cleaned chunked docs; cannot use bm25-similarity")
                 return []
             docs_scores = find_closest_matches_with_bm25(
@@ -519,7 +866,7 @@ class DocChatAgent(ChatAgent):
     def get_fuzzy_matches(self, query: str, multiple: int) -> List[Document]:
         # find similar docs using fuzzy matching:
         # these may sometimes be more likely to contain a relevant verbatim extract
-        with console.status("[cyan]Finding fuzzy matches in chunks..."):
+        with status("[cyan]Finding fuzzy matches in chunks..."):
             if self.chunked_docs is None:
                 logger.warning("No chunked docs; cannot use fuzzy matching")
                 return []
@@ -539,7 +886,7 @@ class DocChatAgent(ChatAgent):
     def rerank_with_cross_encoder(
         self, query: str, passages: List[Document]
     ) -> List[Document]:
-        with console.status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
+        with status("[cyan]Re-ranking retrieved chunks using cross-encoder..."):
             try:
                 from sentence_transformers import CrossEncoder
             except ImportError:
@@ -657,8 +1004,45 @@ class DocChatAgent(ChatAgent):
         """
         if self.vecdb is None or self.config.n_neighbor_chunks == 0:
             return docs_scores
+        if len(docs_scores) == 0:
+            return []
+        if set(docs_scores[0][0].__fields__) != {"content", "metadata"}:
+            # Do not add context window when there are other fields besides just
+            # content and metadata, since we do not know how to set those other fields
+            # for newly created docs with combined content.
+            return docs_scores
         return self.vecdb.add_context_window(docs_scores, self.config.n_neighbor_chunks)
 
+    def get_semantic_search_results(
+        self,
+        query: str,
+        k: int = 10,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Get semantic search results from vecdb.
+        Args:
+            query (str): query to search for
+            k (int): number of results to return
+        Returns:
+            List[Tuple[Document, float]]: List of (Document, score) tuples.
+        """
+        if self.vecdb is None:
+            raise ValueError("VecDB not set")
+        # Note: for dynamic filtering based on a query, users can
+        # use the `temp_update` context-manager to pass in a `filter` to self.config,
+        # e.g.:
+        # with temp_update(self.config, {"filter": "metadata.source=='source1'"}):
+        #     docs_scores = self.get_semantic_search_results(query, k=k)
+        # This avoids having pass the `filter` argument to every function call
+        # upstream of this one.
+        # The `temp_update` context manager is defined in
+        # `langroid/utils/pydantic_utils.py`
+        return self.vecdb.similar_texts_with_scores(
+            query,
+            k=k,
+            where=self.config.filter,
+        )
+
     def get_relevant_chunks(
         self, query: str, query_proxies: List[str] = []
     ) -> List[Document]:
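Following the comment inside `get_semantic_search_results`, a per-query metadata filter can be applied by temporarily patching the config with `temp_update` (defined in `langroid/utils/pydantic_utils.py`), rather than threading a `filter` argument through every caller. A sketch based on that comment:

    from langroid.utils.pydantic_utils import temp_update

    # scope one retrieval call to a single source; the config is restored on exit
    with temp_update(agent.config, {"filter": "metadata.source=='source1'"}):
        docs_scores = agent.get_semantic_search_results("pricing details", k=5)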
@@ -695,21 +1079,21 @@ class DocChatAgent(ChatAgent):
         if self.vecdb is None:
             raise ValueError("VecDB not set")
 
-        with console.status("[cyan]Searching VecDB for relevant doc passages..."):
+        with status("[cyan]Searching VecDB for relevant doc passages..."):
             docs_and_scores: List[Tuple[Document, float]] = []
             for q in [query] + query_proxies:
-                docs_and_scores += self.vecdb.similar_texts_with_scores(
+                docs_and_scores += self.get_semantic_search_results(
                     q,
                     k=self.config.parsing.n_similar_docs * retrieval_multiple,
                 )
             # keep only docs with unique d.id()
             id2doc_score = {d.id(): (d, s) for d, s in docs_and_scores}
             docs_and_scores = list(id2doc_score.values())
-
-        passages = [
-            Document(content=d.content, metadata=d.metadata)
-            for (d, _) in docs_and_scores
-        ]
+        passages = [d for (d, _) in docs_and_scores]
+        # passages = [
+        #     Document(content=d.content, metadata=d.metadata)
+        #     for (d, _) in docs_and_scores
+        # ]
 
         if self.config.use_bm25_search:
             docs_scores = self.get_similar_chunks_bm25(query, retrieval_multiple)
@@ -771,7 +1155,7 @@ class DocChatAgent(ChatAgent):
         # Regardless of whether we are in conversation mode or not,
         # for relevant doc/chunk extraction, we must convert the query
         # to a standalone query to get more relevant results.
-        with console.status("[cyan]Converting to stand-alone query...[/cyan]"):
+        with status("[cyan]Converting to stand-alone query...[/cyan]"):
             with StreamingIfAllowed(self.llm, False):
                 query = self.llm.followup_to_standalone(self.dialog, query)
         print(f"[orange2]New query: {query}")
@@ -790,7 +1174,7 @@ class DocChatAgent(ChatAgent):
         if len(passages) == 0:
             return query, []
 
-        with console.status("[cyan]LLM Extracting verbatim passages..."):
+        with status("[cyan]LLM Extracting verbatim passages..."):
             with StreamingIfAllowed(self.llm, False):
                 # these are async calls, one per passage; turn off streaming
                 extracts = self.get_verbatim_extracts(query, passages)
@@ -814,8 +1198,15 @@ class DocChatAgent(ChatAgent):
             List[Document]: list of Documents containing extracts and metadata.
         """
         agent_cfg = self.config.relevance_extractor_config
+        if agent_cfg is None:
+            # no relevance extraction: simply return passages
+            return passages
+        if agent_cfg.llm is None:
+            # Use main DocChatAgent's LLM if not provided explicitly:
+            # this reduces setup burden on the user
+            agent_cfg.llm = self.config.llm
         agent_cfg.query = query
-        agent_cfg.segment_length = 1
+        agent_cfg.segment_length = self.config.extraction_granularity
         agent_cfg.llm.stream = False  # disable streaming for concurrent calls
 
         agent = RelevanceExtractorAgent(agent_cfg)
@@ -831,16 +1222,21 @@ class DocChatAgent(ChatAgent):
             input_map=lambda msg: msg.content,
             output_map=lambda ans: ans.content if ans is not None else NO_ANSWER,
         )
-        metadatas = [P.metadata for P in passages]
-        # return with metadata so we can use it downstream, e.g. to cite sources
-        return [
-            Document(content=e, metadata=m)
-            for e, m in zip(extracts, metadatas)
-            if (e != NO_ANSWER and len(e) > 0)
-        ]
 
-    @no_type_check
-    def answer_from_docs(self, query: str) -> Document:
+        # Caution: Retain ALL other fields in the Documents (which could be
+        # other than just `content` and `metadata`), while simply replacing
+        # `content` with the extracted portions
+        passage_extracts = []
+        for p, e in zip(passages, extracts):
+            if e == NO_ANSWER or len(e) == 0:
+                continue
+            p_copy = p.copy()
+            p_copy.content = e
+            passage_extracts.append(p_copy)
+
+        return passage_extracts
+
+    def answer_from_docs(self, query: str) -> ChatDocument:
         """
         Answer query based on relevant docs from the VecDB
 
@@ -850,24 +1246,38 @@ class DocChatAgent(ChatAgent):
         Returns:
             Document: answer
         """
-        response = Document(
+        response = ChatDocument(
             content=NO_ANSWER,
-            metadata=DocMetaData(
+            metadata=ChatDocMetaData(
                 source="None",
+                sender=Entity.LLM,
             ),
         )
         # query may be updated to a stand-alone version
         query, extracts = self.get_relevant_extracts(query)
         if len(extracts) == 0:
             return response
+        if self.llm is None:
+            raise ValueError("LLM not set")
+        if self.config.retrieve_only:
+            # only return extracts, skip LLM-based summary answer
+            meta = dict(
+                sender=Entity.LLM,
+            )
+            # copy metadata from first doc, unclear what to do here.
+            meta.update(extracts[0].metadata)
+            return ChatDocument(
+                content="\n\n".join([e.content for e in extracts]),
+                metadata=ChatDocMetaData(**meta),
+            )
         with ExitStack() as stack:
             # conditionally use Streaming or rich console context
             cm = (
                 StreamingIfAllowed(self.llm)
                 if settings.stream
-                else (console.status("LLM Generating final answer..."))
+                else (status("LLM Generating final answer..."))
             )
-            stack.enter_context(cm)
+            stack.enter_context(cm)  # type: ignore
             response = self.get_summary_answer(query, extracts)
 
         self.update_dialog(query, response.content)
@@ -881,7 +1291,7 @@ class DocChatAgent(ChatAgent):
         """Summarize all docs"""
         if self.llm is None:
             raise ValueError("LLM not set")
-        if self.original_docs is None:
+        if len(self.original_docs) == 0:
             logger.warning(
                 """
                 No docs to summarize! Perhaps you are re-using a previously
@@ -910,19 +1320,22 @@ class DocChatAgent(ChatAgent):
         )
         prompt = f"""
         {instruction}
+
+        FULL TEXT:
         {full_text}
         """.strip()
         with StreamingIfAllowed(self.llm):
-            summary = Agent.llm_response(self, prompt)
-        return summary  # type: ignore
+            summary = ChatAgent.llm_response(self, prompt)
+        return summary
 
-    def justify_response(self) -> None:
+    def justify_response(self) -> ChatDocument | None:
         """Show evidence for last response"""
         if self.response is None:
             print("[magenta]No response yet")
-            return
+            return None
         source = self.response.metadata.source
         if len(source) > 0:
             print("[magenta]" + source)
         else:
             print("[magenta]No source found")
+        return None