langroid 0.1.217__tar.gz → 0.1.219__tar.gz

This diff shows the changes between publicly released versions of this package, as they appear in the supported public registries, and is provided for informational purposes only.
Files changed (127)
  1. {langroid-0.1.217 → langroid-0.1.219}/PKG-INFO +3 -2
  2. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py +54 -25
  3. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/document_parser.py +145 -22
  4. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parse_json.py +18 -24
  5. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/repo_loader.py +69 -49
  6. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/urls.py +18 -9
  7. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/utils.py +27 -9
  8. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/system.py +1 -1
  9. {langroid-0.1.217 → langroid-0.1.219}/pyproject.toml +3 -2
  10. {langroid-0.1.217 → langroid-0.1.219}/LICENSE +0 -0
  11. {langroid-0.1.217 → langroid-0.1.219}/README.md +0 -0
  12. {langroid-0.1.217 → langroid-0.1.219}/langroid/__init__.py +0 -0
  13. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/__init__.py +0 -0
  14. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/base.py +0 -0
  15. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/batch.py +0 -0
  16. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/callbacks/__init__.py +0 -0
  17. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/callbacks/chainlit.py +0 -0
  18. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/chat_agent.py +0 -0
  19. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/chat_document.py +0 -0
  20. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/helpers.py +0 -0
  21. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/junk +0 -0
  22. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/openai_assistant.py +0 -0
  23. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/__init__.py +0 -0
  24. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  25. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/__init__.py +0 -0
  26. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  27. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  28. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  29. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/lance_tools.py +0 -0
  30. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/__init__.py +0 -0
  31. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  32. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  33. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/__init__.py +0 -0
  34. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/system_message.py +0 -0
  35. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  36. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/retriever_agent.py +0 -0
  37. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/__init__.py +0 -0
  38. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  39. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/__init__.py +0 -0
  40. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  41. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  42. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/system_message.py +0 -0
  43. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/sql/utils/tools.py +0 -0
  44. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/table_chat_agent.py +0 -0
  45. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/task.py +0 -0
  46. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tool_message.py +0 -0
  47. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/__init__.py +0 -0
  48. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  49. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/extract_tool.py +0 -0
  50. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/generator_tool.py +0 -0
  51. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/google_search_tool.py +0 -0
  52. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  53. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/recipient_tool.py +0 -0
  54. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/run_python_code.py +0 -0
  55. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/sciphi_search_rag_tool.py +0 -0
  56. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent/tools/segment_extract_tool.py +0 -0
  57. {langroid-0.1.217 → langroid-0.1.219}/langroid/agent_config.py +0 -0
  58. {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/__init__.py +0 -0
  59. {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/base.py +0 -0
  60. {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/momento_cachedb.py +0 -0
  61. {langroid-0.1.217 → langroid-0.1.219}/langroid/cachedb/redis_cachedb.py +0 -0
  62. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/__init__.py +0 -0
  63. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/base.py +0 -0
  64. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/clustering.py +0 -0
  65. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/models.py +0 -0
  66. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  67. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  68. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  69. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  70. {langroid-0.1.217 → langroid-0.1.219}/langroid/embedding_models/remote_embeds.py +0 -0
  71. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/__init__.py +0 -0
  72. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/azure_openai.py +0 -0
  73. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/base.py +0 -0
  74. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/config.py +0 -0
  75. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/openai_assistants.py +0 -0
  76. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/openai_gpt.py +0 -0
  77. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  78. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/base.py +0 -0
  79. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  80. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  81. {langroid-0.1.217 → langroid-0.1.219}/langroid/language_models/utils.py +0 -0
  82. {langroid-0.1.217 → langroid-0.1.219}/langroid/mytypes.py +0 -0
  83. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/__init__.py +0 -0
  84. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/agent_chats.py +0 -0
  85. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/code-parsing.md +0 -0
  86. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/code_parser.py +0 -0
  87. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/config.py +0 -0
  88. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/image_text.py +0 -0
  89. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/para_sentence_split.py +0 -0
  90. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parser.py +0 -0
  91. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/search.py +0 -0
  92. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/spider.py +0 -0
  93. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/table_loader.py +0 -0
  94. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/url_loader.py +0 -0
  95. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/url_loader_cookies.py +0 -0
  96. {langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/web_search.py +0 -0
  97. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/__init__.py +0 -0
  98. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/chat-gpt4-system-prompt.md +0 -0
  99. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/dialog.py +0 -0
  100. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/prompts_config.py +0 -0
  101. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/templates.py +0 -0
  102. {langroid-0.1.217 → langroid-0.1.219}/langroid/prompts/transforms.py +0 -0
  103. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/__init__.py +0 -0
  104. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/algorithms/__init__.py +0 -0
  105. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/algorithms/graph.py +0 -0
  106. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/configuration.py +0 -0
  107. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/constants.py +0 -0
  108. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/docker.py +0 -0
  109. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/globals.py +0 -0
  110. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/llms/__init__.py +0 -0
  111. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/llms/strings.py +0 -0
  112. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/logging.py +0 -0
  113. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/__init__.py +0 -0
  114. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/printing.py +0 -0
  115. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/output/status.py +0 -0
  116. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/pandas_utils.py +0 -0
  117. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/pydantic_utils.py +0 -0
  118. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/web/__init__.py +0 -0
  119. {langroid-0.1.217 → langroid-0.1.219}/langroid/utils/web/login.py +0 -0
  120. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/__init__.py +0 -0
  121. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/base.py +0 -0
  122. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/chromadb.py +0 -0
  123. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/lancedb.py +0 -0
  124. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/meilisearch.py +0 -0
  125. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/momento.py +0 -0
  126. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/qdrant_cloud.py +0 -0
  127. {langroid-0.1.217 → langroid-0.1.219}/langroid/vector_store/qdrantdb.py +0 -0
{langroid-0.1.217 → langroid-0.1.219}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: langroid
- Version: 0.1.217
+ Version: 0.1.219
  Summary: Harness LLMs with Multi-Agent Programming
  License: MIT
  Author: Prasad Chalasani
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
  Requires-Dist: python-docx (>=1.1.0,<2.0.0)
  Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
  Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
- Requires-Dist: qdrant-client (>=1.7.0,<2.0.0)
+ Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
  Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
  Requires-Dist: redis (>=5.0.1,<6.0.0)
  Requires-Dist: requests (>=2.31.0,<3.0.0)
@@ -101,6 +101,7 @@ Requires-Dist: tiktoken (>=0.5.1,<0.6.0)
  Requires-Dist: torch (==2.0.0) ; extra == "hf-embeddings"
  Requires-Dist: trafilatura (>=1.5.0,<2.0.0)
  Requires-Dist: typer (>=0.9.0,<0.10.0)
+ Requires-Dist: types-pyyaml (>=6.0.12.20240311,<7.0.0.0)
  Requires-Dist: types-redis (>=4.5.5.2,<5.0.0.0)
  Requires-Dist: types-requests (>=2.31.0.1,<3.0.0.0)
  Requires-Dist: unstructured[docx,pdf,pptx] (>=0.10.16,<0.10.18) ; extra == "unstructured"
{langroid-0.1.217 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py

@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
  from langroid.language_models.base import StreamingIfAllowed
  from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
  from langroid.mytypes import DocMetaData, Document, Entity
+ from langroid.parsing.document_parser import DocumentType
  from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
  from langroid.parsing.repo_loader import RepoLoader
  from langroid.parsing.search import (
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
  )
  from langroid.parsing.table_loader import describe_dataframe
  from langroid.parsing.url_loader import URLLoader
- from langroid.parsing.urls import get_list_from_user, get_urls_and_paths
+ from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
  from langroid.parsing.utils import batched
  from langroid.prompts.prompts_config import PromptsConfig
  from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
              llm=None  # use the parent's llm unless explicitly set here
          )
      )
-     doc_paths: List[str] = []
+     doc_paths: List[str | bytes] = []
      default_paths: List[str] = [
          "https://news.ycombinator.com/item?id=35629033",
          "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
                  raise ValueError("VecDB not set")
              self.setup_documents(filter=self.config.filter)
              return
-         self.ingest_doc_paths(self.config.doc_paths)
+         self.ingest_doc_paths(self.config.doc_paths)  # type: ignore
 
      def ingest_doc_paths(
          self,
-         paths: List[str],
+         paths: str | bytes | List[str | bytes],
          metadata: (
              List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
          ) = [],
+         doc_type: str | DocumentType | None = None,
      ) -> List[Document]:
          """Split, ingest docs from specified paths,
          do not add these to config.doc_paths.
 
          Args:
-             paths: List of file/folder paths or URLs
+             paths: document paths, urls or byte-content of docs.
+                 The bytes option is intended to support cases where a document
+                 has already been read in as bytes (e.g. from an API or a database),
+                 and we want to avoid having to write it to a temporary file
+                 just to read it back in.
              metadata: List of metadata dicts, one for each path.
                  If a single dict is passed in, it is used for all paths.
+             doc_type: DocumentType to use for parsing, if known.
+                 MUST apply to all docs if specified.
+                 This is especially useful when the `paths` are of bytes type,
+                 to help with document type detection.
          Returns:
              List of Document objects
          """
+         if isinstance(paths, str) or isinstance(paths, bytes):
+             paths = [paths]
          all_paths = paths
-         paths_meta: Dict[str, Any] = {}
-         urls_meta: Dict[str, Any] = {}
-         urls, paths = get_urls_and_paths(paths)
+         paths_meta: Dict[int, Any] = {}
+         urls_meta: Dict[int, Any] = {}
+         idxs = range(len(all_paths))
+         url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+         urls = [all_paths[i] for i in url_idxs]
+         paths = [all_paths[i] for i in path_idxs]
+         bytes_list = [all_paths[i] for i in bytes_idxs]
+         path_idxs.extend(bytes_idxs)
+         paths.extend(bytes_list)
          if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
              metadata, list
          ):
              if isinstance(metadata, list):
-                 path2meta = {
+                 idx2meta = {
                      p: (
                          m
                          if isinstance(m, dict)
                          else (isinstance(m, DocMetaData) and m.dict())
                      )  # appease mypy
-                     for p, m in zip(all_paths, metadata)
+                     for p, m in zip(idxs, metadata)
                  }
              elif isinstance(metadata, dict):
-                 path2meta = {p: metadata for p in all_paths}
+                 idx2meta = {p: metadata for p in idxs}
              else:
-                 path2meta = {p: metadata.dict() for p in all_paths}
-             urls_meta = {u: path2meta[u] for u in urls}
-             paths_meta = {p: path2meta[p] for p in paths}
+                 idx2meta = {p: metadata.dict() for p in idxs}
+             urls_meta = {u: idx2meta[u] for u in url_idxs}
+             paths_meta = {p: idx2meta[p] for p in path_idxs}
          docs: List[Document] = []
          parser = Parser(self.config.parsing)
          if len(urls) > 0:
-             for u in urls:
-                 meta = urls_meta.get(u, {})
-                 loader = URLLoader(urls=[u], parser=parser)
+             for ui in url_idxs:
+                 meta = urls_meta.get(ui, {})
+                 loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
                  url_docs = loader.load()
                  # update metadata of each doc with meta
                  for d in url_docs:
                      d.metadata = d.metadata.copy(update=meta)
                  docs.extend(url_docs)
-         if len(paths) > 0:
-             for p in paths:
-                 meta = paths_meta.get(p, {})
-                 path_docs = RepoLoader.get_documents(p, parser=parser)
+         if len(paths) > 0:  # paths OR bytes are handled similarly
+             for pi in path_idxs:
+                 meta = paths_meta.get(pi, {})
+                 p = all_paths[pi]
+                 path_docs = RepoLoader.get_documents(
+                     p,
+                     parser=parser,
+                     doc_type=doc_type,
+                 )
                  # update metadata of each doc with meta
                  for d in path_docs:
                      d.metadata = d.metadata.copy(update=meta)
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
          print(
              f"""
          [green]I have processed the following {n_urls} URLs
-         and {n_paths} paths into {n_splits} parts:
+         and {n_paths} docs into {n_splits} parts:
          """.strip()
          )
-         print("\n".join(urls))
-         print("\n".join(paths))
+         path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+         print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+         print("\n".join(path_reps))
          return docs
 
      def ingest_docs(
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
                  + ",content="
                  + d.content
              )
+         docs = docs[: self.config.parsing.max_chunks]
          # add embeddings in batches, to stay under limit of embeddings API
          batches = list(batched(docs, self.config.embed_batch_size))
          for batch in batches:
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
              d.metadata.is_chunk = True
          return self.ingest_docs(docs)
 
+     def set_filter(self, filter: str) -> None:
+         self.config.filter = filter
+         self.setup_documents(filter=filter)
+
      def setup_documents(
          self,
          docs: List[Document] = [],
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
          if len(inputs) == 0:
              if is_new_collection:
                  inputs = self.config.default_paths
-         self.config.doc_paths = inputs
+         self.config.doc_paths = inputs  # type: ignore
          self.ingest()
 
      def llm_response(
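
The hunks above extend DocChatAgent so that documents can be ingested directly from raw bytes, with an optional doc_type hint. Below is a minimal usage sketch, not taken from the diff: the file name report.pdf, the bare DocChatAgentConfig(), and the availability of a configured vector store are assumptions.

    # Hypothetical sketch: ingest a PDF that is already in memory as bytes,
    # passing doc_type since there is no filename extension to sniff.
    from langroid.agent.special.doc_chat_agent import DocChatAgent, DocChatAgentConfig
    from langroid.parsing.document_parser import DocumentType

    agent = DocChatAgent(DocChatAgentConfig())  # assumes a vector store is configured
    with open("report.pdf", "rb") as f:         # stand-in for bytes from an API or DB
        pdf_bytes = f.read()
    docs = agent.ingest_doc_paths(pdf_bytes, doc_type=DocumentType.PDF)
    print(f"ingested {len(docs)} docs")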
{langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/document_parser.py

@@ -1,3 +1,4 @@
+ import itertools
  import logging
  import re
  from enum import Enum
@@ -8,6 +9,7 @@ import fitz
  import pdfplumber
  import pypdf
  import requests
+ from bs4 import BeautifulSoup
  from PIL import Image
 
  from langroid.mytypes import DocMetaData, Document
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
      PDF = "pdf"
      DOCX = "docx"
      DOC = "doc"
+     TXT = "txt"
+
+
+ def is_plain_text(path_or_bytes: str | bytes) -> bool:
+     if isinstance(path_or_bytes, str):
+         if path_or_bytes.startswith(("http://", "https://")):
+             response = requests.get(path_or_bytes)
+             response.raise_for_status()
+             content = response.content[:1024]
+         else:
+             with open(path_or_bytes, "rb") as f:
+                 content = f.read(1024)
+     else:
+         content = path_or_bytes[:1024]
+     try:
+         # Attempt to decode the content as UTF-8
+         _ = content.decode("utf-8")
+         # Additional checks can go here, e.g., to verify that the content
+         # doesn't contain too many unusual characters for it to be considered text
+         return True
+     except UnicodeDecodeError:
+         # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+         return False
 
 
  class DocumentParser(Parser):
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
      """
 
      @classmethod
-     def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+     def create(
+         cls,
+         source: str | bytes,
+         config: ParsingConfig,
+         doc_type: str | DocumentType | None = None,
+     ) -> "DocumentParser":
          """
          Create a DocumentParser instance based on source type
          and config.<source_type>.library specified.
 
          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, could be a URL, file path,
+                 or bytes object.
              config (ParserConfig): The parser configuration.
+             doc_type (str|None): The type of document, if known
 
          Returns:
              DocumentParser: An instance of a DocumentParser subclass.
          """
-         if DocumentParser._document_type(source) == DocumentType.PDF:
+         if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
              if config.pdf.library == "fitz":
                  return FitzPDFParser(source, config)
              elif config.pdf.library == "pypdf":
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
                  raise ValueError(
                      f"Unsupported PDF library specified: {config.pdf.library}"
                  )
-         elif DocumentParser._document_type(source) == DocumentType.DOCX:
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
              if config.docx.library == "unstructured":
                  return UnstructuredDocxParser(source, config)
              elif config.docx.library == "python-docx":
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
                  raise ValueError(
                      f"Unsupported DOCX library specified: {config.docx.library}"
                  )
-         elif DocumentParser._document_type(source) == DocumentType.DOC:
+         elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
              return UnstructuredDocParser(source, config)
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             source_name = source if isinstance(source, str) else "bytes"
+             raise ValueError(f"Unsupported document type: {source_name}")
 
-     def __init__(self, source: str, config: ParsingConfig):
+     def __init__(self, source: str | bytes, config: ParsingConfig):
          """
-         Initialize the PDFParser.
-
          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be
+                 a path, a URL or a bytes object.
          """
          super().__init__(config)
-         self.source = source
          self.config = config
-         self.doc_bytes = self._load_doc_as_bytesio()
+         if isinstance(source, bytes):
+             self.source = "bytes"
+             self.doc_bytes = BytesIO(source)
+         else:
+             self.source = source
+             self.doc_bytes = self._load_doc_as_bytesio()
 
      @staticmethod
-     def _document_type(source: str) -> DocumentType:
+     def _document_type(
+         source: str | bytes, doc_type: str | DocumentType | None = None
+     ) -> DocumentType:
          """
          Determine the type of document based on the source.
 
          Args:
-             source (str): The source of the PDF, either a URL or a file path.
+             source (str|bytes): The source, which could be a URL,
+                 a file path, or a bytes object.
+             doc_type (str|DocumentType|None): The type of document, if known.
 
          Returns:
              str: The document type.
          """
-         if source.lower().endswith(".pdf"):
-             return DocumentType.PDF
-         elif source.lower().endswith(".docx"):
-             return DocumentType.DOCX
-         elif source.lower().endswith(".doc"):
-             return DocumentType.DOC
+         if isinstance(doc_type, DocumentType):
+             return doc_type
+         if doc_type:
+             return DocumentType(doc_type.lower())
+         if is_plain_text(source):
+             return DocumentType.TXT
+         if isinstance(source, str):
+             # detect file type from path extension
+             if source.lower().endswith(".pdf"):
+                 return DocumentType.PDF
+             elif source.lower().endswith(".docx"):
+                 return DocumentType.DOCX
+             elif source.lower().endswith(".doc"):
+                 return DocumentType.DOC
+             else:
+                 raise ValueError(f"Unsupported document type: {source}")
          else:
-             raise ValueError(f"Unsupported document type: {source}")
+             # must be bytes: attempt to detect type from content
+             # using magic mime type detection
+             import magic
+
+             mime_type = magic.from_buffer(source, mime=True)
+             if mime_type == "application/pdf":
+                 return DocumentType.PDF
+             elif mime_type in [
+                 "application/vnd.openxmlformats-officedocument"
+                 ".wordprocessingml.document",
+                 "application/zip",
+             ]:
+                 # DOCX files are essentially ZIP files,
+                 # but this might catch other ZIP-based formats too!
+                 return DocumentType.DOCX
+             elif mime_type == "application/msword":
+                 return DocumentType.DOC
+             else:
+                 raise ValueError("Unsupported document type from bytes")
 
      def _load_doc_as_bytesio(self) -> BytesIO:
          """
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
              with open(self.source, "rb") as f:
                  return BytesIO(f.read())
 
+     @staticmethod
+     def chunks_from_path_or_bytes(
+         source: str | bytes,
+         parser: Parser,
+         doc_type: str | DocumentType | None = None,
+         lines: int | None = None,
+     ) -> List[Document]:
+         """
+         Get document chunks from a file path or bytes object.
+         Args:
+             source (str|bytes): The source, which could be a URL, path or bytes object.
+             parser (Parser): The parser instance (for splitting the document).
+             doc_type (str|DocumentType|None): The type of document, if known.
+             lines (int|None): The number of lines to read from a plain text file.
+         Returns:
+             List[Document]: A list of `Document` objects,
+             each containing a chunk of text, determined by the
+             chunking and splitting settings in the parser config.
+         """
+         dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+         if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+             doc_parser = DocumentParser.create(
+                 source,
+                 parser.config,
+                 doc_type=doc_type,
+             )
+             chunks = doc_parser.get_doc_chunks()
+             if len(chunks) == 0 and dtype == DocumentType.PDF:
+                 doc_parser = ImagePdfParser(source, parser.config)
+                 chunks = doc_parser.get_doc_chunks()
+             return chunks
+         else:
+             # try getting as plain text; these will be chunked downstream
+             # -- could be a bytes object or a path
+             if isinstance(source, bytes):
+                 content = source.decode()
+                 if lines is not None:
+                     file_lines = content.splitlines()[:lines]
+                     content = "\n".join(line.strip() for line in file_lines)
+             else:
+                 with open(source, "r") as f:
+                     if lines is not None:
+                         file_lines = list(itertools.islice(f, lines))
+                         content = "\n".join(line.strip() for line in file_lines)
+                     else:
+                         content = f.read()
+             soup = BeautifulSoup(content, "html.parser")
+             text = soup.get_text()
+             source_name = source if isinstance(source, str) else "bytes"
+             doc = Document(
+                 content=text,
+                 metadata=DocMetaData(source=str(source_name)),
+             )
+             return parser.split([doc])
+
      def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
          """Yield each page in the PDF."""
          raise NotImplementedError
@@ -145,7 +268,7 @@ class DocumentParser(Parser):
 
      def get_doc(self) -> Document:
          """
-         Get entire text from pdf source as a single document.
+         Get entire text from source as a single document.
          Returns:
              a `Document` object containing the content of the pdf file,
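
The new chunks_from_path_or_bytes helper routes PDF/DOC/DOCX sources to a DocumentParser and otherwise falls back to plain-text handling. A rough sketch of calling it directly follows; notes.txt is a hypothetical local text file and the ParsingConfig defaults are assumed to be acceptable as-is.

    # Sketch only: bytes that decode as UTF-8 are detected as TXT and split
    # according to the parser's chunking settings.
    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig

    parser = Parser(ParsingConfig())
    with open("notes.txt", "rb") as f:
        raw = f.read()
    chunks = DocumentParser.chunks_from_path_or_bytes(raw, parser, lines=100)
    print(len(chunks), "chunks")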
{langroid-0.1.217 → langroid-0.1.219}/langroid/parsing/parse_json.py

@@ -1,7 +1,7 @@
  import json
- import re
  from typing import Any, Iterator, List
 
+ import yaml
  from pyparsing import nestedExpr, originalTextFor
 
 
@@ -45,37 +45,31 @@ def get_json_candidates(s: str) -> List[str]:
      return []
 
 
- def replace_undefined(s: str, undefined_placeholder: str = '"<undefined>"') -> str:
+ def add_quotes(s: str) -> str:
      """
-     Replace undefined values in a potential json str with a placeholder.
+     Replace accidentally un-quoted string-like keys and values in a potential json str.
+     Intended to handle cases where a weak LLM may produce a JSON-like string
+     containing, e.g. "rent": DO-NOT-KNOW, where it "forgot" to put quotes on the value,
+     or city: "New York" where it "forgot" to put quotes on the key.
+     It will even handle cases like 'address: do not know'.
+
+     Got this fiendishly clever solution from
+     https://stackoverflow.com/a/66053900/10940584
+     Far better/safer than trying to do it with regexes.
 
      Args:
      - s (str): The potential JSON string to parse.
-     - undefined_placeholder (str): The placeholder or error message
-         for undefined values.
 
      Returns:
-     - str: The (potential) JSON string with undefined values
-         replaced by the placeholder.
+     - str: The (potential) JSON string with un-quoted string-like values
+         replaced by quoted values.
      """
-
-     # Preprocess the string to replace undefined values with the placeholder
-     # This regex looks for patterns like ": <identifier>" and replaces them
-     # with the placeholder.
-     # It's a simple approach and might need adjustments for complex cases
-     # This is an attempt to handle cases where a weak LLM may produce
-     # a JSON-like string without quotes around some values, e.g.
-     # {"rent": DO-NOT-KNOW }
-     preprocessed_s = re.sub(
-         r":\s*([a-zA-Z_][a-zA-Z_0-9\-]*)", f": {undefined_placeholder}", s
-     )
-
-     # Now, attempt to parse the preprocessed string as JSON
+     if is_valid_json(s):
+         return s
      try:
-         return preprocessed_s
+         dct = yaml.load(s, yaml.SafeLoader)
+         return json.dumps(dct)
      except Exception:
-         # If parsing fails, return an error message instead
-         # (this should be rare after preprocessing)
          return s
 
 
@@ -115,7 +109,7 @@ def extract_top_level_json(s: str) -> List[str]:
          candidate.replace("\\{", "{").replace("\\}", "}").replace("\\_", "_")
          for candidate in json_candidates
      ]
-     candidates = [replace_undefined(candidate) for candidate in normalized_candidates]
+     candidates = [add_quotes(candidate) for candidate in normalized_candidates]
      candidates = [repair_newlines(candidate) for candidate in candidates]
      top_level_jsons = [
          candidate for candidate in candidates if is_valid_json(candidate)
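
The replacement of the regex-based replace_undefined with add_quotes relies on YAML tolerating unquoted scalars: parsing the malformed string with yaml.SafeLoader and re-serializing with json.dumps quotes the stray tokens. A small sketch of that mechanism (the example string is illustrative, not taken from the diff):

    # Demonstrates the yaml -> json round-trip that add_quotes() now uses.
    import json
    import yaml

    s = '{"rent": DO-NOT-KNOW, city: "New York"}'  # LLM "forgot" some quotes
    dct = yaml.load(s, yaml.SafeLoader)             # unquoted tokens parse as plain strings
    print(json.dumps(dct))  # expected: {"rent": "DO-NOT-KNOW", "city": "New York"}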