langroid 0.1.218__tar.gz → 0.1.219__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. {langroid-0.1.218 → langroid-0.1.219}/PKG-INFO +2 -2
  2. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py +54 -25
  3. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/document_parser.py +145 -22
  4. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/repo_loader.py +69 -49
  5. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/urls.py +18 -9
  6. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/utils.py +27 -9
  7. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/system.py +1 -1
  8. {langroid-0.1.218 → langroid-0.1.219}/pyproject.toml +2 -2
  9. {langroid-0.1.218 → langroid-0.1.219}/LICENSE +0 -0
  10. {langroid-0.1.218 → langroid-0.1.219}/README.md +0 -0
  11. {langroid-0.1.218 → langroid-0.1.219}/langroid/__init__.py +0 -0
  12. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/__init__.py +0 -0
  13. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/base.py +0 -0
  14. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/batch.py +0 -0
  15. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/callbacks/__init__.py +0 -0
  16. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/callbacks/chainlit.py +0 -0
  17. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/chat_agent.py +0 -0
  18. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/chat_document.py +0 -0
  19. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/helpers.py +0 -0
  20. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/junk +0 -0
  21. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/openai_assistant.py +0 -0
  22. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/__init__.py +0 -0
  23. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  24. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_rag/__init__.py +0 -0
  25. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  26. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  27. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  28. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/lance_tools.py +0 -0
  29. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/neo4j/__init__.py +0 -0
  30. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  31. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  32. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/__init__.py +0 -0
  33. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/neo4j/utils/system_message.py +0 -0
  34. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  35. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/retriever_agent.py +0 -0
  36. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/__init__.py +0 -0
  37. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  38. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/utils/__init__.py +0 -0
  39. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  40. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  41. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/utils/system_message.py +0 -0
  42. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/sql/utils/tools.py +0 -0
  43. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/table_chat_agent.py +0 -0
  44. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/task.py +0 -0
  45. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tool_message.py +0 -0
  46. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/__init__.py +0 -0
  47. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  48. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/extract_tool.py +0 -0
  49. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/generator_tool.py +0 -0
  50. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/google_search_tool.py +0 -0
  51. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  52. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/recipient_tool.py +0 -0
  53. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/run_python_code.py +0 -0
  54. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/sciphi_search_rag_tool.py +0 -0
  55. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent/tools/segment_extract_tool.py +0 -0
  56. {langroid-0.1.218 → langroid-0.1.219}/langroid/agent_config.py +0 -0
  57. {langroid-0.1.218 → langroid-0.1.219}/langroid/cachedb/__init__.py +0 -0
  58. {langroid-0.1.218 → langroid-0.1.219}/langroid/cachedb/base.py +0 -0
  59. {langroid-0.1.218 → langroid-0.1.219}/langroid/cachedb/momento_cachedb.py +0 -0
  60. {langroid-0.1.218 → langroid-0.1.219}/langroid/cachedb/redis_cachedb.py +0 -0
  61. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/__init__.py +0 -0
  62. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/base.py +0 -0
  63. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/clustering.py +0 -0
  64. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/models.py +0 -0
  65. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  66. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  67. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  68. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  69. {langroid-0.1.218 → langroid-0.1.219}/langroid/embedding_models/remote_embeds.py +0 -0
  70. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/__init__.py +0 -0
  71. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/azure_openai.py +0 -0
  72. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/base.py +0 -0
  73. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/config.py +0 -0
  74. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/openai_assistants.py +0 -0
  75. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/openai_gpt.py +0 -0
  76. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  77. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/prompt_formatter/base.py +0 -0
  78. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  79. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  80. {langroid-0.1.218 → langroid-0.1.219}/langroid/language_models/utils.py +0 -0
  81. {langroid-0.1.218 → langroid-0.1.219}/langroid/mytypes.py +0 -0
  82. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/__init__.py +0 -0
  83. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/agent_chats.py +0 -0
  84. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/code-parsing.md +0 -0
  85. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/code_parser.py +0 -0
  86. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/config.py +0 -0
  87. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/image_text.py +0 -0
  88. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/para_sentence_split.py +0 -0
  89. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/parse_json.py +0 -0
  90. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/parser.py +0 -0
  91. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/search.py +0 -0
  92. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/spider.py +0 -0
  93. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/table_loader.py +0 -0
  94. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/url_loader.py +0 -0
  95. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/url_loader_cookies.py +0 -0
  96. {langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/web_search.py +0 -0
  97. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/__init__.py +0 -0
  98. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/chat-gpt4-system-prompt.md +0 -0
  99. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/dialog.py +0 -0
  100. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/prompts_config.py +0 -0
  101. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/templates.py +0 -0
  102. {langroid-0.1.218 → langroid-0.1.219}/langroid/prompts/transforms.py +0 -0
  103. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/__init__.py +0 -0
  104. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/algorithms/__init__.py +0 -0
  105. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/algorithms/graph.py +0 -0
  106. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/configuration.py +0 -0
  107. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/constants.py +0 -0
  108. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/docker.py +0 -0
  109. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/globals.py +0 -0
  110. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/llms/__init__.py +0 -0
  111. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/llms/strings.py +0 -0
  112. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/logging.py +0 -0
  113. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/output/__init__.py +0 -0
  114. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/output/printing.py +0 -0
  115. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/output/status.py +0 -0
  116. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/pandas_utils.py +0 -0
  117. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/pydantic_utils.py +0 -0
  118. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/web/__init__.py +0 -0
  119. {langroid-0.1.218 → langroid-0.1.219}/langroid/utils/web/login.py +0 -0
  120. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/__init__.py +0 -0
  121. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/base.py +0 -0
  122. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/chromadb.py +0 -0
  123. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/lancedb.py +0 -0
  124. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/meilisearch.py +0 -0
  125. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/momento.py +0 -0
  126. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/qdrant_cloud.py +0 -0
  127. {langroid-0.1.218 → langroid-0.1.219}/langroid/vector_store/qdrantdb.py +0 -0
{langroid-0.1.218 → langroid-0.1.219}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langroid
-Version: 0.1.218
+Version: 0.1.219
 Summary: Harness LLMs with Multi-Agent Programming
 License: MIT
 Author: Prasad Chalasani
@@ -85,7 +85,7 @@ Requires-Dist: pytest-redis (>=3.0.2,<4.0.0)
 Requires-Dist: python-docx (>=1.1.0,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: python-socketio (>=5.11.0,<6.0.0) ; extra == "chainlit"
-Requires-Dist: qdrant-client (>=1.7.0,<2.0.0)
+Requires-Dist: qdrant-client (>=1.8.0,<2.0.0)
 Requires-Dist: rank-bm25 (>=0.2.2,<0.3.0)
 Requires-Dist: redis (>=5.0.1,<6.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)
{langroid-0.1.218 → langroid-0.1.219}/langroid/agent/special/doc_chat_agent.py
@@ -35,6 +35,7 @@ from langroid.embedding_models.models import OpenAIEmbeddingsConfig
 from langroid.language_models.base import StreamingIfAllowed
 from langroid.language_models.openai_gpt import OpenAIChatModel, OpenAIGPTConfig
 from langroid.mytypes import DocMetaData, Document, Entity
+from langroid.parsing.document_parser import DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig, PdfParsingConfig, Splitter
 from langroid.parsing.repo_loader import RepoLoader
 from langroid.parsing.search import (
@@ -44,7 +45,7 @@ from langroid.parsing.search import (
 )
 from langroid.parsing.table_loader import describe_dataframe
 from langroid.parsing.url_loader import URLLoader
-from langroid.parsing.urls import get_list_from_user, get_urls_and_paths
+from langroid.parsing.urls import get_list_from_user, get_urls_paths_bytes_indices
 from langroid.parsing.utils import batched
 from langroid.prompts.prompts_config import PromptsConfig
 from langroid.prompts.templates import SUMMARY_ANSWER_PROMPT_GPT4
@@ -126,7 +127,7 @@ class DocChatAgentConfig(ChatAgentConfig):
             llm=None  # use the parent's llm unless explicitly set here
         )
     )
-    doc_paths: List[str] = []
+    doc_paths: List[str | bytes] = []
     default_paths: List[str] = [
         "https://news.ycombinator.com/item?id=35629033",
         "https://www.newyorker.com/tech/annals-of-technology/chatgpt-is-a-blurry-jpeg-of-the-web",
@@ -248,62 +249,84 @@ class DocChatAgent(ChatAgent):
                 raise ValueError("VecDB not set")
             self.setup_documents(filter=self.config.filter)
             return
-        self.ingest_doc_paths(self.config.doc_paths)
+        self.ingest_doc_paths(self.config.doc_paths)  # type: ignore

     def ingest_doc_paths(
         self,
-        paths: List[str],
+        paths: str | bytes | List[str | bytes],
         metadata: (
             List[Dict[str, Any]] | Dict[str, Any] | DocMetaData | List[DocMetaData]
         ) = [],
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """Split, ingest docs from specified paths,
         do not add these to config.doc_paths.

         Args:
-            paths: List of file/folder paths or URLs
+            paths: document paths, urls or byte-content of docs.
+                The bytes option is intended to support cases where a document
+                has already been read in as bytes (e.g. from an API or a database),
+                and we want to avoid having to write it to a temporary file
+                just to read it back in.
             metadata: List of metadata dicts, one for each path.
                 If a single dict is passed in, it is used for all paths.
+            doc_type: DocumentType to use for parsing, if known.
+                MUST apply to all docs if specified.
+                This is especially useful when the `paths` are of bytes type,
+                to help with document type detection.
         Returns:
             List of Document objects
         """
+        if isinstance(paths, str) or isinstance(paths, bytes):
+            paths = [paths]
         all_paths = paths
-        paths_meta: Dict[str, Any] = {}
-        urls_meta: Dict[str, Any] = {}
-        urls, paths = get_urls_and_paths(paths)
+        paths_meta: Dict[int, Any] = {}
+        urls_meta: Dict[int, Any] = {}
+        idxs = range(len(all_paths))
+        url_idxs, path_idxs, bytes_idxs = get_urls_paths_bytes_indices(all_paths)
+        urls = [all_paths[i] for i in url_idxs]
+        paths = [all_paths[i] for i in path_idxs]
+        bytes_list = [all_paths[i] for i in bytes_idxs]
+        path_idxs.extend(bytes_idxs)
+        paths.extend(bytes_list)
         if (isinstance(metadata, list) and len(metadata) > 0) or not isinstance(
             metadata, list
         ):
             if isinstance(metadata, list):
-                path2meta = {
+                idx2meta = {
                     p: (
                         m
                         if isinstance(m, dict)
                         else (isinstance(m, DocMetaData) and m.dict())
                     )  # appease mypy
-                    for p, m in zip(all_paths, metadata)
+                    for p, m in zip(idxs, metadata)
                 }
             elif isinstance(metadata, dict):
-                path2meta = {p: metadata for p in all_paths}
+                idx2meta = {p: metadata for p in idxs}
             else:
-                path2meta = {p: metadata.dict() for p in all_paths}
-            urls_meta = {u: path2meta[u] for u in urls}
-            paths_meta = {p: path2meta[p] for p in paths}
+                idx2meta = {p: metadata.dict() for p in idxs}
+            urls_meta = {u: idx2meta[u] for u in url_idxs}
+            paths_meta = {p: idx2meta[p] for p in path_idxs}
         docs: List[Document] = []
         parser = Parser(self.config.parsing)
         if len(urls) > 0:
-            for u in urls:
-                meta = urls_meta.get(u, {})
-                loader = URLLoader(urls=[u], parser=parser)
+            for ui in url_idxs:
+                meta = urls_meta.get(ui, {})
+                loader = URLLoader(urls=[all_paths[ui]], parser=parser)  # type: ignore
                 url_docs = loader.load()
                 # update metadata of each doc with meta
                 for d in url_docs:
                     d.metadata = d.metadata.copy(update=meta)
                 docs.extend(url_docs)
-        if len(paths) > 0:
-            for p in paths:
-                meta = paths_meta.get(p, {})
-                path_docs = RepoLoader.get_documents(p, parser=parser)
+        if len(paths) > 0:  # paths OR bytes are handled similarly
+            for pi in path_idxs:
+                meta = paths_meta.get(pi, {})
+                p = all_paths[pi]
+                path_docs = RepoLoader.get_documents(
+                    p,
+                    parser=parser,
+                    doc_type=doc_type,
+                )
                 # update metadata of each doc with meta
                 for d in path_docs:
                     d.metadata = d.metadata.copy(update=meta)
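The reworked `ingest_doc_paths` classifies inputs by index (URL / path / bytes) so positional metadata stays aligned with the original input order, and forwards the optional `doc_type` hint to `RepoLoader.get_documents`. A sketch of calling it directly with byte content, assuming an existing `DocChatAgent` instance `agent` (the file name and metadata keys are illustrative):

    # Sketch: ingesting a DOCX that was already read into memory.
    with open("contract.docx", "rb") as f:  # hypothetical file
        docx_bytes = f.read()

    docs = agent.ingest_doc_paths(
        docx_bytes,                        # a bare str/bytes is wrapped into a list
        metadata={"source_tag": "legal"},  # a single dict applies to every input
        doc_type="docx",                   # skips extension/mime sniffing
    )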
@@ -317,11 +340,12 @@ class DocChatAgent(ChatAgent):
         print(
             f"""
         [green]I have processed the following {n_urls} URLs
-        and {n_paths} paths into {n_splits} parts:
+        and {n_paths} docs into {n_splits} parts:
         """.strip()
         )
-        print("\n".join(urls))
-        print("\n".join(paths))
+        path_reps = [p if isinstance(p, str) else "bytes" for p in paths]
+        print("\n".join([u for u in urls if isinstance(u, str)]))  # appease mypy
+        print("\n".join(path_reps))
         return docs

     def ingest_docs(
@@ -388,6 +412,7 @@ class DocChatAgent(ChatAgent):
                 + ",content="
                 + d.content
             )
+        docs = docs[: self.config.parsing.max_chunks]
         # add embeddings in batches, to stay under limit of embeddings API
         batches = list(batched(docs, self.config.embed_batch_size))
         for batch in batches:
@@ -463,6 +488,10 @@ class DocChatAgent(ChatAgent):
             d.metadata.is_chunk = True
         return self.ingest_docs(docs)

+    def set_filter(self, filter: str) -> None:
+        self.config.filter = filter
+        self.setup_documents(filter=filter)
+
     def setup_documents(
         self,
         docs: List[Document] = [],
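The new `set_filter` is a small convenience: it stores the filter on the config and re-runs `setup_documents` so subsequent retrievals are restricted. A sketch, assuming an existing `DocChatAgent` instance `agent` (the filter expression syntax depends on the underlying vector store; this string is purely illustrative):

    # Sketch: narrow retrieval to previously-ingested docs matching a filter.
    agent.set_filter("metadata.source_tag == 'legal'")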
@@ -609,7 +638,7 @@ class DocChatAgent(ChatAgent):
         if len(inputs) == 0:
             if is_new_collection:
                 inputs = self.config.default_paths
-        self.config.doc_paths = inputs
+        self.config.doc_paths = inputs  # type: ignore
         self.ingest()

     def llm_response(
{langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/document_parser.py
@@ -1,3 +1,4 @@
+import itertools
 import logging
 import re
 from enum import Enum
@@ -8,6 +9,7 @@ import fitz
 import pdfplumber
 import pypdf
 import requests
+from bs4 import BeautifulSoup
 from PIL import Image

 from langroid.mytypes import DocMetaData, Document
@@ -20,6 +22,29 @@ class DocumentType(str, Enum):
     PDF = "pdf"
     DOCX = "docx"
     DOC = "doc"
+    TXT = "txt"
+
+
+def is_plain_text(path_or_bytes: str | bytes) -> bool:
+    if isinstance(path_or_bytes, str):
+        if path_or_bytes.startswith(("http://", "https://")):
+            response = requests.get(path_or_bytes)
+            response.raise_for_status()
+            content = response.content[:1024]
+        else:
+            with open(path_or_bytes, "rb") as f:
+                content = f.read(1024)
+    else:
+        content = path_or_bytes[:1024]
+    try:
+        # Attempt to decode the content as UTF-8
+        _ = content.decode("utf-8")
+        # Additional checks can go here, e.g., to verify that the content
+        # doesn't contain too many unusual characters for it to be considered text
+        return True
+    except UnicodeDecodeError:
+        # If decoding fails, it's likely not plain text (or not encoded in UTF-8)
+        return False


 class DocumentParser(Parser):
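`is_plain_text` decides TXT-ness by probing the first 1024 bytes with a UTF-8 decode, so any cleanly-decodable prefix counts as text (note that truncating at 1024 bytes can split a multi-byte character and mis-flag genuine UTF-8 text). A quick illustration of how the probe behaves, with byte literals chosen for the example:

    from langroid.parsing.document_parser import is_plain_text

    # 0xFF can never occur in valid UTF-8, so binary headers fail the probe.
    assert is_plain_text(b"hello, world\n")        # decodes cleanly -> True
    assert not is_plain_text(b"%PDF-1.7\xff\xfe")  # decode error -> False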
@@ -33,19 +58,26 @@ class DocumentParser(Parser):
     """

     @classmethod
-    def create(cls, source: str, config: ParsingConfig) -> "DocumentParser":
+    def create(
+        cls,
+        source: str | bytes,
+        config: ParsingConfig,
+        doc_type: str | DocumentType | None = None,
+    ) -> "DocumentParser":
         """
         Create a DocumentParser instance based on source type
         and config.<source_type>.library specified.

         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, could be a URL, file path,
+                or bytes object.
             config (ParserConfig): The parser configuration.
+            doc_type (str|None): The type of document, if known

         Returns:
             DocumentParser: An instance of a DocumentParser subclass.
         """
-        if DocumentParser._document_type(source) == DocumentType.PDF:
+        if DocumentParser._document_type(source, doc_type) == DocumentType.PDF:
             if config.pdf.library == "fitz":
                 return FitzPDFParser(source, config)
             elif config.pdf.library == "pypdf":
@@ -60,7 +92,7 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported PDF library specified: {config.pdf.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOCX:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOCX:
             if config.docx.library == "unstructured":
                 return UnstructuredDocxParser(source, config)
             elif config.docx.library == "python-docx":
@@ -69,42 +101,78 @@ class DocumentParser(Parser):
             raise ValueError(
                 f"Unsupported DOCX library specified: {config.docx.library}"
             )
-        elif DocumentParser._document_type(source) == DocumentType.DOC:
+        elif DocumentParser._document_type(source, doc_type) == DocumentType.DOC:
             return UnstructuredDocParser(source, config)
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            source_name = source if isinstance(source, str) else "bytes"
+            raise ValueError(f"Unsupported document type: {source_name}")

-    def __init__(self, source: str, config: ParsingConfig):
+    def __init__(self, source: str | bytes, config: ParsingConfig):
         """
-        Initialize the PDFParser.
-
         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be
+                a path, a URL or a bytes object.
         """
         super().__init__(config)
-        self.source = source
         self.config = config
-        self.doc_bytes = self._load_doc_as_bytesio()
+        if isinstance(source, bytes):
+            self.source = "bytes"
+            self.doc_bytes = BytesIO(source)
+        else:
+            self.source = source
+            self.doc_bytes = self._load_doc_as_bytesio()

     @staticmethod
-    def _document_type(source: str) -> DocumentType:
+    def _document_type(
+        source: str | bytes, doc_type: str | DocumentType | None = None
+    ) -> DocumentType:
         """
         Determine the type of document based on the source.

         Args:
-            source (str): The source of the PDF, either a URL or a file path.
+            source (str|bytes): The source, which could be a URL,
+                a file path, or a bytes object.
+            doc_type (str|DocumentType|None): The type of document, if known.

         Returns:
             str: The document type.
         """
-        if source.lower().endswith(".pdf"):
-            return DocumentType.PDF
-        elif source.lower().endswith(".docx"):
-            return DocumentType.DOCX
-        elif source.lower().endswith(".doc"):
-            return DocumentType.DOC
+        if isinstance(doc_type, DocumentType):
+            return doc_type
+        if doc_type:
+            return DocumentType(doc_type.lower())
+        if is_plain_text(source):
+            return DocumentType.TXT
+        if isinstance(source, str):
+            # detect file type from path extension
+            if source.lower().endswith(".pdf"):
+                return DocumentType.PDF
+            elif source.lower().endswith(".docx"):
+                return DocumentType.DOCX
+            elif source.lower().endswith(".doc"):
+                return DocumentType.DOC
+            else:
+                raise ValueError(f"Unsupported document type: {source}")
         else:
-            raise ValueError(f"Unsupported document type: {source}")
+            # must be bytes: attempt to detect type from content
+            # using magic mime type detection
+            import magic
+
+            mime_type = magic.from_buffer(source, mime=True)
+            if mime_type == "application/pdf":
+                return DocumentType.PDF
+            elif mime_type in [
+                "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document",
+                "application/zip",
+            ]:
+                # DOCX files are essentially ZIP files,
+                # but this might catch other ZIP-based formats too!
+                return DocumentType.DOCX
+            elif mime_type == "application/msword":
+                return DocumentType.DOC
+            else:
+                raise ValueError("Unsupported document type from bytes")

     def _load_doc_as_bytesio(self) -> BytesIO:
         """
@@ -121,6 +189,61 @@ class DocumentParser(Parser):
         with open(self.source, "rb") as f:
             return BytesIO(f.read())

+    @staticmethod
+    def chunks_from_path_or_bytes(
+        source: str | bytes,
+        parser: Parser,
+        doc_type: str | DocumentType | None = None,
+        lines: int | None = None,
+    ) -> List[Document]:
+        """
+        Get document chunks from a file path or bytes object.
+        Args:
+            source (str|bytes): The source, which could be a URL, path or bytes object.
+            parser (Parser): The parser instance (for splitting the document).
+            doc_type (str|DocumentType|None): The type of document, if known.
+            lines (int|None): The number of lines to read from a plain text file.
+        Returns:
+            List[Document]: A list of `Document` objects,
+                each containing a chunk of text, determined by the
+                chunking and splitting settings in the parser config.
+        """
+        dtype: DocumentType = DocumentParser._document_type(source, doc_type)
+        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            doc_parser = DocumentParser.create(
+                source,
+                parser.config,
+                doc_type=doc_type,
+            )
+            chunks = doc_parser.get_doc_chunks()
+            if len(chunks) == 0 and dtype == DocumentType.PDF:
+                doc_parser = ImagePdfParser(source, parser.config)
+                chunks = doc_parser.get_doc_chunks()
+            return chunks
+        else:
+            # try getting as plain text; these will be chunked downstream
+            # -- could be a bytes object or a path
+            if isinstance(source, bytes):
+                content = source.decode()
+                if lines is not None:
+                    file_lines = content.splitlines()[:lines]
+                    content = "\n".join(line.strip() for line in file_lines)
+            else:
+                with open(source, "r") as f:
+                    if lines is not None:
+                        file_lines = list(itertools.islice(f, lines))
+                        content = "\n".join(line.strip() for line in file_lines)
+                    else:
+                        content = f.read()
+            soup = BeautifulSoup(content, "html.parser")
+            text = soup.get_text()
+            source_name = source if isinstance(source, str) else "bytes"
+            doc = Document(
+                content=text,
+                metadata=DocMetaData(source=str(source_name)),
+            )
+            return parser.split([doc])
+
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         """Yield each page in the PDF."""
         raise NotImplementedError
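`chunks_from_path_or_bytes` centralizes the per-file logic that `RepoLoader.get_documents` used to inline: structured types go through a `DocumentParser` (with an image-PDF fallback when a PDF yields no chunks), and everything else is treated as plain text and split by the parser. A sketch of direct use (the path is hypothetical):

    # Sketch: chunking one file without going through RepoLoader.
    from langroid.parsing.document_parser import DocumentParser
    from langroid.parsing.parser import Parser, ParsingConfig

    parser = Parser(ParsingConfig())
    chunks = DocumentParser.chunks_from_path_or_bytes(
        "notes/summary.txt",  # hypothetical plain-text file
        parser,
        lines=100,            # read at most the first 100 lines
    )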
@@ -145,7 +268,7 @@ class DocumentParser(Parser):

     def get_doc(self) -> Document:
         """
-        Get entire text from pdf source as a single document.
+        Get entire text from source as a single document.

         Returns:
             a `Document` object containing the content of the pdf file,
{langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/repo_loader.py
@@ -10,7 +10,6 @@ from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlparse

-from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from github import Github
 from github.ContentFile import ContentFile
@@ -19,7 +18,7 @@ from github.Repository import Repository
 from pydantic import BaseModel, BaseSettings, Field

 from langroid.mytypes import DocMetaData, Document
-from langroid.parsing.document_parser import DocumentParser, ImagePdfParser
+from langroid.parsing.document_parser import DocumentParser, DocumentType
 from langroid.parsing.parser import Parser, ParsingConfig

 logger = logging.getLogger(__name__)
@@ -491,18 +490,25 @@ class RepoLoader:

     @staticmethod
     def get_documents(
-        path: str,
+        path: str | bytes,
         parser: Parser = Parser(ParsingConfig()),
         file_types: Optional[List[str]] = None,
         exclude_dirs: Optional[List[str]] = None,
         depth: int = -1,
         lines: Optional[int] = None,
+        doc_type: str | DocumentType | None = None,
     ) -> List[Document]:
         """
         Recursively get all files under a path as Document objects.

         Args:
-            path (str): The path to the directory or file.
+            path (str|bytes): The path to the directory or file, or bytes content.
+                The bytes option is meant to support the case where the content
+                has already been read from a file in an upstream process
+                (e.g. from an API or a database), and we want to avoid having to
+                write it to a temporary file just to read it again
+                (which can be very slow for large files,
+                especially in a docker container).
             parser (Parser): Parser to use to parse files.
             file_types (List[str], optional): List of file extensions OR
                 filenames OR file_path_names to include.
@@ -513,6 +519,7 @@ class RepoLoader:
                 which includes all depths.
             lines (int, optional): Number of lines to read from each file.
                 Defaults to None, which reads all lines.
+            doc_type (str|DocumentType, optional): The type of document to parse.

         Returns:
             List[Document]: List of Document objects representing files.
@@ -520,56 +527,69 @@ class RepoLoader:
         """
         docs = []
         file_paths = []
-        path_obj = Path(path).resolve()
-
-        if path_obj.is_file():
-            file_paths.append(str(path_obj))
+        if isinstance(path, bytes):
+            file_paths.append(path)
         else:
-            path_depth = len(path_obj.parts)
-            for root, dirs, files in os.walk(path):
-                # Exclude directories if needed
-                if exclude_dirs:
-                    dirs[:] = [d for d in dirs if d not in exclude_dirs]
-
-                current_depth = len(Path(root).resolve().parts) - path_depth
-                if depth == -1 or current_depth <= depth:
-                    for file in files:
-                        file_path = str(Path(root) / file)
-                        if (
-                            file_types is None
-                            or RepoLoader._file_type(file_path) in file_types
-                            or os.path.basename(file_path) in file_types
-                            or file_path in file_types
-                        ):
-                            file_paths.append(file_path)
+            path_obj = Path(path).resolve()
+
+            if path_obj.is_file():
+                file_paths.append(str(path_obj))
+            else:
+                path_depth = len(path_obj.parts)
+                for root, dirs, files in os.walk(path):
+                    # Exclude directories if needed
+                    if exclude_dirs:
+                        dirs[:] = [d for d in dirs if d not in exclude_dirs]
+
+                    current_depth = len(Path(root).resolve().parts) - path_depth
+                    if depth == -1 or current_depth <= depth:
+                        for file in files:
+                            file_path = str(Path(root) / file)
+                            if (
+                                file_types is None
+                                or RepoLoader._file_type(file_path) in file_types
+                                or os.path.basename(file_path) in file_types
+                                or file_path in file_types
+                            ):
+                                file_paths.append(file_path)

         for file_path in file_paths:
-            _, file_extension = os.path.splitext(file_path)
-            if file_extension.lower() in [".pdf", ".docx", ".doc"]:
-                doc_parser = DocumentParser.create(
+            docs.extend(
+                DocumentParser.chunks_from_path_or_bytes(
                     file_path,
-                    parser.config,
-                )
-                new_chunks = doc_parser.get_doc_chunks()
-                if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
-                    doc_parser = ImagePdfParser(file_path, parser.config)
-                    new_chunks = doc_parser.get_doc_chunks()
-                docs.extend(new_chunks)
-            else:
-                with open(file_path, "r") as f:
-                    if lines is not None:
-                        file_lines = list(itertools.islice(f, lines))
-                        content = "\n".join(line.strip() for line in file_lines)
-                    else:
-                        content = f.read()
-                soup = BeautifulSoup(content, "html.parser")
-                text = soup.get_text()
-                docs.append(
-                    Document(
-                        content=text,
-                        metadata=DocMetaData(source=str(file_path)),
-                    )
+                    parser,
+                    doc_type=doc_type,
+                    lines=lines,
                 )
+            )
+            # dtype: DocumentType = DocumentParser._document_type(file_path, doc_type)
+            # if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+            #     doc_parser = DocumentParser.create(
+            #         file_path,
+            #         parser.config,
+            #         doc_type=doc_type,
+            #     )
+            #     new_chunks = doc_parser.get_doc_chunks()
+            #     if len(new_chunks) == 0 and file_extension.lower() == ".pdf":
+            #         doc_parser = ImagePdfParser(file_path, parser.config)
+            #         new_chunks = doc_parser.get_doc_chunks()
+            #     docs.extend(new_chunks)
+            # else:
+            #     # try getting as plain text; these will be chunked downstream
+            #     with open(file_path, "r") as f:
+            #         if lines is not None:
+            #             file_lines = list(itertools.islice(f, lines))
+            #             content = "\n".join(line.strip() for line in file_lines)
+            #         else:
+            #             content = f.read()
+            #     soup = BeautifulSoup(content, "html.parser")
+            #     text = soup.get_text()
+            #     docs.append(
+            #         Document(
+            #             content=text,
+            #             metadata=DocMetaData(source=str(file_path)),
+            #         )
+            #     )

         return docs

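With this refactor, `get_documents` keeps its directory-walking behavior for string paths, while bytes input skips the walk entirely and is handed straight to `chunks_from_path_or_bytes`. A sketch of the unchanged directory case (the directory and filters here are illustrative):

    # Sketch: recursive loading with filtering; behavior unchanged for str paths.
    from langroid.parsing.repo_loader import RepoLoader

    docs = RepoLoader.get_documents(
        "docs/",                   # hypothetical directory
        file_types=["md", "txt"],  # extensions OR filenames OR full paths
        exclude_dirs=[".git"],
        depth=2,                   # at most 2 levels below the root
    )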
{langroid-0.1.218 → langroid-0.1.219}/langroid/parsing/urls.py
@@ -112,26 +112,35 @@ def is_url(s: str) -> bool:
         return False


-def get_urls_and_paths(inputs: List[str]) -> Tuple[List[str], List[str]]:
+def get_urls_paths_bytes_indices(
+    inputs: List[str | bytes],
+) -> Tuple[List[int], List[int], List[int]]:
     """
-    Given a list of inputs, return a list of URLs and a list of paths.
+    Given a list of inputs, return a list of indices of URLs,
+    a list of indices of paths, and a list of indices of byte-contents.
     Args:
-        inputs: list of strings
+        inputs: list of strings or bytes
     Returns:
-        list of URLs, list of paths
+        list of indices of URLs,
+        list of indices of paths,
+        list of indices of byte-contents
     """
     urls = []
     paths = []
-    for item in inputs:
+    byte_list = []
+    for i, item in enumerate(inputs):
+        if isinstance(item, bytes):
+            byte_list.append(i)
+            continue
         try:
-            m = Url(url=parse_obj_as(HttpUrl, item))
-            urls.append(str(m.url))
+            Url(url=parse_obj_as(HttpUrl, item))
+            urls.append(i)
         except ValidationError:
             if os.path.exists(item):
-                paths.append(item)
+                paths.append(i)
             else:
                 logger.warning(f"{item} is neither a URL nor a path.")
-    return urls, paths
+    return urls, paths, byte_list


 def crawl_url(url: str, max_urls: int = 1) -> List[str]:
  def crawl_url(url: str, max_urls: int = 1) -> List[str]:
@@ -10,10 +10,11 @@ import nltk
10
10
  from faker import Faker
11
11
 
12
12
  from langroid.mytypes import Document
13
+ from langroid.parsing.document_parser import DocumentType
13
14
  from langroid.parsing.parser import Parser, ParsingConfig
14
15
  from langroid.parsing.repo_loader import RepoLoader
15
16
  from langroid.parsing.url_loader import URLLoader
16
- from langroid.parsing.urls import get_urls_and_paths
17
+ from langroid.parsing.urls import get_urls_paths_bytes_indices
17
18
 
18
19
  Faker.seed(23)
19
20
  random.seed(43)
@@ -314,37 +315,54 @@ def extract_numbered_segments(s: str, specs: str) -> str:


 def extract_content_from_path(
-    path: str | List[str], parsing: ParsingConfig
+    path: bytes | str | List[bytes | str],
+    parsing: ParsingConfig,
+    doc_type: str | DocumentType | None = None,
 ) -> str | List[str]:
     """
     Extract the content from a file path or URL, or a list of file paths or URLs.

     Args:
-        path (str | List[str]): The file path or URL, or a list of file paths or URLs.
+        path (bytes | str | List[str]): The file path or URL, or a list of file
+            paths or URLs, or bytes content. The bytes option is meant to support
+            cases where upstream code may have already loaded the content (e.g.,
+            from a database or API) and we want to avoid having to copy the
+            content to a temporary file.
         parsing (ParsingConfig): The parsing configuration.
+        doc_type (str | DocumentType | None): The document type if known.
+            If multiple paths are given, this MUST apply to ALL docs.

     Returns:
         str | List[str]: The extracted content if a single file path or URL is provided,
             or a list of extracted contents if a
             list of file paths or URLs is provided.
     """
-    if isinstance(path, str):
-        path = [path]
+    if isinstance(path, str) or isinstance(path, bytes):
+        paths = [path]
     elif isinstance(path, list) and len(path) == 0:
         return ""
-    urls, path_list = get_urls_and_paths(path)
+    else:
+        paths = path
+
+    url_idxs, path_idxs, byte_idxs = get_urls_paths_bytes_indices(paths)
+    urls = [paths[i] for i in url_idxs]
+    path_list = [paths[i] for i in path_idxs]
+    byte_list = [paths[i] for i in byte_idxs]
+    path_list.extend(byte_list)
     parser = Parser(parsing)
     docs: List[Document] = []
     try:
         if len(urls) > 0:
-            loader = URLLoader(urls=urls, parser=parser)
+            loader = URLLoader(urls=urls, parser=parser)  # type: ignore
             docs = loader.load()
         if len(path_list) > 0:
             for p in path_list:
-                path_docs = RepoLoader.get_documents(p, parser=parser)
+                path_docs = RepoLoader.get_documents(
+                    p, parser=parser, doc_type=doc_type
+                )
                 docs.extend(path_docs)
     except Exception as e:
-        logger.warning(f"Error loading path {path}: {e}")
+        logger.warning(f"Error loading path {paths}: {e}")
         return ""
     if len(docs) == 1:
         return docs[0].content
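`extract_content_from_path` gains the same bytes-and-doc_type plumbing, so one call can pull text out of mixed sources. A sketch (the file name and doc_type hint are illustrative):

    # Sketch: one-shot extraction; a single input returns str,
    # a list of inputs returns List[str].
    from langroid.parsing.parser import ParsingConfig
    from langroid.parsing.utils import extract_content_from_path

    pdf_bytes = open("paper.pdf", "rb").read()  # hypothetical file
    text = extract_content_from_path(pdf_bytes, ParsingConfig(), doc_type="pdf")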
{langroid-0.1.218 → langroid-0.1.219}/langroid/utils/system.py
@@ -131,7 +131,7 @@ def generate_user_id(org: str = "") -> str:
 def update_hash(hash: str | None = None, s: str = "") -> str:
     """
     Takes a SHA256 hash string and a new string, updates the hash with the new string,
-    and returns the updated hash string along with the original string.
+    and returns the updated hash string.

     Args:
         hash (str): A SHA256 hash string.
{langroid-0.1.218 → langroid-0.1.219}/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langroid"
-version = "0.1.218"
+version = "0.1.219"
 description = "Harness LLMs with Multi-Agent Programming"
 authors = ["Prasad Chalasani <pchalasani@gmail.com>"]
 readme = "README.md"
@@ -47,7 +47,7 @@ types-redis = "^4.5.5.2"
 types-requests = "^2.31.0.1"
 pyparsing = "^3.0.9"
 nltk = "^3.8.1"
-qdrant-client = "^1.7.0"
+qdrant-client = "^1.8.0"
 pydantic = "1.10.13"
 pypdf = "^3.12.2"
 momento = "^1.10.2"