langroid 0.47.2__tar.gz → 0.48.1__tar.gz

This diff compares the contents of two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (134)
  1. {langroid-0.47.2 → langroid-0.48.1}/PKG-INFO +5 -3
  2. {langroid-0.47.2 → langroid-0.48.1}/langroid/mytypes.py +2 -0
  3. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/document_parser.py +40 -0
  4. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/parser.py +1 -1
  5. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/url_loader.py +95 -2
  6. {langroid-0.47.2 → langroid-0.48.1}/pyproject.toml +7 -3
  7. {langroid-0.47.2 → langroid-0.48.1}/.gitignore +0 -0
  8. {langroid-0.47.2 → langroid-0.48.1}/LICENSE +0 -0
  9. {langroid-0.47.2 → langroid-0.48.1}/README.md +0 -0
  10. {langroid-0.47.2 → langroid-0.48.1}/langroid/__init__.py +0 -0
  11. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/__init__.py +0 -0
  12. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/base.py +0 -0
  13. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/batch.py +0 -0
  14. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/callbacks/__init__.py +0 -0
  15. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/callbacks/chainlit.py +0 -0
  16. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/chat_agent.py +0 -0
  17. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/chat_document.py +0 -0
  18. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/openai_assistant.py +0 -0
  19. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/__init__.py +0 -0
  20. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/arangodb/__init__.py +0 -0
  21. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
  22. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/arangodb/system_messages.py +0 -0
  23. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/arangodb/tools.py +0 -0
  24. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/arangodb/utils.py +0 -0
  25. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/doc_chat_agent.py +0 -0
  26. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
  27. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_rag/__init__.py +0 -0
  28. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
  29. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
  30. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
  31. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/lance_tools.py +0 -0
  32. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/neo4j/__init__.py +0 -0
  33. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
  34. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
  35. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/neo4j/system_messages.py +0 -0
  36. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/neo4j/tools.py +0 -0
  37. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/relevance_extractor_agent.py +0 -0
  38. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/retriever_agent.py +0 -0
  39. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/__init__.py +0 -0
  40. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
  41. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/utils/__init__.py +0 -0
  42. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
  43. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
  44. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/utils/system_message.py +0 -0
  45. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/sql/utils/tools.py +0 -0
  46. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/special/table_chat_agent.py +0 -0
  47. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/task.py +0 -0
  48. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tool_message.py +0 -0
  49. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/__init__.py +0 -0
  50. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
  51. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/exa_search_tool.py +0 -0
  52. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/file_tools.py +0 -0
  53. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/google_search_tool.py +0 -0
  54. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/metaphor_search_tool.py +0 -0
  55. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/orchestration.py +0 -0
  56. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/recipient_tool.py +0 -0
  57. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/retrieval_tool.py +0 -0
  58. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/rewind_tool.py +0 -0
  59. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/segment_extract_tool.py +0 -0
  60. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/tools/tavily_search_tool.py +0 -0
  61. {langroid-0.47.2 → langroid-0.48.1}/langroid/agent/xml_tool_message.py +0 -0
  62. {langroid-0.47.2 → langroid-0.48.1}/langroid/cachedb/__init__.py +0 -0
  63. {langroid-0.47.2 → langroid-0.48.1}/langroid/cachedb/base.py +0 -0
  64. {langroid-0.47.2 → langroid-0.48.1}/langroid/cachedb/momento_cachedb.py +0 -0
  65. {langroid-0.47.2 → langroid-0.48.1}/langroid/cachedb/redis_cachedb.py +0 -0
  66. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/__init__.py +0 -0
  67. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/base.py +0 -0
  68. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/models.py +0 -0
  69. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/protoc/__init__.py +0 -0
  70. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/protoc/embeddings.proto +0 -0
  71. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
  72. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
  73. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
  74. {langroid-0.47.2 → langroid-0.48.1}/langroid/embedding_models/remote_embeds.py +0 -0
  75. {langroid-0.47.2 → langroid-0.48.1}/langroid/exceptions.py +0 -0
  76. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/__init__.py +0 -0
  77. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/azure_openai.py +0 -0
  78. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/base.py +0 -0
  79. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/config.py +0 -0
  80. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/mock_lm.py +0 -0
  81. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/model_info.py +0 -0
  82. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/openai_gpt.py +0 -0
  83. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/prompt_formatter/__init__.py +0 -0
  84. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/prompt_formatter/base.py +0 -0
  85. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
  86. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
  87. {langroid-0.47.2 → langroid-0.48.1}/langroid/language_models/utils.py +0 -0
  88. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/__init__.py +0 -0
  89. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/agent_chats.py +0 -0
  90. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/code_parser.py +0 -0
  91. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/para_sentence_split.py +0 -0
  92. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/parse_json.py +0 -0
  93. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/pdf_utils.py +0 -0
  94. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/repo_loader.py +0 -0
  95. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/routing.py +0 -0
  96. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/search.py +0 -0
  97. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/spider.py +0 -0
  98. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/table_loader.py +0 -0
  99. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/urls.py +0 -0
  100. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/utils.py +0 -0
  101. {langroid-0.47.2 → langroid-0.48.1}/langroid/parsing/web_search.py +0 -0
  102. {langroid-0.47.2 → langroid-0.48.1}/langroid/prompts/__init__.py +0 -0
  103. {langroid-0.47.2 → langroid-0.48.1}/langroid/prompts/dialog.py +0 -0
  104. {langroid-0.47.2 → langroid-0.48.1}/langroid/prompts/prompts_config.py +0 -0
  105. {langroid-0.47.2 → langroid-0.48.1}/langroid/prompts/templates.py +0 -0
  106. {langroid-0.47.2 → langroid-0.48.1}/langroid/py.typed +0 -0
  107. {langroid-0.47.2 → langroid-0.48.1}/langroid/pydantic_v1/__init__.py +0 -0
  108. {langroid-0.47.2 → langroid-0.48.1}/langroid/pydantic_v1/main.py +0 -0
  109. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/__init__.py +0 -0
  110. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/algorithms/__init__.py +0 -0
  111. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/algorithms/graph.py +0 -0
  112. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/configuration.py +0 -0
  113. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/constants.py +0 -0
  114. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/git_utils.py +0 -0
  115. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/globals.py +0 -0
  116. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/logging.py +0 -0
  117. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/object_registry.py +0 -0
  118. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/output/__init__.py +0 -0
  119. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/output/citations.py +0 -0
  120. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/output/printing.py +0 -0
  121. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/output/status.py +0 -0
  122. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/pandas_utils.py +0 -0
  123. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/pydantic_utils.py +0 -0
  124. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/system.py +0 -0
  125. {langroid-0.47.2 → langroid-0.48.1}/langroid/utils/types.py +0 -0
  126. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/__init__.py +0 -0
  127. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/base.py +0 -0
  128. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/chromadb.py +0 -0
  129. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/lancedb.py +0 -0
  130. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/meilisearch.py +0 -0
  131. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/pineconedb.py +0 -0
  132. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/postgres.py +0 -0
  133. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/qdrantdb.py +0 -0
  134. {langroid-0.47.2 → langroid-0.48.1}/langroid/vector_store/weaviatedb.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.47.2
3
+ Version: 0.48.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -108,7 +108,7 @@ Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
108
108
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
109
109
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
110
110
  Provides-Extra: doc-parsers
111
- Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
111
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'doc-parsers'
112
112
  Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
113
113
  Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
114
114
  Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
@@ -144,6 +144,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
144
144
  Provides-Extra: marker-pdf
145
145
  Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
146
146
  Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
147
+ Provides-Extra: markitdown
148
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'markitdown'
147
149
  Provides-Extra: meilisearch
148
150
  Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
149
151
  Provides-Extra: metaphor
@@ -157,7 +159,7 @@ Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
157
159
  Provides-Extra: pdf-parsers
158
160
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
159
161
  Requires-Dist: marker-pdf; extra == 'pdf-parsers'
160
- Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
162
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'pdf-parsers'
161
163
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
162
164
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
163
165
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
@@ -45,6 +45,8 @@ class DocMetaData(BaseModel):
45
45
 
46
46
  source: str = "context" # just reference
47
47
  source_content: str = "context" # reference and content
48
+ title: str = "unknown"
49
+ published_date: str = "unknown"
48
50
  is_chunk: bool = False # if it is a chunk, don't split
49
51
  id: str = Field(default_factory=lambda: str(uuid4()))
50
52
  window_ids: List[str] = [] # for RAG: ids of chunks around this one
@@ -161,6 +161,8 @@ class DocumentParser(Parser):
161
161
  return UnstructuredDocxParser(source, config)
162
162
  elif config.docx.library == "python-docx":
163
163
  return PythonDocxParser(source, config)
164
+ elif config.docx.library == "markitdown-docx":
165
+ return MarkitdownDocxParser(source, config)
164
166
  else:
165
167
  raise ValueError(
166
168
  f"Unsupported DOCX library specified: {config.docx.library}"
@@ -887,6 +889,44 @@ class PythonDocxParser(DocumentParser):
887
889
  )
888
890
 
889
891
 
892
+ class MarkitdownDocxParser(DocumentParser):
893
+ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
894
+ try:
895
+ from markitdown import MarkItDown
896
+ except ImportError:
897
+ LangroidImportError("markitdown", ["markitdown", "doc-parsers"])
898
+ md = MarkItDown()
899
+ self.doc_bytes.seek(0) # Reset to start
900
+
901
+ # Direct conversion from stream works for DOCX (unlike XLSX)
902
+ result = md.convert_stream(self.doc_bytes, file_extension=".docx")
903
+
904
+ # Split content into logical sections (paragraphs, sections, etc.)
905
+ # This approach differs from the strict page-based approach used for PDFs
906
+ sections = re.split(r"(?=# |\n## |\n### )", result.text_content)
907
+
908
+ # Filter out empty sections
909
+ sections = [section for section in sections if section.strip()]
910
+
911
+ for i, section in enumerate(sections):
912
+ yield i, section
913
+
914
+ def get_document_from_page(self, md_content: str) -> Document:
915
+ """
916
+ Get Document object from a given markdown section.
917
+
918
+ Args:
919
+ md_content (str): The markdown content for the section.
920
+
921
+ Returns:
922
+ Document: Document object, with content and possible metadata.
923
+ """
924
+ return Document(
925
+ content=self.fix_text(md_content),
926
+ metadata=DocMetaData(source=self.source),
927
+ )
928
+
929
+
890
930
  class MarkitdownXLSXParser(DocumentParser):
891
931
  def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
892
932
  try:
@@ -78,7 +78,7 @@ class PdfParsingConfig(BaseParsingConfig):
78
78
 
79
79
 
80
80
  class DocxParsingConfig(BaseSettings):
81
- library: Literal["python-docx", "unstructured"] = "unstructured"
81
+ library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
82
82
 
83
83
 
84
84
  class DocParsingConfig(BaseSettings):
@@ -48,6 +48,15 @@ class FirecrawlConfig(BaseCrawlerConfig):
48
48
  env_prefix = "FIRECRAWL_"
49
49
 
50
50
 
51
+ class ExaCrawlerConfig(BaseCrawlerConfig):
52
+ api_key: str = ""
53
+
54
+ class Config:
55
+ # Allow setting of fields via env vars with prefix EXA_
56
+ # e.g., EXA_API_KEY=your_api_key
57
+ env_prefix = "EXA_"
58
+
59
+
51
60
  class BaseCrawler(ABC):
52
61
  """Abstract base class for web crawlers."""
53
62
 
@@ -150,6 +159,8 @@ class CrawlerFactory:
150
159
  return TrafilaturaCrawler(config)
151
160
  elif isinstance(config, FirecrawlConfig):
152
161
  return FirecrawlCrawler(config)
162
+ elif isinstance(config, ExaCrawlerConfig):
163
+ return ExaCrawler(config)
153
164
  else:
154
165
  raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
155
166
 
@@ -247,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
247
258
  with open(filename, "w") as f:
248
259
  f.write(content)
249
260
  docs.append(
250
- Document(content=content, metadata=DocMetaData(source=url))
261
+ Document(
262
+ content=content,
263
+ metadata=DocMetaData(
264
+ source=url,
265
+ title=page["metadata"].get("title", ""),
266
+ ),
267
+ )
251
268
  )
252
269
  processed_urls.add(url)
253
270
  new_pages += 1
@@ -289,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
289
306
  docs.append(
290
307
  Document(
291
308
  content=result["markdown"],
292
- metadata=DocMetaData(source=url),
309
+ metadata=DocMetaData(
310
+ source=url,
311
+ title=metadata.get("title", ""),
312
+ ),
293
313
  )
294
314
  )
295
315
  except Exception as e:
@@ -311,6 +331,77 @@ class FirecrawlCrawler(BaseCrawler):
311
331
  return docs
312
332
 
313
333
 
334
+ class ExaCrawler(BaseCrawler):
335
+ """Crawler implementation using Exa API."""
336
+
337
+ def __init__(self, config: ExaCrawlerConfig) -> None:
338
+ """Initialize the Exa crawler.
339
+
340
+ Args:
341
+ config: Configuration for the crawler
342
+ """
343
+ super().__init__(config)
344
+ self.config: ExaCrawlerConfig = config
345
+
346
+ @property
347
+ def needs_parser(self) -> bool:
348
+ return True
349
+
350
+ def crawl(self, urls: List[str]) -> List[Document]:
351
+ """Crawl the given URLs using Exa SDK.
352
+
353
+ Args:
354
+ urls: List of URLs to crawl
355
+
356
+ Returns:
357
+ List of Documents with content extracted from the URLs
358
+
359
+ Raises:
360
+ LangroidImportError: If the exa package is not installed
361
+ ValueError: If the Exa API key is not set
362
+ """
363
+ try:
364
+ from exa_py import Exa
365
+ except ImportError:
366
+ raise LangroidImportError("exa", "exa")
367
+
368
+ if not self.config.api_key:
369
+ raise ValueError("EXA_API_KEY key is required in your env or .env")
370
+
371
+ exa = Exa(self.config.api_key)
372
+ docs = []
373
+
374
+ try:
375
+ for url in urls:
376
+ parsed_doc_chunks = self._process_document(url)
377
+ if parsed_doc_chunks:
378
+ docs.extend(parsed_doc_chunks)
379
+ continue
380
+ else:
381
+ results = exa.get_contents([url], livecrawl="always", text=True)
382
+ result = results.results[0]
383
+ if result.text:
384
+ # append a NON-chunked document
385
+ # (metadata.is_chunk = False, so will be chunked downstream)
386
+ docs.append(
387
+ Document(
388
+ content=result.text,
389
+ metadata=DocMetaData(
390
+ source=url,
391
+ title=getattr(result, "title", ""),
392
+ published_date=getattr(
393
+ result, "published_date", ""
394
+ ),
395
+ ),
396
+ )
397
+ )
398
+
399
+ except Exception as e:
400
+ logging.error(f"Error retrieving content from Exa API: {e}")
401
+
402
+ return docs
403
+
404
+
314
405
  class URLLoader:
315
406
  """Loads URLs and extracts text using a specified crawler."""
316
407
 
@@ -334,6 +425,8 @@ class URLLoader:
334
425
  crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
335
426
 
336
427
  self.crawler = CrawlerFactory.create_crawler(crawler_config)
428
+ if self.crawler.needs_parser:
429
+ self.crawler.parser = Parser(parsing_config)
337
430
 
338
431
  def load(self) -> List[Document]:
339
432
  """Load the URLs using the specified crawler."""
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "langroid"
3
- version = "0.47.2"
3
+ version = "0.48.1"
4
4
  authors = [
5
5
  {name = "Prasad Chalasani", email = "pchalasani@gmail.com"},
6
6
  ]
@@ -147,7 +147,7 @@ pdf-parsers = [
147
147
  "pymupdf4llm<0.1.0,>=0.0.17",
148
148
  "pdf2image<2.0.0,>=1.17.0",
149
149
  "pytesseract<0.4.0,>=0.3.10",
150
- "markitdown>=0.0.1a3",
150
+ "markitdown[docx,xlsx,pptx]>=0.0.1a3",
151
151
  "marker-pdf",
152
152
  ]
153
153
 
@@ -155,6 +155,10 @@ docx = [
155
155
  "python-docx<2.0.0,>=1.1.0",
156
156
  ]
157
157
 
158
+ markitdown = [
159
+ "markitdown[docx,xlsx,pptx]>=0.0.1a3",
160
+ ]
161
+
158
162
  marker-pdf = [
159
163
  "marker-pdf[full]>=1.6.0; sys_platform != 'darwin' or platform_machine != 'x86_64'",
160
164
  "opencv-python>=4.11.0.86",
@@ -252,7 +256,7 @@ google-generativeai = [
252
256
  "google-genai>=1.0.0",
253
257
  ]
254
258
  doc-parsers = [
255
- "markitdown>=0.0.1a3",
259
+ "markitdown[docx,xlsx,pptx]>=0.0.1a3",
256
260
  "openpyxl>=3.1.5",
257
261
  "python-docx>=1.1.2",
258
262
  "python-pptx>=1.0.2",
File without changes
File without changes
File without changes
File without changes