langroid 0.47.2__py3-none-any.whl → 0.48.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -161,6 +161,8 @@ class DocumentParser(Parser):
161
161
  return UnstructuredDocxParser(source, config)
162
162
  elif config.docx.library == "python-docx":
163
163
  return PythonDocxParser(source, config)
164
+ elif config.docx.library == "markitdown-docx":
165
+ return MarkitdownDocxParser(source, config)
164
166
  else:
165
167
  raise ValueError(
166
168
  f"Unsupported DOCX library specified: {config.docx.library}"
@@ -887,6 +889,44 @@ class PythonDocxParser(DocumentParser):
887
889
  )
888
890
 
889
891
 
892
class MarkitdownDocxParser(DocumentParser):
    """Parse DOCX documents by converting them to Markdown with MarkItDown.

    DOCX files have no fixed page boundaries, so the "pages" produced by this
    parser are Markdown sections delimited by level 1-3 headings.
    """

    # Zero-width split point at the start of every line that opens with an
    # H1-H3 Markdown heading. Anchoring on line starts keeps a heading such as
    # "## Title" intact and avoids splitting on "# " in the middle of a line
    # (e.g. "C# is ...").
    _SECTION_START = re.compile(r"(?m)^(?=#{1,3} )")

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """Yield the document's Markdown sections in order.

        Yields:
            Tuple[int, Any]: Zero-based section index and the section's
                Markdown text. Whitespace-only sections are skipped.

        Raises:
            LangroidImportError: If the `markitdown` package is not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The exception must actually be raised; merely constructing it
            # would let execution fall through to a NameError on MarkItDown.
            raise LangroidImportError("markitdown", ["markitdown", "doc-parsers"])

        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start

        # Direct conversion from stream works for DOCX (unlike XLSX)
        result = md.convert_stream(self.doc_bytes, file_extension=".docx")

        # Split content into logical sections at heading boundaries.
        # This approach differs from the strict page-based approach used for PDFs.
        text = result.text_content or ""
        sections = [
            section
            for section in self._SECTION_START.split(text)
            if section.strip()  # Filter out empty sections
        ]

        for i, section in enumerate(sections):
            yield i, section

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given markdown section.

        Args:
            md_content (str): The markdown content for the section.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
928
+
929
+
890
930
  class MarkitdownXLSXParser(DocumentParser):
891
931
  def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
892
932
  try:
@@ -78,7 +78,7 @@ class PdfParsingConfig(BaseParsingConfig):
78
78
 
79
79
 
80
80
  class DocxParsingConfig(BaseSettings):
81
- library: Literal["python-docx", "unstructured"] = "unstructured"
81
+ library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
82
82
 
83
83
 
84
84
  class DocParsingConfig(BaseSettings):
@@ -48,6 +48,15 @@ class FirecrawlConfig(BaseCrawlerConfig):
48
48
  env_prefix = "FIRECRAWL_"
49
49
 
50
50
 
51
class ExaCrawlerConfig(BaseCrawlerConfig):
    """Settings for the Exa-backed crawler."""

    # Exa API key; an empty string means "not configured".
    api_key: str = ""

    class Config:
        # Fields can be populated from environment variables that carry
        # the EXA_ prefix, for example: EXA_API_KEY=your_api_key
        env_prefix = "EXA_"
58
+
59
+
51
60
  class BaseCrawler(ABC):
52
61
  """Abstract base class for web crawlers."""
53
62
 
@@ -150,6 +159,8 @@ class CrawlerFactory:
150
159
  return TrafilaturaCrawler(config)
151
160
  elif isinstance(config, FirecrawlConfig):
152
161
  return FirecrawlCrawler(config)
162
+ elif isinstance(config, ExaCrawlerConfig):
163
+ return ExaCrawler(config)
153
164
  else:
154
165
  raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
155
166
 
@@ -311,6 +322,60 @@ class FirecrawlCrawler(BaseCrawler):
311
322
  return docs
312
323
 
313
324
 
325
class ExaCrawler(BaseCrawler):
    """Crawler implementation using Exa API."""

    def __init__(self, config: ExaCrawlerConfig) -> None:
        """Initialize the Exa crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: ExaCrawlerConfig = config

    @property
    def needs_parser(self) -> bool:
        """Report that this crawler does not need a parser (always False)."""
        return False

    def crawl(self, urls: List[str]) -> List[Document]:
        """Crawl the given URLs using Exa SDK.

        Retrieval is best-effort: any error raised while fetching contents is
        logged and the documents collected so far are returned.

        Args:
            urls: List of URLs to crawl

        Returns:
            List of Documents with content extracted from the URLs. Results
            without text are omitted; an empty `urls` list yields an empty
            list without contacting the API.

        Raises:
            LangroidImportError: If the exa package is not installed
            ValueError: If the Exa API key is not set
        """
        try:
            from exa_py import Exa
        except ImportError:
            raise LangroidImportError("exa", "exa")

        if not self.config.api_key:
            raise ValueError("EXA_API_KEY key is required in your env or .env")

        docs: List[Document] = []
        # Nothing to fetch: skip the network round-trip. This check sits after
        # the import and API-key validation so those errors still surface.
        if not urls:
            return docs

        exa = Exa(self.config.api_key)

        try:
            results = exa.get_contents(urls, text=True)

            for result in results.results:
                if not result.text:
                    continue
                metadata = DocMetaData(source=result.url)
                docs.append(Document(content=result.text, metadata=metadata))

        except Exception as e:
            # Deliberate best-effort: report the failure and return whatever
            # was collected instead of propagating the error to the caller.
            logging.error(f"Error retrieving content from Exa API: {e}")

        return docs
377
+
378
+
314
379
  class URLLoader:
315
380
  """Loads URLs and extracts text using a specified crawler."""
316
381
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.47.2
3
+ Version: 0.48.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -108,7 +108,7 @@ Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
108
108
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
109
109
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
110
110
  Provides-Extra: doc-parsers
111
- Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
111
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'doc-parsers'
112
112
  Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
113
113
  Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
114
114
  Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
@@ -144,6 +144,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
144
144
  Provides-Extra: marker-pdf
145
145
  Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
146
146
  Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
147
+ Provides-Extra: markitdown
148
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'markitdown'
147
149
  Provides-Extra: meilisearch
148
150
  Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
149
151
  Provides-Extra: metaphor
@@ -157,7 +159,7 @@ Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
157
159
  Provides-Extra: pdf-parsers
158
160
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
159
161
  Requires-Dist: marker-pdf; extra == 'pdf-parsers'
160
- Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
162
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'pdf-parsers'
161
163
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
162
164
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
163
165
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
@@ -81,17 +81,17 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
81
81
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
82
82
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
83
83
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
84
- langroid/parsing/document_parser.py,sha256=fyCx4X1192asom5tp3DNV4J5Em2u4Z7rCC0FA8dNsSQ,52954
84
+ langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
85
85
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
86
86
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
87
- langroid/parsing/parser.py,sha256=ZUvBhzMZQWKerbb9UECbcqkNc9wWKuUgPyC8L6baxao,14295
87
+ langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
88
88
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
89
89
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
90
90
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
91
91
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
92
92
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
93
93
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
94
- langroid/parsing/url_loader.py,sha256=tNLyCo8A08GcB8KFr04YKDO9KFHyqNacKU0-DuWlu4I,11721
94
+ langroid/parsing/url_loader.py,sha256=UiKlokh8AE0Qz9d4uIGPGJ-1yUfXrJjjuRBzsbxvurg,13552
95
95
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
96
96
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
97
97
  langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
127
127
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
128
128
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
129
129
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
130
- langroid-0.47.2.dist-info/METADATA,sha256=1CsoTeRCHzsHCxHGocZ44e21J0anN0xNUtEamlfh85s,63473
131
- langroid-0.47.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
- langroid-0.47.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
- langroid-0.47.2.dist-info/RECORD,,
130
+ langroid-0.48.0.dist-info/METADATA,sha256=mhJmePv93Tsvw2Q1C2EeKrmtFzLmzxGBobBU8RAqpZs,63606
131
+ langroid-0.48.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
+ langroid-0.48.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
+ langroid-0.48.0.dist-info/RECORD,,