langroid 0.47.2__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/mytypes.py CHANGED
@@ -45,6 +45,8 @@ class DocMetaData(BaseModel):
45
45
 
46
46
  source: str = "context" # just reference
47
47
  source_content: str = "context" # reference and content
48
+ title: str = "unknown"
49
+ published_date: str = "unknown"
48
50
  is_chunk: bool = False # if it is a chunk, don't split
49
51
  id: str = Field(default_factory=lambda: str(uuid4()))
50
52
  window_ids: List[str] = [] # for RAG: ids of chunks around this one
@@ -161,6 +161,8 @@ class DocumentParser(Parser):
161
161
  return UnstructuredDocxParser(source, config)
162
162
  elif config.docx.library == "python-docx":
163
163
  return PythonDocxParser(source, config)
164
+ elif config.docx.library == "markitdown-docx":
165
+ return MarkitdownDocxParser(source, config)
164
166
  else:
165
167
  raise ValueError(
166
168
  f"Unsupported DOCX library specified: {config.docx.library}"
@@ -887,6 +889,44 @@ class PythonDocxParser(DocumentParser):
887
889
  )
888
890
 
889
891
 
892
class MarkitdownDocxParser(DocumentParser):
    """Parse DOCX content by converting it to markdown with `markitdown`.

    The markdown is split at level 1-3 headings and each resulting section
    is yielded as one "page".
    """

    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
        """Yield `(index, markdown_section)` pairs for the DOCX document.

        Yields:
            Tuple[int, Any]: zero-based section index and the section's
                markdown text.

        Raises:
            LangroidImportError: if the `markitdown` package is not installed.
        """
        try:
            from markitdown import MarkItDown
        except ImportError:
            # The error must actually be raised; otherwise execution falls
            # through and fails with a confusing NameError on `MarkItDown`.
            raise LangroidImportError("markitdown", ["markitdown", "doc-parsers"])
        md = MarkItDown()
        self.doc_bytes.seek(0)  # Reset to start

        # Direct conversion from stream works for DOCX (unlike XLSX)
        result = md.convert_stream(self.doc_bytes, file_extension=".docx")

        # Split content into logical sections rather than strict pages (as
        # is done for PDFs). The split point is anchored to the start of a
        # line so that only real headings ("# ", "## ", "### ") start a new
        # section; an un-anchored "# " would also match inside "## "/"### "
        # and inside ordinary text such as "C# is ...".
        sections = re.split(r"(?m)^(?=#{1,3} )", result.text_content or "")

        # Skip empty / whitespace-only sections
        non_empty = (section for section in sections if section.strip())
        yield from enumerate(non_empty)

    def get_document_from_page(self, md_content: str) -> Document:
        """
        Get Document object from a given markdown section.

        Args:
            md_content (str): The markdown content for the section.

        Returns:
            Document: Document object, with content and possible metadata.
        """
        return Document(
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
928
+
929
+
890
930
  class MarkitdownXLSXParser(DocumentParser):
891
931
  def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
892
932
  try:
@@ -78,7 +78,7 @@ class PdfParsingConfig(BaseParsingConfig):
78
78
 
79
79
 
80
80
  class DocxParsingConfig(BaseSettings):
81
- library: Literal["python-docx", "unstructured"] = "unstructured"
81
+ library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
82
82
 
83
83
 
84
84
  class DocParsingConfig(BaseSettings):
@@ -48,6 +48,15 @@ class FirecrawlConfig(BaseCrawlerConfig):
48
48
  env_prefix = "FIRECRAWL_"
49
49
 
50
50
 
51
class ExaCrawlerConfig(BaseCrawlerConfig):
    """Configuration for the Exa-based crawler."""

    # Exa API key; empty by default.
    api_key: str = ""

    class Config:
        # Fields may be supplied through env vars that carry the EXA_ prefix,
        # e.g., EXA_API_KEY=your_api_key
        env_prefix = "EXA_"
58
+
59
+
51
60
  class BaseCrawler(ABC):
52
61
  """Abstract base class for web crawlers."""
53
62
 
@@ -150,6 +159,8 @@ class CrawlerFactory:
150
159
  return TrafilaturaCrawler(config)
151
160
  elif isinstance(config, FirecrawlConfig):
152
161
  return FirecrawlCrawler(config)
162
+ elif isinstance(config, ExaCrawlerConfig):
163
+ return ExaCrawler(config)
153
164
  else:
154
165
  raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
155
166
 
@@ -247,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
247
258
  with open(filename, "w") as f:
248
259
  f.write(content)
249
260
  docs.append(
250
- Document(content=content, metadata=DocMetaData(source=url))
261
+ Document(
262
+ content=content,
263
+ metadata=DocMetaData(
264
+ source=url,
265
+ title=page["metadata"].get("title", ""),
266
+ ),
267
+ )
251
268
  )
252
269
  processed_urls.add(url)
253
270
  new_pages += 1
@@ -289,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
289
306
  docs.append(
290
307
  Document(
291
308
  content=result["markdown"],
292
- metadata=DocMetaData(source=url),
309
+ metadata=DocMetaData(
310
+ source=url,
311
+ title=metadata.get("title", ""),
312
+ ),
293
313
  )
294
314
  )
295
315
  except Exception as e:
@@ -311,6 +331,77 @@ class FirecrawlCrawler(BaseCrawler):
311
331
  return docs
312
332
 
313
333
 
334
class ExaCrawler(BaseCrawler):
    """Crawler implementation using Exa API."""

    def __init__(self, config: ExaCrawlerConfig) -> None:
        """Initialize the Exa crawler.

        Args:
            config: Configuration for the crawler
        """
        super().__init__(config)
        self.config: ExaCrawlerConfig = config

    @property
    def needs_parser(self) -> bool:
        """Report that this crawler requires a parser (always True)."""
        return True

    def crawl(self, urls: List[str]) -> List[Document]:
        """Crawl the given URLs using Exa SDK.

        Each URL is handled independently: a failure on one URL is logged
        and the remaining URLs are still processed.

        Args:
            urls: List of URLs to crawl

        Returns:
            List of Documents with content extracted from the URLs

        Raises:
            LangroidImportError: If the exa package is not installed
            ValueError: If the Exa API key is not set
        """
        try:
            from exa_py import Exa
        except ImportError:
            raise LangroidImportError("exa", "exa")

        if not self.config.api_key:
            raise ValueError("EXA_API_KEY key is required in your env or .env")

        exa = Exa(self.config.api_key)
        docs: List[Document] = []

        for url in urls:
            # Per-URL error handling so one bad URL does not discard the rest.
            try:
                # Use the base-class document handling first; fall back to
                # the Exa API only when it yields nothing for this URL.
                parsed_doc_chunks = self._process_document(url)
                if parsed_doc_chunks:
                    docs.extend(parsed_doc_chunks)
                    continue

                results = exa.get_contents([url], livecrawl="always", text=True)
                if not results.results:
                    logging.warning(f"No content returned by Exa API for {url}")
                    continue

                result = results.results[0]
                if not result.text:
                    continue

                # append a NON-chunked document
                # (metadata.is_chunk = False, so will be chunked downstream)
                docs.append(
                    Document(
                        content=result.text,
                        metadata=DocMetaData(
                            source=url,
                            # `or ""` guards against attributes that exist
                            # but are None, which the str fields may reject.
                            title=getattr(result, "title", "") or "",
                            published_date=(
                                getattr(result, "published_date", "") or ""
                            ),
                        ),
                    )
                )
            except Exception as e:
                logging.error(f"Error retrieving content from Exa API: {e}")

        return docs
403
+
404
+
314
405
  class URLLoader:
315
406
  """Loads URLs and extracts text using a specified crawler."""
316
407
 
@@ -334,6 +425,8 @@ class URLLoader:
334
425
  crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
335
426
 
336
427
  self.crawler = CrawlerFactory.create_crawler(crawler_config)
428
+ if self.crawler.needs_parser:
429
+ self.crawler.parser = Parser(parsing_config)
337
430
 
338
431
  def load(self) -> List[Document]:
339
432
  """Load the URLs using the specified crawler."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.47.2
3
+ Version: 0.48.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -108,7 +108,7 @@ Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
108
108
  Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
109
109
  Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
110
110
  Provides-Extra: doc-parsers
111
- Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
111
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'doc-parsers'
112
112
  Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
113
113
  Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
114
114
  Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
@@ -144,6 +144,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
144
144
  Provides-Extra: marker-pdf
145
145
  Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
146
146
  Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
147
+ Provides-Extra: markitdown
148
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'markitdown'
147
149
  Provides-Extra: meilisearch
148
150
  Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
149
151
  Provides-Extra: metaphor
@@ -157,7 +159,7 @@ Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
157
159
  Provides-Extra: pdf-parsers
158
160
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
159
161
  Requires-Dist: marker-pdf; extra == 'pdf-parsers'
160
- Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
162
+ Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'pdf-parsers'
161
163
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
162
164
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
163
165
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
@@ -1,6 +1,6 @@
1
1
  langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
2
2
  langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
3
- langroid/mytypes.py,sha256=wfb320SFnZVTv_CgcLWsvoKBXxAFfY4EISeue8MFqpQ,2912
3
+ langroid/mytypes.py,sha256=ZW06CyhOPtemUvAGl5m4uPMHd8kEeEfwq04d4U8PntE,2975
4
4
  langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
6
6
  langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -81,17 +81,17 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
81
81
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
82
82
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
83
83
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
84
- langroid/parsing/document_parser.py,sha256=fyCx4X1192asom5tp3DNV4J5Em2u4Z7rCC0FA8dNsSQ,52954
84
+ langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
85
85
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
86
86
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
87
- langroid/parsing/parser.py,sha256=ZUvBhzMZQWKerbb9UECbcqkNc9wWKuUgPyC8L6baxao,14295
87
+ langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
88
88
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
89
89
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
90
90
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
91
91
  langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
92
92
  langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
93
93
  langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
94
- langroid/parsing/url_loader.py,sha256=tNLyCo8A08GcB8KFr04YKDO9KFHyqNacKU0-DuWlu4I,11721
94
+ langroid/parsing/url_loader.py,sha256=Y1kFi6DoIjIxuQmMwR9SPVyHfeCJAe41eofdXUIA1fQ,14833
95
95
  langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
96
96
  langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
97
97
  langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
127
127
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
128
128
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
129
129
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
130
- langroid-0.47.2.dist-info/METADATA,sha256=1CsoTeRCHzsHCxHGocZ44e21J0anN0xNUtEamlfh85s,63473
131
- langroid-0.47.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
- langroid-0.47.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
- langroid-0.47.2.dist-info/RECORD,,
130
+ langroid-0.48.1.dist-info/METADATA,sha256=5tA8WlsZ5n91APjQVDaNBVmUNwOgZ11jfdQunonoW5w,63606
131
+ langroid-0.48.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
+ langroid-0.48.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
+ langroid-0.48.1.dist-info/RECORD,,