langroid 0.47.2__py3-none-any.whl → 0.48.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/parsing/document_parser.py +40 -0
- langroid/parsing/parser.py +1 -1
- langroid/parsing/url_loader.py +65 -0
- {langroid-0.47.2.dist-info → langroid-0.48.0.dist-info}/METADATA +5 -3
- {langroid-0.47.2.dist-info → langroid-0.48.0.dist-info}/RECORD +7 -7
- {langroid-0.47.2.dist-info → langroid-0.48.0.dist-info}/WHEEL +0 -0
- {langroid-0.47.2.dist-info → langroid-0.48.0.dist-info}/licenses/LICENSE +0 -0
@@ -161,6 +161,8 @@ class DocumentParser(Parser):
|
|
161
161
|
return UnstructuredDocxParser(source, config)
|
162
162
|
elif config.docx.library == "python-docx":
|
163
163
|
return PythonDocxParser(source, config)
|
164
|
+
elif config.docx.library == "markitdown-docx":
|
165
|
+
return MarkitdownDocxParser(source, config)
|
164
166
|
else:
|
165
167
|
raise ValueError(
|
166
168
|
f"Unsupported DOCX library specified: {config.docx.library}"
|
@@ -887,6 +889,44 @@ class PythonDocxParser(DocumentParser):
|
|
887
889
|
)
|
888
890
|
|
889
891
|
|
892
|
+
class MarkitdownDocxParser(DocumentParser):
|
893
|
+
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
894
|
+
try:
|
895
|
+
from markitdown import MarkItDown
|
896
|
+
except ImportError:
|
897
|
+
LangroidImportError("markitdown", ["markitdown", "doc-parsers"])
|
898
|
+
md = MarkItDown()
|
899
|
+
self.doc_bytes.seek(0) # Reset to start
|
900
|
+
|
901
|
+
# Direct conversion from stream works for DOCX (unlike XLSX)
|
902
|
+
result = md.convert_stream(self.doc_bytes, file_extension=".docx")
|
903
|
+
|
904
|
+
# Split content into logical sections (paragraphs, sections, etc.)
|
905
|
+
# This approach differs from the strict page-based approach used for PDFs
|
906
|
+
sections = re.split(r"(?=# |\n## |\n### )", result.text_content)
|
907
|
+
|
908
|
+
# Filter out empty sections
|
909
|
+
sections = [section for section in sections if section.strip()]
|
910
|
+
|
911
|
+
for i, section in enumerate(sections):
|
912
|
+
yield i, section
|
913
|
+
|
914
|
+
def get_document_from_page(self, md_content: str) -> Document:
|
915
|
+
"""
|
916
|
+
Get Document object from a given markdown section.
|
917
|
+
|
918
|
+
Args:
|
919
|
+
md_content (str): The markdown content for the section.
|
920
|
+
|
921
|
+
Returns:
|
922
|
+
Document: Document object, with content and possible metadata.
|
923
|
+
"""
|
924
|
+
return Document(
|
925
|
+
content=self.fix_text(md_content),
|
926
|
+
metadata=DocMetaData(source=self.source),
|
927
|
+
)
|
928
|
+
|
929
|
+
|
890
930
|
class MarkitdownXLSXParser(DocumentParser):
|
891
931
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
892
932
|
try:
|
langroid/parsing/parser.py
CHANGED
@@ -78,7 +78,7 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
78
78
|
|
79
79
|
|
80
80
|
class DocxParsingConfig(BaseSettings):
|
81
|
-
library: Literal["python-docx", "unstructured"] = "unstructured"
|
81
|
+
library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
|
82
82
|
|
83
83
|
|
84
84
|
class DocParsingConfig(BaseSettings):
|
langroid/parsing/url_loader.py
CHANGED
@@ -48,6 +48,15 @@ class FirecrawlConfig(BaseCrawlerConfig):
|
|
48
48
|
env_prefix = "FIRECRAWL_"
|
49
49
|
|
50
50
|
|
51
|
+
class ExaCrawlerConfig(BaseCrawlerConfig):
|
52
|
+
api_key: str = ""
|
53
|
+
|
54
|
+
class Config:
|
55
|
+
# Allow setting of fields via env vars with prefix EXA_
|
56
|
+
# e.g., EXA_API_KEY=your_api_key
|
57
|
+
env_prefix = "EXA_"
|
58
|
+
|
59
|
+
|
51
60
|
class BaseCrawler(ABC):
|
52
61
|
"""Abstract base class for web crawlers."""
|
53
62
|
|
@@ -150,6 +159,8 @@ class CrawlerFactory:
|
|
150
159
|
return TrafilaturaCrawler(config)
|
151
160
|
elif isinstance(config, FirecrawlConfig):
|
152
161
|
return FirecrawlCrawler(config)
|
162
|
+
elif isinstance(config, ExaCrawlerConfig):
|
163
|
+
return ExaCrawler(config)
|
153
164
|
else:
|
154
165
|
raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
|
155
166
|
|
@@ -311,6 +322,60 @@ class FirecrawlCrawler(BaseCrawler):
|
|
311
322
|
return docs
|
312
323
|
|
313
324
|
|
325
|
+
class ExaCrawler(BaseCrawler):
|
326
|
+
"""Crawler implementation using Exa API."""
|
327
|
+
|
328
|
+
def __init__(self, config: ExaCrawlerConfig) -> None:
|
329
|
+
"""Initialize the Exa crawler.
|
330
|
+
|
331
|
+
Args:
|
332
|
+
config: Configuration for the crawler
|
333
|
+
"""
|
334
|
+
super().__init__(config)
|
335
|
+
self.config: ExaCrawlerConfig = config
|
336
|
+
|
337
|
+
@property
|
338
|
+
def needs_parser(self) -> bool:
|
339
|
+
return False
|
340
|
+
|
341
|
+
def crawl(self, urls: List[str]) -> List[Document]:
|
342
|
+
"""Crawl the given URLs using Exa SDK.
|
343
|
+
|
344
|
+
Args:
|
345
|
+
urls: List of URLs to crawl
|
346
|
+
|
347
|
+
Returns:
|
348
|
+
List of Documents with content extracted from the URLs
|
349
|
+
|
350
|
+
Raises:
|
351
|
+
LangroidImportError: If the exa package is not installed
|
352
|
+
ValueError: If the Exa API key is not set
|
353
|
+
"""
|
354
|
+
try:
|
355
|
+
from exa_py import Exa
|
356
|
+
except ImportError:
|
357
|
+
raise LangroidImportError("exa", "exa")
|
358
|
+
|
359
|
+
if not self.config.api_key:
|
360
|
+
raise ValueError("EXA_API_KEY key is required in your env or .env")
|
361
|
+
|
362
|
+
exa = Exa(self.config.api_key)
|
363
|
+
docs = []
|
364
|
+
|
365
|
+
try:
|
366
|
+
results = exa.get_contents(urls, text=True)
|
367
|
+
|
368
|
+
for result in results.results:
|
369
|
+
if result.text:
|
370
|
+
metadata = DocMetaData(source=result.url)
|
371
|
+
docs.append(Document(content=result.text, metadata=metadata))
|
372
|
+
|
373
|
+
except Exception as e:
|
374
|
+
logging.error(f"Error retrieving content from Exa API: {e}")
|
375
|
+
|
376
|
+
return docs
|
377
|
+
|
378
|
+
|
314
379
|
class URLLoader:
|
315
380
|
"""Loads URLs and extracts text using a specified crawler."""
|
316
381
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.47.2
|
3
|
+
Version: 0.48.0
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
Author-email: Prasad Chalasani <pchalasani@gmail.com>
|
6
6
|
License: MIT
|
@@ -108,7 +108,7 @@ Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
|
|
108
108
|
Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
|
109
109
|
Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
|
110
110
|
Provides-Extra: doc-parsers
|
111
|
-
Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
|
111
|
+
Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'doc-parsers'
|
112
112
|
Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
|
113
113
|
Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
|
114
114
|
Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
|
@@ -144,6 +144,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
|
|
144
144
|
Provides-Extra: marker-pdf
|
145
145
|
Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
|
146
146
|
Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
|
147
|
+
Provides-Extra: markitdown
|
148
|
+
Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'markitdown'
|
147
149
|
Provides-Extra: meilisearch
|
148
150
|
Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
|
149
151
|
Provides-Extra: metaphor
|
@@ -157,7 +159,7 @@ Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
|
|
157
159
|
Provides-Extra: pdf-parsers
|
158
160
|
Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
|
159
161
|
Requires-Dist: marker-pdf; extra == 'pdf-parsers'
|
160
|
-
Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
|
162
|
+
Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'pdf-parsers'
|
161
163
|
Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
|
162
164
|
Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
|
163
165
|
Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'
|
@@ -81,17 +81,17 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
|
|
81
81
|
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
|
82
82
|
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
|
83
83
|
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
|
84
|
-
langroid/parsing/document_parser.py,sha256=
|
84
|
+
langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
|
85
85
|
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
|
86
86
|
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
|
87
|
-
langroid/parsing/parser.py,sha256=
|
87
|
+
langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
|
88
88
|
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
|
89
89
|
langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
|
90
90
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
91
91
|
langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
|
92
92
|
langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
|
93
93
|
langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
|
94
|
-
langroid/parsing/url_loader.py,sha256=
|
94
|
+
langroid/parsing/url_loader.py,sha256=UiKlokh8AE0Qz9d4uIGPGJ-1yUfXrJjjuRBzsbxvurg,13552
|
95
95
|
langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
|
96
96
|
langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
|
97
97
|
langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
|
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
|
|
127
127
|
langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
|
128
128
|
langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
|
129
129
|
langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
|
130
|
-
langroid-0.
|
131
|
-
langroid-0.47.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
132
|
-
langroid-0.47.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
133
|
-
langroid-0.47.2.dist-info/RECORD,,
|
130
|
+
langroid-0.48.0.dist-info/METADATA,sha256=mhJmePv93Tsvw2Q1C2EeKrmtFzLmzxGBobBU8RAqpZs,63606
|
131
|
+
langroid-0.48.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
132
|
+
langroid-0.48.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
133
|
+
langroid-0.48.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|