PyPI - langroid - Versions diffs - 0.47.2__py3-none-any.whl → 0.48.1__py3-none-any.whl - Mend

langroid 0.47.2__py3-none-any.whl → 0.48.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

langroid/mytypes.py CHANGED Viewed

@@ -45,6 +45,8 @@ class DocMetaData(BaseModel):
     source: str = "context"  # just reference
     source_content: str = "context"  # reference and content
+    title: str = "unknown"
+    published_date: str = "unknown"
     is_chunk: bool = False  # if it is a chunk, don't split
     id: str = Field(default_factory=lambda: str(uuid4()))
     window_ids: List[str] = []  # for RAG: ids of chunks around this one

langroid/parsing/document_parser.py CHANGED Viewed

@@ -161,6 +161,8 @@ class DocumentParser(Parser):
                 return UnstructuredDocxParser(source, config)
             elif config.docx.library == "python-docx":
                 return PythonDocxParser(source, config)
+            elif config.docx.library == "markitdown-docx":
+                return MarkitdownDocxParser(source, config)
             else:
                 raise ValueError(
                     f"Unsupported DOCX library specified: {config.docx.library}"
@@ -887,6 +889,44 @@ class PythonDocxParser(DocumentParser):
         )
+class MarkitdownDocxParser(DocumentParser):
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        try:
+            from markitdown import MarkItDown
+        except ImportError:
+            LangroidImportError("markitdown", ["markitdown", "doc-parsers"])
+        md = MarkItDown()
+        self.doc_bytes.seek(0)  # Reset to start
+        # Direct conversion from stream works for DOCX (unlike XLSX)
+        result = md.convert_stream(self.doc_bytes, file_extension=".docx")
+        # Split content into logical sections (paragraphs, sections, etc.)
+        # This approach differs from the strict page-based approach used for PDFs
+        sections = re.split(r"(?=# |\n## |\n### )", result.text_content)
+        # Filter out empty sections
+        sections = [section for section in sections if section.strip()]
+        for i, section in enumerate(sections):
+            yield i, section
+    def get_document_from_page(self, md_content: str) -> Document:
+        """
+        Get Document object from a given markdown section.
+        Args:
+            md_content (str): The markdown content for the section.
+        Returns:
+            Document: Document object, with content and possible metadata.
+        """
+        return Document(
+            content=self.fix_text(md_content),
+            metadata=DocMetaData(source=self.source),
+        )
 class MarkitdownXLSXParser(DocumentParser):
     def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
         try:

langroid/parsing/parser.py CHANGED Viewed

@@ -78,7 +78,7 @@ class PdfParsingConfig(BaseParsingConfig):
 class DocxParsingConfig(BaseSettings):
-    library: Literal["python-docx", "unstructured"] = "unstructured"
+    library: Literal["python-docx", "unstructured", "markitdown-docx"] = "unstructured"
 class DocParsingConfig(BaseSettings):

langroid/parsing/url_loader.py CHANGED Viewed

@@ -48,6 +48,15 @@ class FirecrawlConfig(BaseCrawlerConfig):
         env_prefix = "FIRECRAWL_"
+class ExaCrawlerConfig(BaseCrawlerConfig):
+    api_key: str = ""
+    class Config:
+        # Allow setting of fields via env vars with prefix EXA_
+        # e.g., EXA_API_KEY=your_api_key
+        env_prefix = "EXA_"
 class BaseCrawler(ABC):
     """Abstract base class for web crawlers."""
@@ -150,6 +159,8 @@ class CrawlerFactory:
             return TrafilaturaCrawler(config)
         elif isinstance(config, FirecrawlConfig):
             return FirecrawlCrawler(config)
+        elif isinstance(config, ExaCrawlerConfig):
+            return ExaCrawler(config)
         else:
             raise ValueError(f"Unsupported crawler configuration type: {type(config)}")
@@ -247,7 +258,13 @@ class FirecrawlCrawler(BaseCrawler):
                     with open(filename, "w") as f:
                         f.write(content)
                     docs.append(
-                        Document(content=content, metadata=DocMetaData(source=url))
+                        Document(
+                            content=content,
+                            metadata=DocMetaData(
+                                source=url,
+                                title=page["metadata"].get("title", ""),
+                            ),
+                        )
                     )
                     processed_urls.add(url)
                     new_pages += 1
@@ -289,7 +306,10 @@ class FirecrawlCrawler(BaseCrawler):
                         docs.append(
                             Document(
                                 content=result["markdown"],
-                                metadata=DocMetaData(source=url),
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=metadata.get("title", ""),
+                                ),
                             )
                         )
                 except Exception as e:
@@ -311,6 +331,77 @@ class FirecrawlCrawler(BaseCrawler):
         return docs
+class ExaCrawler(BaseCrawler):
+    """Crawler implementation using Exa API."""
+    def __init__(self, config: ExaCrawlerConfig) -> None:
+        """Initialize the Exa crawler.
+        Args:
+            config: Configuration for the crawler
+        """
+        super().__init__(config)
+        self.config: ExaCrawlerConfig = config
+    @property
+    def needs_parser(self) -> bool:
+        return True
+    def crawl(self, urls: List[str]) -> List[Document]:
+        """Crawl the given URLs using Exa SDK.
+        Args:
+            urls: List of URLs to crawl
+        Returns:
+            List of Documents with content extracted from the URLs
+        Raises:
+            LangroidImportError: If the exa package is not installed
+            ValueError: If the Exa API key is not set
+        """
+        try:
+            from exa_py import Exa
+        except ImportError:
+            raise LangroidImportError("exa", "exa")
+        if not self.config.api_key:
+            raise ValueError("EXA_API_KEY key is required in your env or .env")
+        exa = Exa(self.config.api_key)
+        docs = []
+        try:
+            for url in urls:
+                parsed_doc_chunks = self._process_document(url)
+                if parsed_doc_chunks:
+                    docs.extend(parsed_doc_chunks)
+                    continue
+                else:
+                    results = exa.get_contents([url], livecrawl="always", text=True)
+                    result = results.results[0]
+                    if result.text:
+                        # append a NON-chunked document
+                        # (metadata.is_chunk = False, so will be chunked downstream)
+                        docs.append(
+                            Document(
+                                content=result.text,
+                                metadata=DocMetaData(
+                                    source=url,
+                                    title=getattr(result, "title", ""),
+                                    published_date=getattr(
+                                        result, "published_date", ""
+                                    ),
+                                ),
+                            )
+                        )
+        except Exception as e:
+            logging.error(f"Error retrieving content from Exa API: {e}")
+        return docs
 class URLLoader:
     """Loads URLs and extracts text using a specified crawler."""
@@ -334,6 +425,8 @@ class URLLoader:
             crawler_config = TrafilaturaConfig(parser=Parser(parsing_config))
         self.crawler = CrawlerFactory.create_crawler(crawler_config)
+        if self.crawler.needs_parser:
+            self.crawler.parser = Parser(parsing_config)
     def load(self) -> List[Document]:
         """Load the URLs using the specified crawler."""

{langroid-0.47.2.dist-info → langroid-0.48.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.47.2
+Version: 0.48.1
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -108,7 +108,7 @@ Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
 Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
 Provides-Extra: doc-parsers
-Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
+Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'doc-parsers'
 Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
 Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
 Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
@@ -144,6 +144,8 @@ Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
 Provides-Extra: marker-pdf
 Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
 Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
+Provides-Extra: markitdown
+Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'markitdown'
 Provides-Extra: meilisearch
 Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
 Provides-Extra: metaphor
@@ -157,7 +159,7 @@ Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
 Provides-Extra: pdf-parsers
 Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
 Requires-Dist: marker-pdf; extra == 'pdf-parsers'
-Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
+Requires-Dist: markitdown[docx,pptx,xlsx]>=0.0.1a3; extra == 'pdf-parsers'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
 Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'

{langroid-0.47.2.dist-info → langroid-0.48.1.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 langroid/__init__.py,sha256=z_fCOLQJPOw3LLRPBlFB5-2HyCjpPgQa4m4iY5Fvb8Y,1800
 langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
-langroid/mytypes.py,sha256=wfb320SFnZVTv_CgcLWsvoKBXxAFfY4EISeue8MFqpQ,2912
+langroid/mytypes.py,sha256=ZW06CyhOPtemUvAGl5m4uPMHd8kEeEfwq04d4U8PntE,2975
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
 langroid/agent/base.py,sha256=U-UjdpxIFqkzRIB5-LYwHrhMSNI3sDbfnNRqIhrtsyI,79568
@@ -81,17 +81,17 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
 langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=fyCx4X1192asom5tp3DNV4J5Em2u4Z7rCC0FA8dNsSQ,52954
+langroid/parsing/document_parser.py,sha256=72g9EUuLlCAAXGD9-8UPe7_l7JnZ7vgc764g_17EPWA,54454
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=ZUvBhzMZQWKerbb9UECbcqkNc9wWKuUgPyC8L6baxao,14295
+langroid/parsing/parser.py,sha256=bxBXiyRnUBhS5Ng6s4OhAUpxqCSUXwNn4c7DaDSiWnE,14314
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
 langroid/parsing/search.py,sha256=0NJ5-Rou_BbrHAD7O9b20bKjZJnbadjObvGm4Zq8Kis,9818
 langroid/parsing/spider.py,sha256=hAVM6wxh1pQ0EN4tI5wMBtAjIk0T-xnpi-ZUzWybhos,3258
 langroid/parsing/table_loader.py,sha256=qNM4obT_0Y4tjrxNBCNUYjKQ9oETCZ7FbolKBTcz-GM,3410
-langroid/parsing/url_loader.py,sha256=tNLyCo8A08GcB8KFr04YKDO9KFHyqNacKU0-DuWlu4I,11721
+langroid/parsing/url_loader.py,sha256=Y1kFi6DoIjIxuQmMwR9SPVyHfeCJAe41eofdXUIA1fQ,14833
 langroid/parsing/urls.py,sha256=Tjzr64YsCusiYkY0LEGB5-rSuX8T2P_4DVoOFKAeKuI,8081
 langroid/parsing/utils.py,sha256=WwqzOhbQRlorbVvddDIZKv9b1KqZCBDm955lgIHDXRw,12828
 langroid/parsing/web_search.py,sha256=sARV1Tku4wiInhuCz0kRaMHcoF6Ok6CLu7vapLS8hjs,8222
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.47.2.dist-info/METADATA,sha256=1CsoTeRCHzsHCxHGocZ44e21J0anN0xNUtEamlfh85s,63473
-langroid-0.47.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.47.2.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.47.2.dist-info/RECORD,,
+langroid-0.48.1.dist-info/METADATA,sha256=5tA8WlsZ5n91APjQVDaNBVmUNwOgZ11jfdQunonoW5w,63606
+langroid-0.48.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.48.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.48.1.dist-info/RECORD,,

{langroid-0.47.2.dist-info → langroid-0.48.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{langroid-0.47.2.dist-info → langroid-0.48.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

langroid 0.47.2__py3-none-any.whl → 0.48.1__py3-none-any.whl

langroid 0.47.2__py3-none-any.whl → 0.48.1__py3-none-any.whl