PyPI - langroid - Versions diffs - 0.41.5__py3-none-any.whl → 0.42.1__py3-none-any.whl - Mend

langroid 0.41.5__py3-none-any.whl → 0.42.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

langroid/parsing/__init__.py CHANGED Viewed

@@ -14,6 +14,9 @@ from . import web_search
 from .parser import (
     Splitter,
+    MarkitdownXLSParsingConfig,
+    MarkitdownXLSXParsingConfig,
+    MarkitdownPPTXParsingConfig,
     PdfParsingConfig,
     DocxParsingConfig,
     DocParsingConfig,
@@ -40,6 +43,9 @@ __all__ = [
     "DocxParsingConfig",
     "DocParsingConfig",
     "ParsingConfig",
+    "MarkitdownXLSXParsingConfig",
+    "MarkitdownXLSParsingConfig",
+    "MarkitdownPPTXParsingConfig",
     "Parser",
 ]

langroid/parsing/document_parser.py CHANGED Viewed

@@ -56,6 +56,9 @@ class DocumentType(str, Enum):
     DOCX = "docx"
     DOC = "doc"
     TXT = "txt"
+    XLSX = "xlsx"
+    XLS = "xls"
+    PPTX = "pptx"
 def find_last_full_char(possible_unicode: bytes) -> int:
@@ -175,6 +178,12 @@ class DocumentParser(Parser):
                 )
         elif inferred_doc_type == DocumentType.DOC:
             return UnstructuredDocParser(source, config)
+        elif inferred_doc_type == DocumentType.XLS:
+            return MarkitdownXLSXParser(source, config)
+        elif inferred_doc_type == DocumentType.XLSX:
+            return MarkitdownXLSXParser(source, config)
+        elif inferred_doc_type == DocumentType.PPTX:
+            return MarkitdownPPTXParser(source, config)
         else:
             source_name = source if isinstance(source, str) else "bytes"
             raise ValueError(f"Unsupported document type: {source_name}")
@@ -223,6 +232,12 @@ class DocumentParser(Parser):
                 return DocumentType.DOCX
             elif source.lower().endswith(".doc"):
                 return DocumentType.DOC
+            elif source.lower().endswith(".xlsx"):
+                return DocumentType.XLSX
+            elif source.lower().endswith(".xls"):
+                return DocumentType.XLS
+            elif source.lower().endswith(".pptx"):
+                return DocumentType.PPTX
             else:
                 raise ValueError(f"Unsupported document type: {source}")
         else:
@@ -236,13 +251,17 @@ class DocumentParser(Parser):
             elif mime_type in [
                 "application/vnd.openxmlformats-officedocument"
                 ".wordprocessingml.document",
-                "application/zip",
             ]:
-                # DOCX files are essentially ZIP files,
-                # but this might catch other ZIP-based formats too!
                 return DocumentType.DOCX
             elif mime_type == "application/msword":
                 return DocumentType.DOC
+            elif (
+                mime_type
+                == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            ):
+                return DocumentType.XLSX
+            elif mime_type == "application/vnd.ms-excel":
+                return DocumentType.XLS
             else:
                 raise ValueError("Unsupported document type from bytes")
@@ -281,7 +300,14 @@ class DocumentParser(Parser):
                 chunking and splitting settings in the parser config.
         """
         dtype: DocumentType = DocumentParser._document_type(source, doc_type)
-        if dtype in [DocumentType.PDF, DocumentType.DOC, DocumentType.DOCX]:
+        if dtype in [
+            DocumentType.PDF,
+            DocumentType.DOC,
+            DocumentType.DOCX,
+            DocumentType.PPTX,
+            DocumentType.XLS,
+            DocumentType.XLSX,
+        ]:
             doc_parser = DocumentParser.create(
                 source,
                 parser.config,
@@ -857,3 +883,72 @@ class PythonDocxParser(DocumentParser):
             content=self.fix_text(paragraph.text),
             metadata=DocMetaData(source=self.source),
         )
+class MarkitdownXLSXParser(DocumentParser):
+    """
+    Parser that converts a spreadsheet to markdown via `markitdown` and
+    treats each sheet-sized piece of the markdown as one "page".
+    """
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Convert the document bytes to markdown and yield it piece by piece.
+        Yields:
+            Tuple[int, Any]: (index, markdown text), where the markdown is
+                split just before every `## Sheet<N>` heading.
+        Raises:
+            LangroidImportError: if the `markitdown` package is not installed.
+        """
+        try:
+            from markitdown import MarkItDown
+        except ImportError:
+            # The error must be raised, not just constructed: otherwise
+            # execution falls through and fails below on the unbound name
+            # `MarkItDown`, hiding the install hint from the user.
+            raise LangroidImportError("markitdown", "doc-parsers")
+        md = MarkItDown()
+        self.doc_bytes.seek(0)  # Reset to start
+        # Save stream to a temp file since md.convert() expects a path or URL
+        # Temporary workaround until markitdown fixes convert_stream function
+        # for xls and xlsx files
+        # See issue here https://github.com/microsoft/markitdown/issues/321
+        # NOTE(review): the suffix is always ".xlsx", even when this parser is
+        # used for ".xls" sources -- confirm markitdown handles that case.
+        with tempfile.NamedTemporaryFile(delete=True, suffix=".xlsx") as temp_file:
+            temp_file.write(self.doc_bytes.read())
+            temp_file.flush()  # Ensure data is written before reading
+            # Convert while the temp file is still open; it is deleted on exit.
+            result = md.convert(temp_file.name)
+        # Zero-width lookahead keeps each "## Sheet<N>" heading attached to the
+        # piece it introduces instead of consuming it as a delimiter.
+        sheets = re.split(r"(?=## Sheet\d+)", result.text_content)
+        for i, sheet in enumerate(sheets):
+            yield i, sheet
+    def get_document_from_page(self, md_content: str) -> Document:
+        """
+        Get Document object from a given 1-page markdown string.
+        Args:
+            md_content (str): The markdown content for the page.
+        Returns:
+            Document: Document object whose content is `md_content` passed
+                through `fix_text`, with `self.source` as metadata source.
+        """
+        return Document(
+            content=self.fix_text(md_content),
+            metadata=DocMetaData(source=self.source),
+        )
+class MarkitdownPPTXParser(DocumentParser):
+    """
+    Parser that converts a PPTX presentation to markdown via `markitdown`
+    and treats each slide-sized piece of the markdown as one "page".
+    """
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Convert the document bytes to markdown and yield it piece by piece.
+        Yields:
+            Tuple[int, Any]: (index, markdown text), where the markdown is
+                split just before every `<!-- Slide number: N -->` marker.
+        Raises:
+            LangroidImportError: if the `markitdown` package is not installed.
+        """
+        try:
+            from markitdown import MarkItDown
+        except ImportError:
+            # The error must be raised, not just constructed: otherwise
+            # execution falls through and fails below on the unbound name
+            # `MarkItDown`, hiding the install hint from the user.
+            raise LangroidImportError("markitdown", "doc-parsers")
+        md = MarkItDown()
+        self.doc_bytes.seek(0)  # Reset to start before handing over the stream
+        result = md.convert_stream(self.doc_bytes, file_extension=".pptx")
+        # Zero-width lookahead keeps each slide marker attached to the piece
+        # it introduces instead of consuming it as a delimiter.
+        slides = re.split(r"(?=<!-- Slide number: \d+ -->)", result.text_content)
+        for i, slide in enumerate(slides):
+            yield i, slide
+    def get_document_from_page(self, md_content: str) -> Document:
+        """
+        Get Document object from a given 1-page markdown string.
+        Args:
+            md_content (str): The markdown content for the page.
+        Returns:
+            Document: Document object whose content is `md_content` passed
+                through `fix_text`, with `self.source` as metadata source.
+        """
+        return Document(
+            content=self.fix_text(md_content),
+            metadata=DocMetaData(source=self.source),
+        )

langroid/parsing/parser.py CHANGED Viewed

@@ -28,6 +28,7 @@ class PdfParsingConfig(BaseSettings):
         "pypdf",
         "unstructured",
         "pdf2image",
+        "markitdown",
     ] = "pymupdf4llm"
@@ -39,6 +40,18 @@ class DocParsingConfig(BaseSettings):
     library: Literal["unstructured"] = "unstructured"
+class MarkitdownPPTXParsingConfig(BaseSettings):
+    # Parsing library for PPTX documents; "markitdown" is the only accepted value.
+    library: Literal["markitdown"] = "markitdown"
+class MarkitdownXLSXParsingConfig(BaseSettings):
+    # Parsing library for XLSX documents; "markitdown" is the only accepted value.
+    library: Literal["markitdown"] = "markitdown"
+class MarkitdownXLSParsingConfig(BaseSettings):
+    # Parsing library for XLS documents; "markitdown" is the only accepted value.
+    library: Literal["markitdown"] = "markitdown"
 class ParsingConfig(BaseSettings):
     splitter: str = Splitter.TOKENS
     chunk_by_page: bool = False  # split by page?
@@ -55,6 +68,9 @@ class ParsingConfig(BaseSettings):
     pdf: PdfParsingConfig = PdfParsingConfig()
     docx: DocxParsingConfig = DocxParsingConfig()
     doc: DocParsingConfig = DocParsingConfig()
+    pptx: MarkitdownPPTXParsingConfig = MarkitdownPPTXParsingConfig()
+    xls: MarkitdownXLSParsingConfig = MarkitdownXLSParsingConfig()
+    xlsx: MarkitdownXLSXParsingConfig = MarkitdownXLSXParsingConfig()
 class Parser:
@@ -66,7 +82,7 @@ class Parser:
             self.tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")
     def num_tokens(self, text: str) -> int:
-        tokens = self.tokenizer.encode(text)
+        tokens = self.tokenizer.encode(text, allowed_special={"<|endoftext|>"})
         return len(tokens)
     def truncate_tokens(self, text: str, max_tokens: int) -> str:

{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.41.5
+Version: 0.42.1
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -104,6 +104,12 @@ Requires-Dist: pypdf>=5.1.0; extra == 'doc-chat'
 Requires-Dist: pytesseract<0.4.0,>=0.3.10; extra == 'doc-chat'
 Requires-Dist: python-docx<2.0.0,>=1.1.0; extra == 'doc-chat'
 Requires-Dist: unstructured[docx,pdf,pptx]<1.0.0,>=0.16.15; extra == 'doc-chat'
+Provides-Extra: doc-parsers
+Requires-Dist: markitdown>=0.0.1a3; extra == 'doc-parsers'
+Requires-Dist: openpyxl>=3.1.5; extra == 'doc-parsers'
+Requires-Dist: python-docx>=1.1.2; extra == 'doc-parsers'
+Requires-Dist: python-pptx>=1.0.2; extra == 'doc-parsers'
+Requires-Dist: xlrd>=2.0.1; extra == 'doc-parsers'
 Provides-Extra: docling
 Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'docling'
 Provides-Extra: docx
@@ -142,6 +148,7 @@ Provides-Extra: neo4j
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
 Provides-Extra: pdf-parsers
 Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
+Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
 Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'pdf-parsers'

{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/RECORD RENAMED Viewed

@@ -78,13 +78,13 @@ langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8
 langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
 langroid/language_models/prompt_formatter/hf_formatter.py,sha256=PVJppmjRvD-2DF-XNC6mE05vTZ9wbu37SmXwZBQhad0,5055
 langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeuMENVIVvVqSYuEpvYSTndUe_jd6hVTko4,2899
-langroid/parsing/__init__.py,sha256=ZgSAfgTC6VsTLFlRSWT-TwYco7SQeRMeZG-49MnKYGY,936
+langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=WSdNAiFDMVDS7wIF6XNIkRbE2BFLr1YYtgsitWkb4xM,30233
+langroid/parsing/document_parser.py,sha256=NKmN_HjwNdfUjTbXhpyK_Wjay3QYEA26ZnewmbO6moA,33632
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=pPzM3zXQvFtwTyQPtDha15oZhu1O3OKDLECnkB8waxg,12276
+langroid/parsing/parser.py,sha256=moJKI5Cn_Pxd7xbNrY220dqQu-0FeEWUI7ogeq63Kec,12842
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -128,7 +128,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
 langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
 langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
-langroid-0.41.5.dist-info/METADATA,sha256=Ii6nQYMNZlMmeJ27AVkt7m6d5useGfOXIunF3BAlVzI,61331
-langroid-0.41.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.41.5.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.41.5.dist-info/RECORD,,
+langroid-0.42.1.dist-info/METADATA,sha256=MASCH2qw35RSfqTiDsVpYMj1OfzMJpFBCJdNxJ7jDPA,61699
+langroid-0.42.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.42.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.42.1.dist-info/RECORD,,

{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{langroid-0.41.5.dist-info → langroid-0.42.1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

langroid 0.41.5__py3-none-any.whl → 0.42.1__py3-none-any.whl

langroid 0.41.5__py3-none-any.whl → 0.42.1__py3-none-any.whl