PyPI - langroid - Versions diffs - 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl - Mend

langroid 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (7) hide show

langroid/agent/base.py CHANGED Viewed

@@ -1016,7 +1016,7 @@ class Agent(ABC):
             # we would have already displayed the msg "live" ONLY if
             # streaming was enabled, AND we did not find a cached response
             # If we are here, it means the response has not yet been displayed.
-            cached = f"[red]{self.indent}(cached)[/red]" if response.cached else ""
+            cached = "[red](cached)[/red]" if response.cached else ""
             console.print(f"[green]{self.indent}", end="")
             print(cached + "[green]" + escape(response.message))
         self.update_token_usage(

langroid/parsing/document_parser.py CHANGED Viewed

@@ -150,6 +150,8 @@ class DocumentParser(Parser):
                 return ImagePdfParser(source, config)
             elif config.pdf.library == "gemini":
                 return GeminiPdfParser(source, config)
+            elif config.pdf.library == "marker":
+                return MarkerPdfParser(source, config)
             else:
                 raise ValueError(
                     f"Unsupported PDF library specified: {config.pdf.library}"
@@ -1356,3 +1358,85 @@ class GeminiPdfParser(DocumentParser):
             content=page,
             metadata=DocMetaData(source=self.source),
         )
+class MarkerPdfParser(DocumentParser):
+    DEFAULT_CONFIG = {"paginate_output": True, "output_format": "markdown"}
+    def __init__(self, source: Union[str, bytes], config: ParsingConfig):
+        super().__init__(source, config)
+        user_config = (
+            config.pdf.marker_config.config_dict if config.pdf.marker_config else {}
+        )
+        self.config_dict = {**MarkerPdfParser.DEFAULT_CONFIG, **user_config}
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Yield each page in the PDF using `marker`.
+        """
+        try:
+            import marker  # noqa
+        except ImportError:
+            raise LangroidImportError(
+                "marker-pdf", ["marker-pdf", "pdf-parsers", "all", "doc-chat"]
+            )
+        import re
+        from marker.config.parser import ConfigParser
+        from marker.converters.pdf import PdfConverter
+        from marker.models import create_model_dict
+        from marker.output import save_output
+        config_parser = ConfigParser(self.config_dict)
+        converter = PdfConverter(
+            config=config_parser.generate_config_dict(),
+            artifact_dict=create_model_dict(),
+            processor_list=config_parser.get_processors(),
+            renderer=config_parser.get_renderer(),
+            llm_service=config_parser.get_llm_service(),
+        )
+        doc_path = self.source
+        if doc_path == "bytes":
+            # write to tmp file, then use that path
+            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+                temp_file.write(self.doc_bytes.getvalue())
+                doc_path = temp_file.name
+        output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
+        os.makedirs(output_dir, exist_ok=True)
+        filename = Path(doc_path).stem + "_converted"
+        rendered = converter(doc_path)
+        save_output(rendered, output_dir=output_dir, fname_base=filename)
+        file_path = output_dir / f"{filename}.md"
+        with open(file_path, "r", encoding="utf-8") as f:
+            full_markdown = f.read()
+        # Regex for splitting pages
+        pages = re.split(r"\{\d+\}----+", full_markdown)
+        page_no = 0
+        for page in pages:
+            if page.strip():
+                yield page_no, page
+            page_no += 1
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get Document object from a given 1-page markdown file,
+        possibly containing image refs.
+        Args:
+            page (str): The page we get by splitting large md file from
+            marker
+        Returns:
+            Document: Document object, with content and possible metadata.
+        """
+        return Document(
+            content=self.fix_text(page),
+            metadata=DocMetaData(source=self.source),
+        )

langroid/parsing/parser.py CHANGED Viewed

@@ -38,8 +38,13 @@ class GeminiConfig(BaseSettings):
     requests_per_minute: Optional[int] = 5
-class PdfParsingConfig(BaseParsingConfig):
+class MarkerConfig(BaseSettings):
+    """Configuration for Marker-based PDF parsing (the `marker` library)."""
+    config_dict: Dict[str, Any] = {}
+class PdfParsingConfig(BaseParsingConfig):
     library: Literal[
         "fitz",
         "pymupdf4llm",
@@ -49,16 +54,26 @@ class PdfParsingConfig(BaseParsingConfig):
         "pdf2image",
         "markitdown",
         "gemini",
+        "marker",
     ] = "pymupdf4llm"
     gemini_config: Optional[GeminiConfig] = None
+    marker_config: Optional[MarkerConfig] = None
     @root_validator(pre=True)
-    def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Ensure GeminiConfig is set only when library is 'gemini'."""
-        if values.get("library") == "gemini":
-            values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
+    def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure correct config is set based on library selection."""
+        library = values.get("library")
+        if library == "gemini":
+            values.setdefault("gemini_config", GeminiConfig())
         else:
             values["gemini_config"] = None
+        if library == "marker":
+            values.setdefault("marker_config", MarkerConfig())
+        else:
+            values["marker_config"] = None
         return values

{langroid-0.44.0.dist-info → langroid-0.45.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.44.0
+Version: 0.45.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -63,6 +63,7 @@ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
 Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
 Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
 Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
+Requires-Dist: marker-pdf; extra == 'all'
 Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
@@ -99,6 +100,7 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
 Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
 Provides-Extra: doc-chat
 Requires-Dist: docling<3.0.0,>=2.20.0; extra == 'doc-chat'
+Requires-Dist: marker-pdf; extra == 'doc-chat'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
 Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
 Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
@@ -138,6 +140,9 @@ Requires-Dist: pyarrow<16.0.0,>=15.0.0; extra == 'lancedb'
 Requires-Dist: tantivy<0.22.0,>=0.21.0; extra == 'lancedb'
 Provides-Extra: litellm
 Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
+Provides-Extra: marker-pdf
+Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
+Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
 Provides-Extra: meilisearch
 Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
 Provides-Extra: metaphor
@@ -150,6 +155,7 @@ Provides-Extra: neo4j
 Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
 Provides-Extra: pdf-parsers
 Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
+Requires-Dist: marker-pdf; extra == 'pdf-parsers'
 Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
 Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
 Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
@@ -791,8 +797,8 @@ wget -O .env https://raw.githubusercontent.com/langroid/langroid/main/.env-templ
 # Edit the .env file with your favorite editor (here nano), and remove any un-used settings. E.g. there are "dummy" values like "your-redis-port" etc -- if you are not using them, you MUST remove them.
 nano .env
-# launch the container
-docker run -it --rm  -v ./.env:/langroid/.env langroid/langroid
+# launch the container (the appropriate image for your architecture will be pulled automatically)
+docker run -it --rm  -v ./.env:/langroid/.env langroid/langroid:latest
 # Use this command to run any of the scripts in the `examples` directory
 python examples/<Path/To/Example.py>

{langroid-0.44.0.dist-info → langroid-0.45.0.dist-info}/RECORD RENAMED Viewed

@@ -3,7 +3,7 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
+langroid/agent/base.py,sha256=JRN8R6-H142NL2_asruYozfW1Na0j5tmjSvV3bhgzTo,78663
 langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
 langroid/agent/chat_agent.py,sha256=be7GlySBCuZ4jGQzk0FdVKlqhGeAuewfDywmHDACjh8,84924
 langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
 langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=QThgCm9iZyRZd1pmANZ3lO20p2TNH0NIU5_a5v8q8Ck,49649
+langroid/parsing/document_parser.py,sha256=JzieD1tDJo7SJt5wTftDllSPGlEVT6gd2-q4zVcJSrU,52625
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
+langroid/parsing/parser.py,sha256=ArAPWQ2Op_1B8i26xpkWHwnZiXgDrcyih2A6l8R49aI,14136
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
 langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
 langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
-langroid-0.44.0.dist-info/METADATA,sha256=mKlCCdQQhV31aMCklT9QcRpUs5iHsOeDGAd55axAevU,62973
-langroid-0.44.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.44.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.44.0.dist-info/RECORD,,
+langroid-0.45.0.dist-info/METADATA,sha256=ojPk96xfeDC6ddCqKqbp_HYwlq2eiDTwm1dXGjroUpA,63409
+langroid-0.45.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.45.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.45.0.dist-info/RECORD,,

{langroid-0.44.0.dist-info → langroid-0.45.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{langroid-0.44.0.dist-info → langroid-0.45.0.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

langroid 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

langroid 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl