langroid 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langroid/agent/base.py CHANGED
@@ -1016,7 +1016,7 @@ class Agent(ABC):
1016
1016
  # we would have already displayed the msg "live" ONLY if
1017
1017
  # streaming was enabled, AND we did not find a cached response
1018
1018
  # If we are here, it means the response has not yet been displayed.
1019
- cached = f"[red]{self.indent}(cached)[/red]" if response.cached else ""
1019
+ cached = "[red](cached)[/red]" if response.cached else ""
1020
1020
  console.print(f"[green]{self.indent}", end="")
1021
1021
  print(cached + "[green]" + escape(response.message))
1022
1022
  self.update_token_usage(
@@ -150,6 +150,8 @@ class DocumentParser(Parser):
150
150
  return ImagePdfParser(source, config)
151
151
  elif config.pdf.library == "gemini":
152
152
  return GeminiPdfParser(source, config)
153
+ elif config.pdf.library == "marker":
154
+ return MarkerPdfParser(source, config)
153
155
  else:
154
156
  raise ValueError(
155
157
  f"Unsupported PDF library specified: {config.pdf.library}"
@@ -1356,3 +1358,85 @@ class GeminiPdfParser(DocumentParser):
1356
1358
  content=page,
1357
1359
  metadata=DocMetaData(source=self.source),
1358
1360
  )
1361
+
1362
+
1363
+ class MarkerPdfParser(DocumentParser):
1364
+ DEFAULT_CONFIG = {"paginate_output": True, "output_format": "markdown"}
1365
+
1366
+ def __init__(self, source: Union[str, bytes], config: ParsingConfig):
1367
+ super().__init__(source, config)
1368
+ user_config = (
1369
+ config.pdf.marker_config.config_dict if config.pdf.marker_config else {}
1370
+ )
1371
+
1372
+ self.config_dict = {**MarkerPdfParser.DEFAULT_CONFIG, **user_config}
1373
+
1374
+ def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
1375
+ """
1376
+ Yield each page in the PDF using `marker`.
1377
+ """
1378
+ try:
1379
+ import marker # noqa
1380
+ except ImportError:
1381
+ raise LangroidImportError(
1382
+ "marker-pdf", ["marker-pdf", "pdf-parsers", "all", "doc-chat"]
1383
+ )
1384
+
1385
+ import re
1386
+
1387
+ from marker.config.parser import ConfigParser
1388
+ from marker.converters.pdf import PdfConverter
1389
+ from marker.models import create_model_dict
1390
+ from marker.output import save_output
1391
+
1392
+ config_parser = ConfigParser(self.config_dict)
1393
+ converter = PdfConverter(
1394
+ config=config_parser.generate_config_dict(),
1395
+ artifact_dict=create_model_dict(),
1396
+ processor_list=config_parser.get_processors(),
1397
+ renderer=config_parser.get_renderer(),
1398
+ llm_service=config_parser.get_llm_service(),
1399
+ )
1400
+ doc_path = self.source
1401
+ if doc_path == "bytes":
1402
+ # write to tmp file, then use that path
1403
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
1404
+ temp_file.write(self.doc_bytes.getvalue())
1405
+ doc_path = temp_file.name
1406
+
1407
+ output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
1408
+ os.makedirs(output_dir, exist_ok=True)
1409
+ filename = Path(doc_path).stem + "_converted"
1410
+
1411
+ rendered = converter(doc_path)
1412
+ save_output(rendered, output_dir=output_dir, fname_base=filename)
1413
+ file_path = output_dir / f"{filename}.md"
1414
+
1415
+ with open(file_path, "r", encoding="utf-8") as f:
1416
+ full_markdown = f.read()
1417
+
1418
+ # Regex for splitting pages
1419
+ pages = re.split(r"\{\d+\}----+", full_markdown)
1420
+
1421
+ page_no = 0
1422
+ for page in pages:
1423
+ if page.strip():
1424
+ yield page_no, page
1425
+ page_no += 1
1426
+
1427
+ def get_document_from_page(self, page: str) -> Document:
1428
+ """
1429
+ Get Document object from the markdown text of a single page,
1430
+ possibly containing image refs.
1431
+
1432
+ Args:
1433
+ page (str): The page we get by splitting large md file from
1434
+ marker
1435
+
1436
+ Returns:
1437
+ Document: Document object, with content and possible metadata.
1438
+ """
1439
+ return Document(
1440
+ content=self.fix_text(page),
1441
+ metadata=DocMetaData(source=self.source),
1442
+ )
@@ -38,8 +38,13 @@ class GeminiConfig(BaseSettings):
38
38
  requests_per_minute: Optional[int] = 5
39
39
 
40
40
 
41
- class PdfParsingConfig(BaseParsingConfig):
41
+ class MarkerConfig(BaseSettings):
42
+ """Configuration for Marker-based parsing."""
43
+
44
+ config_dict: Dict[str, Any] = {}
42
45
 
46
+
47
+ class PdfParsingConfig(BaseParsingConfig):
43
48
  library: Literal[
44
49
  "fitz",
45
50
  "pymupdf4llm",
@@ -49,16 +54,26 @@ class PdfParsingConfig(BaseParsingConfig):
49
54
  "pdf2image",
50
55
  "markitdown",
51
56
  "gemini",
57
+ "marker",
52
58
  ] = "pymupdf4llm"
53
59
  gemini_config: Optional[GeminiConfig] = None
60
+ marker_config: Optional[MarkerConfig] = None
54
61
 
55
62
  @root_validator(pre=True)
56
- def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
57
- """Ensure GeminiConfig is set only when library is 'gemini'."""
58
- if values.get("library") == "gemini":
59
- values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
63
+ def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
64
+ """Ensure correct config is set based on library selection."""
65
+ library = values.get("library")
66
+
67
+ if library == "gemini":
68
+ values.setdefault("gemini_config", GeminiConfig())
60
69
  else:
61
70
  values["gemini_config"] = None
71
+
72
+ if library == "marker":
73
+ values.setdefault("marker_config", MarkerConfig())
74
+ else:
75
+ values["marker_config"] = None
76
+
62
77
  return values
63
78
 
64
79
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.44.0
3
+ Version: 0.45.0
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -63,6 +63,7 @@ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
63
63
  Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
64
64
  Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
65
65
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
66
+ Requires-Dist: marker-pdf; extra == 'all'
66
67
  Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
67
68
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
68
69
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
@@ -99,6 +100,7 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
99
100
  Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
100
101
  Provides-Extra: doc-chat
101
102
  Requires-Dist: docling<3.0.0,>=2.20.0; extra == 'doc-chat'
103
+ Requires-Dist: marker-pdf; extra == 'doc-chat'
102
104
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
103
105
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
104
106
  Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
@@ -138,6 +140,9 @@ Requires-Dist: pyarrow<16.0.0,>=15.0.0; extra == 'lancedb'
138
140
  Requires-Dist: tantivy<0.22.0,>=0.21.0; extra == 'lancedb'
139
141
  Provides-Extra: litellm
140
142
  Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
143
+ Provides-Extra: marker-pdf
144
+ Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
145
+ Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
141
146
  Provides-Extra: meilisearch
142
147
  Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
143
148
  Provides-Extra: metaphor
@@ -150,6 +155,7 @@ Provides-Extra: neo4j
150
155
  Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
151
156
  Provides-Extra: pdf-parsers
152
157
  Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
158
+ Requires-Dist: marker-pdf; extra == 'pdf-parsers'
153
159
  Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
154
160
  Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
155
161
  Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
@@ -791,8 +797,8 @@ wget -O .env https://raw.githubusercontent.com/langroid/langroid/main/.env-templ
791
797
  # Edit the .env file with your favorite editor (here nano), and remove any un-used settings. E.g. there are "dummy" values like "your-redis-port" etc -- if you are not using them, you MUST remove them.
792
798
  nano .env
793
799
 
794
- # launch the container
795
- docker run -it --rm -v ./.env:/langroid/.env langroid/langroid
800
+ # launch the container (the appropriate image for your architecture will be pulled automatically)
801
+ docker run -it --rm -v ./.env:/langroid/.env langroid/langroid:latest
796
802
 
797
803
  # Use this command to run any of the scripts in the `examples` directory
798
804
  python examples/<Path/To/Example.py>
@@ -3,7 +3,7 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
3
3
  langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
4
4
  langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
5
  langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
6
- langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
6
+ langroid/agent/base.py,sha256=JRN8R6-H142NL2_asruYozfW1Na0j5tmjSvV3bhgzTo,78663
7
7
  langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
8
8
  langroid/agent/chat_agent.py,sha256=be7GlySBCuZ4jGQzk0FdVKlqhGeAuewfDywmHDACjh8,84924
9
9
  langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
81
81
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
82
82
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
83
83
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
84
- langroid/parsing/document_parser.py,sha256=QThgCm9iZyRZd1pmANZ3lO20p2TNH0NIU5_a5v8q8Ck,49649
84
+ langroid/parsing/document_parser.py,sha256=JzieD1tDJo7SJt5wTftDllSPGlEVT6gd2-q4zVcJSrU,52625
85
85
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
86
86
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
87
- langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
87
+ langroid/parsing/parser.py,sha256=ArAPWQ2Op_1B8i26xpkWHwnZiXgDrcyih2A6l8R49aI,14136
88
88
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
89
89
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
90
90
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
127
127
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
128
128
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
129
129
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
130
- langroid-0.44.0.dist-info/METADATA,sha256=mKlCCdQQhV31aMCklT9QcRpUs5iHsOeDGAd55axAevU,62973
131
- langroid-0.44.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
- langroid-0.44.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
- langroid-0.44.0.dist-info/RECORD,,
130
+ langroid-0.45.0.dist-info/METADATA,sha256=ojPk96xfeDC6ddCqKqbp_HYwlq2eiDTwm1dXGjroUpA,63409
131
+ langroid-0.45.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
132
+ langroid-0.45.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
133
+ langroid-0.45.0.dist-info/RECORD,,