langroid 0.44.0__py3-none-any.whl → 0.45.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/base.py +1 -1
- langroid/parsing/document_parser.py +84 -0
- langroid/parsing/parser.py +20 -5
- {langroid-0.44.0.dist-info → langroid-0.45.1.dist-info}/METADATA +9 -5
- {langroid-0.44.0.dist-info → langroid-0.45.1.dist-info}/RECORD +7 -7
- {langroid-0.44.0.dist-info → langroid-0.45.1.dist-info}/WHEEL +0 -0
- {langroid-0.44.0.dist-info → langroid-0.45.1.dist-info}/licenses/LICENSE +0 -0
langroid/agent/base.py
CHANGED
@@ -1016,7 +1016,7 @@ class Agent(ABC):
|
|
1016
1016
|
# we would have already displayed the msg "live" ONLY if
|
1017
1017
|
# streaming was enabled, AND we did not find a cached response
|
1018
1018
|
# If we are here, it means the response has not yet been displayed.
|
1019
|
-
cached =
|
1019
|
+
cached = "[red](cached)[/red]" if response.cached else ""
|
1020
1020
|
console.print(f"[green]{self.indent}", end="")
|
1021
1021
|
print(cached + "[green]" + escape(response.message))
|
1022
1022
|
self.update_token_usage(
|
@@ -150,6 +150,8 @@ class DocumentParser(Parser):
|
|
150
150
|
return ImagePdfParser(source, config)
|
151
151
|
elif config.pdf.library == "gemini":
|
152
152
|
return GeminiPdfParser(source, config)
|
153
|
+
elif config.pdf.library == "marker":
|
154
|
+
return MarkerPdfParser(source, config)
|
153
155
|
else:
|
154
156
|
raise ValueError(
|
155
157
|
f"Unsupported PDF library specified: {config.pdf.library}"
|
@@ -1356,3 +1358,85 @@ class GeminiPdfParser(DocumentParser):
|
|
1356
1358
|
content=page,
|
1357
1359
|
metadata=DocMetaData(source=self.source),
|
1358
1360
|
)
|
1361
|
+
|
1362
|
+
|
1363
|
+
class MarkerPdfParser(DocumentParser):
|
1364
|
+
DEFAULT_CONFIG = {"paginate_output": True, "output_format": "markdown"}
|
1365
|
+
|
1366
|
+
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
|
1367
|
+
super().__init__(source, config)
|
1368
|
+
user_config = (
|
1369
|
+
config.pdf.marker_config.config_dict if config.pdf.marker_config else {}
|
1370
|
+
)
|
1371
|
+
|
1372
|
+
self.config_dict = {**MarkerPdfParser.DEFAULT_CONFIG, **user_config}
|
1373
|
+
|
1374
|
+
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
1375
|
+
"""
|
1376
|
+
Yield each page in the PDF using `marker`.
|
1377
|
+
"""
|
1378
|
+
try:
|
1379
|
+
import marker # noqa
|
1380
|
+
except ImportError:
|
1381
|
+
raise LangroidImportError(
|
1382
|
+
"marker-pdf", ["marker-pdf", "pdf-parsers", "all", "doc-chat"]
|
1383
|
+
)
|
1384
|
+
|
1385
|
+
import re
|
1386
|
+
|
1387
|
+
from marker.config.parser import ConfigParser
|
1388
|
+
from marker.converters.pdf import PdfConverter
|
1389
|
+
from marker.models import create_model_dict
|
1390
|
+
from marker.output import save_output
|
1391
|
+
|
1392
|
+
config_parser = ConfigParser(self.config_dict)
|
1393
|
+
converter = PdfConverter(
|
1394
|
+
config=config_parser.generate_config_dict(),
|
1395
|
+
artifact_dict=create_model_dict(),
|
1396
|
+
processor_list=config_parser.get_processors(),
|
1397
|
+
renderer=config_parser.get_renderer(),
|
1398
|
+
llm_service=config_parser.get_llm_service(),
|
1399
|
+
)
|
1400
|
+
doc_path = self.source
|
1401
|
+
if doc_path == "bytes":
|
1402
|
+
# write to tmp file, then use that path
|
1403
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
|
1404
|
+
temp_file.write(self.doc_bytes.getvalue())
|
1405
|
+
doc_path = temp_file.name
|
1406
|
+
|
1407
|
+
output_dir = Path(str(Path(doc_path).with_suffix("")) + "-pages")
|
1408
|
+
os.makedirs(output_dir, exist_ok=True)
|
1409
|
+
filename = Path(doc_path).stem + "_converted"
|
1410
|
+
|
1411
|
+
rendered = converter(doc_path)
|
1412
|
+
save_output(rendered, output_dir=output_dir, fname_base=filename)
|
1413
|
+
file_path = output_dir / f"{filename}.md"
|
1414
|
+
|
1415
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
1416
|
+
full_markdown = f.read()
|
1417
|
+
|
1418
|
+
# Regex for splitting pages
|
1419
|
+
pages = re.split(r"\{\d+\}----+", full_markdown)
|
1420
|
+
|
1421
|
+
page_no = 0
|
1422
|
+
for page in pages:
|
1423
|
+
if page.strip():
|
1424
|
+
yield page_no, page
|
1425
|
+
page_no += 1
|
1426
|
+
|
1427
|
+
def get_document_from_page(self, page: str) -> Document:
|
1428
|
+
"""
|
1429
|
+
Get Document object from a given 1-page markdown file,
|
1430
|
+
possibly containing image refs.
|
1431
|
+
|
1432
|
+
Args:
|
1433
|
+
page (str): The page we get by splitting large md file from
|
1434
|
+
marker
|
1435
|
+
|
1436
|
+
Returns:
|
1437
|
+
Document: Document object, with content and possible metadata.
|
1438
|
+
"""
|
1439
|
+
return Document(
|
1440
|
+
content=self.fix_text(page),
|
1441
|
+
metadata=DocMetaData(source=self.source),
|
1442
|
+
)
|
langroid/parsing/parser.py
CHANGED
@@ -38,8 +38,13 @@ class GeminiConfig(BaseSettings):
|
|
38
38
|
requests_per_minute: Optional[int] = 5
|
39
39
|
|
40
40
|
|
41
|
-
class
|
41
|
+
class MarkerConfig(BaseSettings):
|
42
|
+
"""Configuration for Markitdown-based parsing."""
|
43
|
+
|
44
|
+
config_dict: Dict[str, Any] = {}
|
42
45
|
|
46
|
+
|
47
|
+
class PdfParsingConfig(BaseParsingConfig):
|
43
48
|
library: Literal[
|
44
49
|
"fitz",
|
45
50
|
"pymupdf4llm",
|
@@ -49,16 +54,26 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
49
54
|
"pdf2image",
|
50
55
|
"markitdown",
|
51
56
|
"gemini",
|
57
|
+
"marker",
|
52
58
|
] = "pymupdf4llm"
|
53
59
|
gemini_config: Optional[GeminiConfig] = None
|
60
|
+
marker_config: Optional[MarkerConfig] = None
|
54
61
|
|
55
62
|
@root_validator(pre=True)
|
56
|
-
def
|
57
|
-
"""Ensure
|
58
|
-
|
59
|
-
|
63
|
+
def enable_configs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
|
64
|
+
"""Ensure correct config is set based on library selection."""
|
65
|
+
library = values.get("library")
|
66
|
+
|
67
|
+
if library == "gemini":
|
68
|
+
values.setdefault("gemini_config", GeminiConfig())
|
60
69
|
else:
|
61
70
|
values["gemini_config"] = None
|
71
|
+
|
72
|
+
if library == "marker":
|
73
|
+
values.setdefault("marker_config", MarkerConfig())
|
74
|
+
else:
|
75
|
+
values["marker_config"] = None
|
76
|
+
|
62
77
|
return values
|
63
78
|
|
64
79
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: langroid
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.45.1
|
4
4
|
Summary: Harness LLMs with Multi-Agent Programming
|
5
5
|
Author-email: Prasad Chalasani <pchalasani@gmail.com>
|
6
6
|
License: MIT
|
@@ -12,7 +12,6 @@ Requires-Dist: async-generator<2.0,>=1.10
|
|
12
12
|
Requires-Dist: bs4<1.0.0,>=0.0.1
|
13
13
|
Requires-Dist: cerebras-cloud-sdk<2.0.0,>=1.1.0
|
14
14
|
Requires-Dist: colorlog<7.0.0,>=6.7.0
|
15
|
-
Requires-Dist: docling<3.0.0,>=2.20.0
|
16
15
|
Requires-Dist: docstring-parser<1.0,>=0.16
|
17
16
|
Requires-Dist: duckduckgo-search<7.0.0,>=6.0.0
|
18
17
|
Requires-Dist: exa-py>=1.8.7
|
@@ -49,7 +48,6 @@ Requires-Dist: redis<6.0.0,>=5.0.1
|
|
49
48
|
Requires-Dist: requests-oauthlib<2.0.0,>=1.3.1
|
50
49
|
Requires-Dist: requests<3.0.0,>=2.31.0
|
51
50
|
Requires-Dist: rich<14.0.0,>=13.3.4
|
52
|
-
Requires-Dist: tavily-python>=0.5.0
|
53
51
|
Requires-Dist: thefuzz<1.0.0,>=0.20.0
|
54
52
|
Requires-Dist: tiktoken<1.0.0,>=0.7.0
|
55
53
|
Requires-Dist: trafilatura<2.0.0,>=1.5.0
|
@@ -63,6 +61,7 @@ Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'all'
|
|
63
61
|
Requires-Dist: fastembed<0.4.0,>=0.3.1; extra == 'all'
|
64
62
|
Requires-Dist: huggingface-hub<1.0.0,>=0.21.2; extra == 'all'
|
65
63
|
Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'all'
|
64
|
+
Requires-Dist: marker-pdf; extra == 'all'
|
66
65
|
Requires-Dist: metaphor-python<0.2.0,>=0.1.23; extra == 'all'
|
67
66
|
Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'all'
|
68
67
|
Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'all'
|
@@ -99,6 +98,7 @@ Requires-Dist: pymysql<2.0.0,>=1.1.0; extra == 'db'
|
|
99
98
|
Requires-Dist: sqlalchemy<3.0.0,>=2.0.19; extra == 'db'
|
100
99
|
Provides-Extra: doc-chat
|
101
100
|
Requires-Dist: docling<3.0.0,>=2.20.0; extra == 'doc-chat'
|
101
|
+
Requires-Dist: marker-pdf; extra == 'doc-chat'
|
102
102
|
Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'doc-chat'
|
103
103
|
Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'doc-chat'
|
104
104
|
Requires-Dist: pymupdf<2.0.0,>=1.23.3; extra == 'doc-chat'
|
@@ -138,6 +138,9 @@ Requires-Dist: pyarrow<16.0.0,>=15.0.0; extra == 'lancedb'
|
|
138
138
|
Requires-Dist: tantivy<0.22.0,>=0.21.0; extra == 'lancedb'
|
139
139
|
Provides-Extra: litellm
|
140
140
|
Requires-Dist: litellm<2.0.0,>=1.30.1; extra == 'litellm'
|
141
|
+
Provides-Extra: marker-pdf
|
142
|
+
Requires-Dist: marker-pdf[full]>=1.6.0; (sys_platform != 'darwin' or platform_machine != 'x86_64') and extra == 'marker-pdf'
|
143
|
+
Requires-Dist: opencv-python>=4.11.0.86; extra == 'marker-pdf'
|
141
144
|
Provides-Extra: meilisearch
|
142
145
|
Requires-Dist: meilisearch-python-sdk<3.0.0,>=2.2.3; extra == 'meilisearch'
|
143
146
|
Provides-Extra: metaphor
|
@@ -150,6 +153,7 @@ Provides-Extra: neo4j
|
|
150
153
|
Requires-Dist: neo4j<6.0.0,>=5.14.1; extra == 'neo4j'
|
151
154
|
Provides-Extra: pdf-parsers
|
152
155
|
Requires-Dist: docling<3.0.0,>=2.16.0; extra == 'pdf-parsers'
|
156
|
+
Requires-Dist: marker-pdf; extra == 'pdf-parsers'
|
153
157
|
Requires-Dist: markitdown>=0.0.1a3; extra == 'pdf-parsers'
|
154
158
|
Requires-Dist: pdf2image<2.0.0,>=1.17.0; extra == 'pdf-parsers'
|
155
159
|
Requires-Dist: pymupdf4llm<0.1.0,>=0.0.17; extra == 'pdf-parsers'
|
@@ -791,8 +795,8 @@ wget -O .env https://raw.githubusercontent.com/langroid/langroid/main/.env-templ
|
|
791
795
|
# Edit the .env file with your favorite editor (here nano), and remove any un-used settings. E.g. there are "dummy" values like "your-redis-port" etc -- if you are not using them, you MUST remove them.
|
792
796
|
nano .env
|
793
797
|
|
794
|
-
# launch the container
|
795
|
-
docker run -it --rm -v ./.env:/langroid/.env langroid/langroid
|
798
|
+
# launch the container (the appropriate image for your architecture will be pulled automatically)
|
799
|
+
docker run -it --rm -v ./.env:/langroid/.env langroid/langroid:latest
|
796
800
|
|
797
801
|
# Use this command to run any of the scripts in the `examples` directory
|
798
802
|
python examples/<Path/To/Example.py>
|
@@ -3,7 +3,7 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
|
|
3
3
|
langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
|
4
4
|
langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
5
|
langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
|
6
|
-
langroid/agent/base.py,sha256=
|
6
|
+
langroid/agent/base.py,sha256=JRN8R6-H142NL2_asruYozfW1Na0j5tmjSvV3bhgzTo,78663
|
7
7
|
langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
|
8
8
|
langroid/agent/chat_agent.py,sha256=be7GlySBCuZ4jGQzk0FdVKlqhGeAuewfDywmHDACjh8,84924
|
9
9
|
langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
|
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
|
|
81
81
|
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
|
82
82
|
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
|
83
83
|
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
|
84
|
-
langroid/parsing/document_parser.py,sha256=
|
84
|
+
langroid/parsing/document_parser.py,sha256=JzieD1tDJo7SJt5wTftDllSPGlEVT6gd2-q4zVcJSrU,52625
|
85
85
|
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
|
86
86
|
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
|
87
|
-
langroid/parsing/parser.py,sha256=
|
87
|
+
langroid/parsing/parser.py,sha256=ArAPWQ2Op_1B8i26xpkWHwnZiXgDrcyih2A6l8R49aI,14136
|
88
88
|
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
|
89
89
|
langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
|
90
90
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
|
|
127
127
|
langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
|
128
128
|
langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
|
129
129
|
langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
|
130
|
-
langroid-0.
|
131
|
-
langroid-0.
|
132
|
-
langroid-0.
|
133
|
-
langroid-0.
|
130
|
+
langroid-0.45.1.dist-info/METADATA,sha256=bnWkBCq4xp6YLKbUnvn06AcLEe_aSSB3dWj3yk2W7es,63335
|
131
|
+
langroid-0.45.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
132
|
+
langroid-0.45.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
133
|
+
langroid-0.45.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|