langroid 0.50.12__py3-none-any.whl → 0.51.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/parsing/document_parser.py +111 -50
- langroid/parsing/parser.py +8 -8
- {langroid-0.50.12.dist-info → langroid-0.51.1.dist-info}/METADATA +1 -1
- {langroid-0.50.12.dist-info → langroid-0.51.1.dist-info}/RECORD +6 -6
- {langroid-0.50.12.dist-info → langroid-0.51.1.dist-info}/WHEEL +0 -0
- {langroid-0.50.12.dist-info → langroid-0.51.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import base64
|
3
4
|
import itertools
|
4
5
|
import logging
|
5
6
|
import os
|
@@ -148,8 +149,8 @@ class DocumentParser(Parser):
|
|
148
149
|
return UnstructuredPDFParser(source, config)
|
149
150
|
elif config.pdf.library == "pdf2image":
|
150
151
|
return ImagePdfParser(source, config)
|
151
|
-
elif config.pdf.library == "
|
152
|
-
return
|
152
|
+
elif config.pdf.library == "llm-pdf-parser":
|
153
|
+
return LLMPdfParser(source, config)
|
153
154
|
elif config.pdf.library == "marker":
|
154
155
|
return MarkerPdfParser(source, config)
|
155
156
|
else:
|
@@ -993,13 +994,13 @@ class MarkitdownPPTXParser(DocumentParser):
|
|
993
994
|
)
|
994
995
|
|
995
996
|
|
996
|
-
class
|
997
|
+
class LLMPdfParser(DocumentParser):
|
997
998
|
"""
|
998
|
-
This class converts PDFs to Markdown using
|
999
|
+
This class converts PDFs to Markdown using multimodal LLMs.
|
999
1000
|
|
1000
1001
|
It extracts pages, converts them with the LLM (replacing images with
|
1001
1002
|
detailed descriptions), and outputs Markdown page by page. The
|
1002
|
-
conversion follows `
|
1003
|
+
conversion follows `LLM_PDF_MD_SYSTEM_INSTRUCTION`. It employs
|
1003
1004
|
multiprocessing for speed, async requests with rate limiting, and
|
1004
1005
|
handles errors.
|
1005
1006
|
|
@@ -1008,9 +1009,9 @@ class GeminiPdfParser(DocumentParser):
|
|
1008
1009
|
"""
|
1009
1010
|
|
1010
1011
|
DEFAULT_MAX_TOKENS = 7000
|
1011
|
-
OUTPUT_DIR = Path(".
|
1012
|
+
OUTPUT_DIR = Path(".llm_pdfparser") # Fixed output directory
|
1012
1013
|
|
1013
|
-
|
1014
|
+
LLM_PDF_MD_SYSTEM_INSTRUCTION = """
|
1014
1015
|
### **Convert PDF to Markdown**
|
1015
1016
|
1. **Text:**
|
1016
1017
|
* Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
|
@@ -1035,11 +1036,11 @@ class GeminiPdfParser(DocumentParser):
|
|
1035
1036
|
|
1036
1037
|
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
|
1037
1038
|
super().__init__(source, config)
|
1038
|
-
if not config.pdf.
|
1039
|
+
if not config.pdf.llm_parser_config:
|
1039
1040
|
raise ValueError(
|
1040
|
-
"
|
1041
|
+
"LLMPdfParser requires a llm-based config in pdf parsing config"
|
1041
1042
|
)
|
1042
|
-
self.model_name = config.pdf.
|
1043
|
+
self.model_name = config.pdf.llm_parser_config.model_name
|
1043
1044
|
|
1044
1045
|
# Ensure output directory exists
|
1045
1046
|
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
@@ -1058,7 +1059,9 @@ class GeminiPdfParser(DocumentParser):
|
|
1058
1059
|
temp_file.close()
|
1059
1060
|
self.output_filename = Path(temp_file.name)
|
1060
1061
|
|
1061
|
-
self.max_tokens =
|
1062
|
+
self.max_tokens = (
|
1063
|
+
config.pdf.llm_parser_config.max_tokens or self.DEFAULT_MAX_TOKENS
|
1064
|
+
)
|
1062
1065
|
|
1063
1066
|
"""
|
1064
1067
|
If True, each PDF page is processed as a separate chunk,
|
@@ -1066,12 +1069,12 @@ class GeminiPdfParser(DocumentParser):
|
|
1066
1069
|
grouped into chunks based on `max_token_limit` before being sent
|
1067
1070
|
to the LLM.
|
1068
1071
|
"""
|
1069
|
-
self.split_on_page = config.pdf.
|
1072
|
+
self.split_on_page = config.pdf.llm_parser_config.split_on_page or False
|
1070
1073
|
|
1071
1074
|
# Rate limiting parameters
|
1072
1075
|
import asyncio
|
1073
1076
|
|
1074
|
-
self.requests_per_minute = config.pdf.
|
1077
|
+
self.requests_per_minute = config.pdf.llm_parser_config.requests_per_minute or 5
|
1075
1078
|
|
1076
1079
|
"""
|
1077
1080
|
A semaphore to control the number of concurrent requests to the LLM,
|
@@ -1175,7 +1178,7 @@ class GeminiPdfParser(DocumentParser):
|
|
1175
1178
|
"page_numbers": page_numbers, # List of page numbers in this chunk
|
1176
1179
|
}
|
1177
1180
|
|
1178
|
-
def
|
1181
|
+
def _prepare_pdf_chunks_for_llm(
|
1179
1182
|
self,
|
1180
1183
|
num_workers: Optional[int] = None,
|
1181
1184
|
max_tokens: int = DEFAULT_MAX_TOKENS,
|
@@ -1198,37 +1201,102 @@ class GeminiPdfParser(DocumentParser):
|
|
1198
1201
|
pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
|
1199
1202
|
return pdf_chunks
|
1200
1203
|
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
+
@staticmethod
|
1205
|
+
def _page_num_str(page_numbers: Any) -> str:
|
1206
|
+
"""
|
1207
|
+
Converts page numbers to a formatted string.
|
1208
|
+
"""
|
1209
|
+
if isinstance(page_numbers, list):
|
1210
|
+
if len(page_numbers) == 0:
|
1211
|
+
return ""
|
1212
|
+
return str(page_numbers[0]) + "-" + str(page_numbers[-1])
|
1213
|
+
elif isinstance(page_numbers, int):
|
1214
|
+
return str(page_numbers)
|
1215
|
+
else:
|
1216
|
+
return str(page_numbers).replace(" ", "-")
|
1217
|
+
|
1218
|
+
async def _send_chunk_to_llm(self, chunk: Dict[str, Any]) -> str:
|
1204
1219
|
"""
|
1205
|
-
Sends a PDF chunk to the
|
1220
|
+
Sends a PDF chunk to the LLM API and returns the response text.
|
1206
1221
|
Uses retries with exponential backoff to handle transient failures.
|
1207
1222
|
"""
|
1208
1223
|
import asyncio
|
1209
1224
|
import logging
|
1210
1225
|
|
1211
|
-
from
|
1212
|
-
from google.genai import types
|
1226
|
+
from langroid.language_models.openai_gpt import OpenAIGPT, OpenAIGPTConfig
|
1213
1227
|
|
1214
1228
|
async with self.semaphore: # Limit concurrent API requests
|
1215
1229
|
for attempt in range(self.max_retries):
|
1216
1230
|
try:
|
1217
|
-
|
1231
|
+
llm_config = OpenAIGPTConfig(
|
1232
|
+
chat_model=self.model_name,
|
1233
|
+
max_output_tokens=self.max_tokens,
|
1234
|
+
)
|
1235
|
+
llm = OpenAIGPT(config=llm_config)
|
1236
|
+
page_nums = self._page_num_str(chunk.get("page_numbers", "?"))
|
1237
|
+
base64_string = base64.b64encode(chunk["pdf_bytes"]).decode("utf-8")
|
1238
|
+
data_uri = f"data:application/pdf;base64,{base64_string}"
|
1239
|
+
if "gemini" in self.model_name.lower():
|
1240
|
+
file_content = dict(
|
1241
|
+
type="image_url",
|
1242
|
+
image_url=dict(url=data_uri),
|
1243
|
+
)
|
1244
|
+
elif "claude" in self.model_name.lower():
|
1245
|
+
# optimistically try this: some API proxies like litellm
|
1246
|
+
# support this, and others may not.
|
1247
|
+
file_content = dict(
|
1248
|
+
type="file",
|
1249
|
+
file=dict(
|
1250
|
+
file_data=data_uri,
|
1251
|
+
),
|
1252
|
+
)
|
1253
|
+
else:
|
1254
|
+
# fallback: assume file upload is similar to OpenAI API
|
1255
|
+
file_content = dict(
|
1256
|
+
type="file",
|
1257
|
+
file=dict(
|
1258
|
+
filename=f"pages-{page_nums}.pdf",
|
1259
|
+
file_data=data_uri,
|
1260
|
+
),
|
1261
|
+
)
|
1218
1262
|
|
1219
1263
|
# Send the request with PDF content and system instructions
|
1220
|
-
response = await
|
1221
|
-
model=self.model_name,
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1264
|
+
response = await llm.async_client.chat.completions.create( # type: ignore
|
1265
|
+
model=self.model_name.split("/")[-1],
|
1266
|
+
messages=[
|
1267
|
+
dict(
|
1268
|
+
role="system",
|
1269
|
+
content="""
|
1270
|
+
You are an expert pdf -> markdown converter.
|
1271
|
+
Do NOT use any triple backquotes when you present the
|
1272
|
+
markdown content,like ```markdown etc.
|
1273
|
+
FAITHFULLY CONVERT THE PDF TO MARKDOWN,
|
1274
|
+
retaining ALL content as you find it.
|
1275
|
+
""",
|
1276
|
+
),
|
1277
|
+
dict( # type: ignore
|
1278
|
+
role="user",
|
1279
|
+
content=[
|
1280
|
+
dict(
|
1281
|
+
type="text",
|
1282
|
+
text=self.LLM_PDF_MD_SYSTEM_INSTRUCTION,
|
1283
|
+
),
|
1284
|
+
file_content,
|
1285
|
+
],
|
1225
1286
|
),
|
1226
|
-
self.GEMINI_SYSTEM_INSTRUCTION,
|
1227
1287
|
],
|
1228
1288
|
)
|
1229
1289
|
|
1230
1290
|
# Return extracted text if available
|
1231
|
-
return
|
1291
|
+
return (
|
1292
|
+
""
|
1293
|
+
if (
|
1294
|
+
response is None
|
1295
|
+
or not hasattr(response, "choices")
|
1296
|
+
or not isinstance(response.choices, list)
|
1297
|
+
)
|
1298
|
+
else (response.choices[0].message.content)
|
1299
|
+
)
|
1232
1300
|
|
1233
1301
|
except Exception as e:
|
1234
1302
|
# Log error with page numbers for debugging
|
@@ -1246,33 +1314,34 @@ class GeminiPdfParser(DocumentParser):
|
|
1246
1314
|
await asyncio.sleep(delay)
|
1247
1315
|
else:
|
1248
1316
|
# Log failure after max retries
|
1317
|
+
page_nums = chunk.get("page_numbers", "Unknown")
|
1249
1318
|
logging.error(
|
1250
|
-
"
|
1251
|
-
|
1319
|
+
f"""
|
1320
|
+
Max retries reached for pages {page_nums}.
|
1321
|
+
It is possible your LLM API provider for
|
1322
|
+
the model {self.model_name} does not support
|
1323
|
+
file uploads via an OpenAI-compatible API.
|
1324
|
+
""",
|
1252
1325
|
)
|
1253
1326
|
break
|
1254
|
-
|
1255
1327
|
return "" # Return empty string if all retries fail
|
1256
1328
|
|
1257
|
-
async def process_chunks(
|
1258
|
-
self, chunks: List[Dict[str, Any]], api_key: str
|
1259
|
-
) -> List[str]:
|
1329
|
+
async def process_chunks(self, chunks: List[Dict[str, Any]]) -> List[str]:
|
1260
1330
|
"""
|
1261
|
-
Processes PDF chunks by sending them to the
|
1331
|
+
Processes PDF chunks by sending them to the LLM API and
|
1262
1332
|
collecting the results.
|
1263
1333
|
|
1264
1334
|
Args:
|
1265
1335
|
chunks: A list of dictionaries, where each dictionary represents
|
1266
1336
|
a PDF chunk and contains the PDF data and page numbers.
|
1267
|
-
api_key: The Gemini API key.
|
1268
1337
|
"""
|
1269
1338
|
# To show nice progress bar
|
1270
1339
|
from tqdm.asyncio import tqdm_asyncio
|
1271
1340
|
|
1272
|
-
# Create a list of asynchronous tasks to send each chunk to
|
1341
|
+
# Create a list of asynchronous tasks to send each chunk to the LLM.
|
1273
1342
|
# Chunk in this case might be single page or group of pages returned
|
1274
1343
|
# by the _prepare_pdf_chunks_for_llm method
|
1275
|
-
tasks = [self.
|
1344
|
+
tasks = [self._send_chunk_to_llm(chunk) for chunk in chunks]
|
1276
1345
|
|
1277
1346
|
# Gather the results from all tasks, allowing exceptions to be returned.
|
1278
1347
|
# tqdm_asyncio is wrapper around asyncio.gather
|
@@ -1311,7 +1380,7 @@ class GeminiPdfParser(DocumentParser):
|
|
1311
1380
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
1312
1381
|
"""
|
1313
1382
|
Iterates over the document pages, extracting content using the
|
1314
|
-
|
1383
|
+
LLM API, saves them to a markdown file, and yields page numbers
|
1315
1384
|
along with their corresponding content.
|
1316
1385
|
|
1317
1386
|
Yields:
|
@@ -1319,14 +1388,8 @@ class GeminiPdfParser(DocumentParser):
|
|
1319
1388
|
(int) and the page content (Any).
|
1320
1389
|
"""
|
1321
1390
|
import asyncio
|
1322
|
-
import os
|
1323
1391
|
|
1324
|
-
# Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
|
1325
1392
|
load_dotenv()
|
1326
|
-
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
1327
|
-
if not gemini_api_key:
|
1328
|
-
raise ValueError("GEMINI_API_KEY not found in environment variables.")
|
1329
|
-
|
1330
1393
|
try:
|
1331
1394
|
# This involves extracting pages, grouping them according to the
|
1332
1395
|
# `max_tokens` limit (if `split_on_page` is False), and
|
@@ -1335,18 +1398,16 @@ class GeminiPdfParser(DocumentParser):
|
|
1335
1398
|
# PDF bytes and the associated page numbers or single page if
|
1336
1399
|
# `split_on_page` is true
|
1337
1400
|
|
1338
|
-
pdf_chunks = self.
|
1401
|
+
pdf_chunks = self._prepare_pdf_chunks_for_llm(
|
1339
1402
|
num_workers=8,
|
1340
1403
|
max_tokens=self.max_tokens,
|
1341
1404
|
split_on_page=self.split_on_page,
|
1342
1405
|
)
|
1343
1406
|
|
1344
1407
|
# We asynchronously process each chunk, sending it
|
1345
|
-
# to
|
1408
|
+
# to the LLM and retrieving the Markdown output. It handles rate
|
1346
1409
|
# limiting and retries.
|
1347
|
-
markdown_results = asyncio.run(
|
1348
|
-
self.process_chunks(pdf_chunks, gemini_api_key)
|
1349
|
-
)
|
1410
|
+
markdown_results = asyncio.run(self.process_chunks(pdf_chunks))
|
1350
1411
|
|
1351
1412
|
# This file serves as an intermediate storage location for the
|
1352
1413
|
# complete Markdown output.
|
langroid/parsing/parser.py
CHANGED
@@ -36,10 +36,10 @@ class BaseParsingConfig(BaseSettings):
|
|
36
36
|
extra = "ignore" # Ignore unknown settings
|
37
37
|
|
38
38
|
|
39
|
-
class
|
40
|
-
"""Configuration for
|
39
|
+
class LLMPdfParserConfig(BaseSettings):
|
40
|
+
"""Configuration for LLM-based parsing."""
|
41
41
|
|
42
|
-
model_name: str = "gemini-2.0-flash" # Default model
|
42
|
+
model_name: str = "gemini/gemini-2.0-flash" # Default model
|
43
43
|
max_tokens: Optional[int] = None
|
44
44
|
split_on_page: Optional[bool] = True
|
45
45
|
requests_per_minute: Optional[int] = 5
|
@@ -60,10 +60,10 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
60
60
|
"unstructured",
|
61
61
|
"pdf2image",
|
62
62
|
"markitdown",
|
63
|
-
"
|
63
|
+
"llm-pdf-parser",
|
64
64
|
"marker",
|
65
65
|
] = "pymupdf4llm"
|
66
|
-
|
66
|
+
llm_parser_config: Optional[LLMPdfParserConfig] = None
|
67
67
|
marker_config: Optional[MarkerConfig] = None
|
68
68
|
|
69
69
|
@root_validator(pre=True)
|
@@ -71,10 +71,10 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
71
71
|
"""Ensure correct config is set based on library selection."""
|
72
72
|
library = values.get("library")
|
73
73
|
|
74
|
-
if library == "
|
75
|
-
values.setdefault("
|
74
|
+
if library == "llm-pdf-parser":
|
75
|
+
values.setdefault("llm_parser_config", LLMPdfParserConfig())
|
76
76
|
else:
|
77
|
-
values["
|
77
|
+
values["llm_parser_config"] = None
|
78
78
|
|
79
79
|
if library == "marker":
|
80
80
|
values.setdefault("marker_config", MarkerConfig())
|
@@ -82,11 +82,11 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
|
|
82
82
|
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
|
83
83
|
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
|
84
84
|
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
|
85
|
-
langroid/parsing/document_parser.py,sha256=
|
85
|
+
langroid/parsing/document_parser.py,sha256=7_pHu-_yQOETtDATv5VRdVSvac9kJRuZiwQ6EbJqJ_o,57403
|
86
86
|
langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
|
87
87
|
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
|
88
88
|
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
|
89
|
-
langroid/parsing/parser.py,sha256=
|
89
|
+
langroid/parsing/parser.py,sha256=Tbe1mQ7wp6GVx2xMWv1raIkpepTN0qNrqOxakWY6Zkc,15437
|
90
90
|
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
|
91
91
|
langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
|
92
92
|
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
|
@@ -129,7 +129,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
|
|
129
129
|
langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
|
130
130
|
langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
|
131
131
|
langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
|
132
|
-
langroid-0.
|
133
|
-
langroid-0.
|
134
|
-
langroid-0.
|
135
|
-
langroid-0.
|
132
|
+
langroid-0.51.1.dist-info/METADATA,sha256=9E0M5JzLk_fuMOLH918i7fIBwWKMm1O6J3VY8DoG3NM,63641
|
133
|
+
langroid-0.51.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
134
|
+
langroid-0.51.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
|
135
|
+
langroid-0.51.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|