langroid 0.42.10__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/chat_agent.py +16 -2
- langroid/parsing/document_parser.py +411 -1
- langroid/parsing/parser.py +33 -3
- langroid/utils/system.py +6 -1
- {langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/METADATA +3 -1
- {langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/RECORD +8 -8
- {langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/WHEEL +0 -0
- {langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/licenses/LICENSE +0 -0
langroid/agent/chat_agent.py
CHANGED
@@ -1069,6 +1069,13 @@ class ChatAgent(Agent):
         was enabled, disables it for the tool, else triggers strict recovery.
         """
         self.tool_error = False
+        most_recent_sent_by_llm = (
+            len(self.message_history) > 0
+            and self.message_history[-1].role == Role.ASSISTANT
+        )
+        was_llm = most_recent_sent_by_llm or (
+            isinstance(msg, ChatDocument) and msg.metadata.sender == Entity.LLM
+        )
         try:
             tools = super().get_tool_messages(msg, all_tools)
         except ValidationError as ve:
@@ -1099,9 +1106,16 @@ class ChatAgent(Agent):
             if isinstance(msg, ChatDocument):
                 self.tool_error = msg.metadata.sender == Entity.LLM
             else:
-                self.tool_error =
+                self.tool_error = most_recent_sent_by_llm
 
-
+            if was_llm:
+                raise ve
+            else:
+                self.tool_error = False
+                return []
+
+        if not was_llm:
+            self.tool_error = False
 
         return tools
 
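Taken together, this hunk narrows when a tool `ValidationError` escalates: the error is re-raised only when the offending message came from the LLM (the last history entry is an assistant turn, or the `ChatDocument` sender is `Entity.LLM`); for any other sender the error is swallowed and no tools are returned. A toy restatement of that gate, using stand-in types rather than the actual langroid classes:

```python
# Stand-in sketch of the new gating logic; plain strings replace langroid's
# Role/Entity enums, and ValueError stands in for pydantic's ValidationError.
from typing import List


def get_tool_messages(history_roles: List[str], sender: str) -> List[str]:
    most_recent_sent_by_llm = bool(history_roles) and history_roles[-1] == "ASSISTANT"
    was_llm = most_recent_sent_by_llm or sender == "LLM"
    try:
        raise ValueError("malformed tool call")  # simulate a failed tool parse
    except ValueError:
        if was_llm:
            raise  # strict recovery: the LLM must see and repair its own error
        return []  # non-LLM sender: fail soft, extract no tools


print(get_tool_messages([], sender="USER"))        # [] -- error swallowed
# get_tool_messages(["ASSISTANT"], sender="USER")  # would re-raise
```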
langroid/parsing/document_parser.py
CHANGED
@@ -9,7 +9,9 @@ from enum import Enum
 from io import BytesIO
 from itertools import accumulate
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union
+
+from dotenv import load_dotenv
 
 from langroid.exceptions import LangroidImportError
 from langroid.utils.object_registry import ObjectRegistry
@@ -163,6 +165,8 @@ class DocumentParser(Parser):
             return UnstructuredPDFParser(source, config)
         elif config.pdf.library == "pdf2image":
             return ImagePdfParser(source, config)
+        elif config.pdf.library == "gemini":
+            return GeminiPdfParser(source, config)
         else:
             raise ValueError(
                 f"Unsupported PDF library specified: {config.pdf.library}"
@@ -954,3 +958,409 @@ class MarkitdownPPTXParser(DocumentParser):
             content=self.fix_text(md_content),
             metadata=DocMetaData(source=self.source),
         )
+
+
+class GeminiPdfParser(DocumentParser):
+    """
+    This class converts PDFs to Markdown using Gemini multimodal LLMs.
+
+    It extracts pages, converts them with the LLM (replacing images with
+    detailed descriptions), and outputs Markdown page by page. The
+    conversion follows `GEMINI_SYSTEM_INSTRUCTION`. It employs
+    multiprocessing for speed, async requests with rate limiting, and
+    handles errors.
+
+    It supports page-by-page splitting or chunking multiple pages into
+    one, respecting page boundaries and a `max_token_limit`.
+    """
+
+    DEFAULT_MAX_TOKENS = 7000
+    OUTPUT_DIR = Path(".gemini_pdfparser")  # Fixed output directory
+
+    GEMINI_SYSTEM_INSTRUCTION = """
+    ### **Convert PDF to Markdown**
+    1. **Text:**
+        * Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
+        * **Remove running heads (page numbers, headers/footers).**
+        * Keep section and chapter titles; discard repeated page headers.
+    2. **Images:** Replace with **detailed, creative descriptions**
+        optimized for clarity and understanding.
+    3. **Tables:** Convert to Markdown tables with proper structure.
+    4. **Math:** Use LaTeX (`$...$` inline, `$$...$$` block).
+    5. **Code:** Wrap in fenced blocks without specifying a language:
+
+        ```
+        code
+        ```
+    6. **Clean Output:**
+        * No system messages, metadata, or artifacts or ```markdown``` identifier.
+        * Do **not** include introductory or explanatory messages
+          like "Here is your output."
+        * Ensure formatting is **consistent and structured**
+          for feeding into a markdown parser.
+    """.strip()
+
+    def __init__(self, source: Union[str, bytes], config: ParsingConfig):
+        super().__init__(source, config)
+        if not config.pdf.gemini_config:
+            raise ValueError(
+                "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
+            )
+        self.model_name = config.pdf.gemini_config.model_name
+
+        # Ensure output directory exists
+        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+        prefix = (
+            Path(source).stem + "_"
+            if isinstance(source, str) and Path(source).exists()
+            else "output_"
+        )
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".md",
+            prefix=prefix,
+            dir=str(self.OUTPUT_DIR),
+            delete=False,
+        )
+        temp_file.close()
+        self.output_filename = Path(temp_file.name)
+
+        self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS
+
+        """
+        If True, each PDF page is processed as a separate chunk,
+        resulting in one LLM request per page. If False, pages are
+        grouped into chunks based on `max_token_limit` before being sent
+        to the LLM.
+        """
+        self.split_on_page = config.pdf.gemini_config.split_on_page or False
+
+        # Rate limiting parameters
+        import asyncio
+
+        self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5
+
+        """
+        A semaphore to control the number of concurrent requests to the LLM,
+        preventing rate limit errors. A semaphore slot is acquired before
+        making an LLM request and released after the request is complete.
+        """
+        self.semaphore = asyncio.Semaphore(self.requests_per_minute)
+        self.retry_delay = 5  # seconds, for exponential backoff
+        self.max_retries = 3
+
+    def _extract_page(self, page_num: int) -> Dict[str, Any]:
+        """
+        Extracts a single page and estimates token count.
+        Opens the PDF from self.doc_bytes (a BytesIO object).
+        """
+        import fitz
+
+        try:
+            # Always open the document from in-memory bytes.
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            new_pdf = fitz.open()
+            new_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
+            pdf_bytes = new_pdf.write()
+            text = doc[page_num].get_text("text")
+            token_count = len(text) // 4 if text else len(pdf_bytes) // 4
+
+            return {
+                "page_numbers": page_num + 1,
+                "pdf_bytes": pdf_bytes,
+                "token_count": token_count,
+            }
+        except Exception as e:
+            raise ValueError(f"Error processing PDF document: {e}") from e
+
+    def _extract_pdf_pages_parallel(
+        self, num_workers: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
+        """Parallel PDF page extraction using self.doc_bytes."""
+        from multiprocessing import Pool, cpu_count
+
+        import fitz
+        from tqdm import tqdm
+
+        try:
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            total_pages = len(doc)
+        except Exception as e:
+            raise ValueError(f"Error opening PDF document: {e}") from e
+
+        num_workers = num_workers or cpu_count()
+        with Pool(num_workers) as pool:
+            with tqdm(total=total_pages, desc="Extracting pages", unit="page") as pbar:
+                results = []
+                for result in pool.imap(self._extract_page, range(total_pages)):
+                    results.append(result)
+                    pbar.update(1)
+
+        return results
+
+    def _group_pages_by_token_limit(
+        self, pages: List[Dict[str, Any]], max_tokens: int = DEFAULT_MAX_TOKENS
+    ) -> List[List[Dict[str, Any]]]:
+        """Groups pages into chunks where each chunk is approximately `max_tokens`."""
+        chunks: List[List[Dict[str, Any]]] = []
+        current_chunk: List[Dict[str, Any]] = []
+        current_tokens = 0
+
+        for page in pages:
+            if current_tokens + page["token_count"] > max_tokens and current_chunk:
+                chunks.append(current_chunk)
+                current_chunk = []
+                current_tokens = 0
+
+            current_chunk.append(page)
+            current_tokens += page["token_count"]
+
+        if current_chunk:  # Add remaining pages
+            chunks.append(current_chunk)
+
+        return chunks
+
+    def _merge_pages_into_pdf_with_metadata(
+        self, page_group: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Merges grouped pages into a single binary chunk so that
+        it does not exceed max token limit
+        """
+        import fitz
+
+        merged_pdf = fitz.open()
+        page_numbers = []
+
+        for page in page_group:
+            temp_pdf = fitz.open("pdf", page["pdf_bytes"])
+            merged_pdf.insert_pdf(temp_pdf)
+            page_numbers.append(page["page_numbers"])
+
+        return {
+            "pdf_bytes": merged_pdf.write(),  # Binary PDF data
+            "page_numbers": page_numbers,  # List of page numbers in this chunk
+        }
+
+    def _prepare_pdf_chunks_for_gemini(
+        self,
+        num_workers: Optional[int] = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        split_on_page: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Extracts, groups, and merges PDF pages into chunks with embedded page markers.
+        """
+        from multiprocessing import Pool
+
+        pages = self._extract_pdf_pages_parallel(num_workers)
+
+        if split_on_page:
+            # Each page becomes its own chunk
+            return pages
+        else:
+            # Group pages based on token limit
+            chunks = self._group_pages_by_token_limit(pages, max_tokens)
+            with Pool(num_workers) as pool:
+                pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
+            return pdf_chunks
+
+    async def _send_chunk_to_gemini(
+        self, chunk: Dict[str, Any], gemini_api_key: str
+    ) -> str:
+        """
+        Sends a PDF chunk to the Gemini API and returns the response text.
+        Uses retries with exponential backoff to handle transient failures.
+        """
+        import asyncio
+        import logging
+
+        from google import genai
+        from google.genai import types
+
+        async with self.semaphore:  # Limit concurrent API requests
+            for attempt in range(self.max_retries):
+                try:
+                    client = genai.Client(api_key=gemini_api_key)
+
+                    # Send the request with PDF content and system instructions
+                    response = await client.aio.models.generate_content(
+                        model=self.model_name,
+                        contents=[
+                            types.Part.from_bytes(
+                                data=chunk["pdf_bytes"], mime_type="application/pdf"
+                            ),
+                            self.GEMINI_SYSTEM_INSTRUCTION,
+                        ],
+                    )
+
+                    # Return extracted text if available
+                    return str(response.text) if response.text else ""
+
+                except Exception as e:
+                    # Log error with page numbers for debugging
+                    logging.error(
+                        "Attempt %d failed for pages %s: %s",
+                        attempt + 1,
+                        chunk.get("page_numbers", "Unknown"),
+                        e,
+                    )
+
+                    if attempt < self.max_retries - 1:
+                        # Apply exponential backoff before retrying
+                        delay = self.retry_delay * (2**attempt)
+                        logging.info("Retrying in %s sec...", delay)
+                        await asyncio.sleep(delay)
+                    else:
+                        # Log failure after max retries
+                        logging.error(
+                            "Max retries reached for pages %s",
+                            chunk.get("page_numbers", "Unknown"),
+                        )
+                        break
+
+        return ""  # Return empty string if all retries fail
+
+    async def process_chunks(
+        self, chunks: List[Dict[str, Any]], api_key: str
+    ) -> List[str]:
+        """
+        Processes PDF chunks by sending them to the Gemini API and
+        collecting the results.
+
+        Args:
+            chunks: A list of dictionaries, where each dictionary represents
+                a PDF chunk and contains the PDF data and page numbers.
+            api_key: The Gemini API key.
+        """
+        # To show nice progress bar
+        from tqdm.asyncio import tqdm_asyncio
+
+        # Create a list of asynchronous tasks to send each chunk to Gemini.
+        # Chunk in this case might be single page or group of pages returned
+        # by prepare_pdf_chunks function
+        tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]
+
+        # Gather the results from all tasks, allowing exceptions to be returned.
+        # tqdm_asyncio is wrapper around asyncio.gather
+        gathered_results = await tqdm_asyncio.gather(
+            *tasks, desc="Processing chunks(pages)", unit="chunk"
+        )
+        results = []
+        for i, result in enumerate(gathered_results):
+            chunk = chunks[i]  # Get the corresponding chunk.
+
+            if isinstance(result, Exception):
+                # Handle exceptions that occurred during chunk processing.
+                logging.error(
+                    "Failed to process chunk %s: %s",
+                    chunk.get("page_numbers", "Unknown"),
+                    result,
+                )
+                results.append(
+                    "<!----Error: Could not process chunk %s---->"
+                    % chunk.get("page_numbers", "Unknown")
+                )
+            else:
+                # Process successful results and append page/chunk markers.
+                markdown = str(result)
+                if self.split_on_page:
+                    results.append(
+                        markdown + f"<!----Page-{chunk['page_numbers']}---->"
+                    )
+                else:
+                    results.append(
+                        markdown + f"<!----Chunk-{chunk['page_numbers']}---->"
+                    )
+
+        return results  # Return the list of results.
+
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Iterates over the document pages, extracting content using the
+        Gemini API, saves them to a markdown file, and yields page numbers
+        along with their corresponding content.
+
+        Yields:
+            A generator of tuples, where each tuple contains the page number
+            (int) and the page content (Any).
+        """
+        import asyncio
+        import os
+
+        # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
+        load_dotenv()
+        gemini_api_key = os.getenv("GEMINI_API_KEY")
+        if not gemini_api_key:
+            raise ValueError("GEMINI_API_KEY not found in environment variables.")
+
+        try:
+            # This involves extracting pages, grouping them according to the
+            # `max_tokens` limit (if `split_on_page` is False), and
+            # merging pages into larger PDF chunks. The result
+            # is a list of dictionaries, where each dictionary contains the
+            # PDF bytes and the associated page numbers or single page if
+            # `split_on_page` is true
+
+            pdf_chunks = self._prepare_pdf_chunks_for_gemini(
+                num_workers=8,
+                max_tokens=self.max_tokens,
+                split_on_page=self.split_on_page,
+            )
+
+            # We asynchronously process each chunk, sending it
+            # to Gemini and retrieving the Markdown output. It handles rate
+            # limiting and retries.
+            markdown_results = asyncio.run(
+                self.process_chunks(pdf_chunks, gemini_api_key)
+            )
+
+            # This file serves as an intermediate storage location for the
+            # complete Markdown output.
+            with open(self.output_filename, "w", encoding="utf-8") as outfile:
+                outfile.write("\n\n".join(markdown_results))
+
+            # Read the full Markdown content from the temporary file.
+            with open(self.output_filename, "r", encoding="utf-8") as infile:
+                full_markdown = infile.read()
+
+            # The splitting is based on the `split_on_page` setting. If True,
+            # the Markdown is split using the "Page-" marker. Otherwise, it's
+            # split using the "Chunk-" marker.
+            if self.split_on_page:
+                pages = full_markdown.split("<!----Page-")
+            else:
+                pages = full_markdown.split("<!----Chunk-")
+
+            # Remove the first element if it's empty (due to the split).
+            if pages and pages[0] == "":
+                pages = pages[1:]
+
+            # Iterate over the pages or chunks and yield their content.
+            for i, page in enumerate(pages):
+                # Check for errors during processing.
+                if "<!----Error:" in page:
+                    page_content = page
+                    logging.warning(f"Page {i}: Error processing chunk.")
+                else:
+                    # Extract the actual page content by removing the marker.
+                    page_content = (
+                        page.split("---->", 1)[1]
+                        if len(page.split("---->", 1)) > 1
+                        else page
+                    )
+
+                # Yield the page number and content.
+                yield i, page_content
+
+        except Exception as e:
+            raise ValueError(f"Error processing document: {e}") from e
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get a Document object from a given markdown page.
+        """
+        return Document(
+            content=page,
+            metadata=DocMetaData(source=self.source),
+        )
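For orientation, a hedged usage sketch of the new parser follows. The factory call (`DocumentParser.create`), the exact import locations, and the sample file name are assumptions not confirmed by this diff; the config classes, the `"gemini"` library value, and the `GEMINI_API_KEY` requirement all come from the hunks above.

```python
# Hedged usage sketch: DocumentParser.create is an assumed factory name,
# inferred from the dispatch hunk above that returns GeminiPdfParser when
# config.pdf.library == "gemini".
import os

from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import GeminiConfig, ParsingConfig, PdfParsingConfig

os.environ.setdefault("GEMINI_API_KEY", "...")  # checked inside iterate_pages()

config = ParsingConfig(
    pdf=PdfParsingConfig(
        library="gemini",
        gemini_config=GeminiConfig(
            model_name="gemini-2.0-flash",
            split_on_page=True,     # one Gemini request per PDF page
            requests_per_minute=5,  # sizes the rate-limiting semaphore
        ),
    )
)

parser = DocumentParser.create("sample.pdf", config)  # assumed API
for page_num, markdown in parser.iterate_pages():
    print(page_num, markdown[:80])
```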
langroid/parsing/parser.py
CHANGED
@@ -1,13 +1,13 @@
 import logging
 import re
 from enum import Enum
-from typing import Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional
 
 import tiktoken
 
 from langroid.mytypes import Document
 from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings
+from langroid.pydantic_v1 import BaseSettings, root_validator
 from langroid.utils.object_registry import ObjectRegistry
 
 logger = logging.getLogger(__name__)
@@ -20,7 +20,26 @@ class Splitter(str, Enum):
     SIMPLE = "simple"
 
 
-class PdfParsingConfig(BaseSettings):
+class BaseParsingConfig(BaseSettings):
+    """Base class for document parsing configurations."""
+
+    library: str
+
+    class Config:
+        extra = "ignore"  # Ignore unknown settings
+
+
+class GeminiConfig(BaseSettings):
+    """Configuration for Gemini-based parsing."""
+
+    model_name: str = "gemini-2.0-flash"  # Default model
+    max_tokens: Optional[int] = None
+    split_on_page: Optional[bool] = True
+    requests_per_minute: Optional[int] = 5
+
+
+class PdfParsingConfig(BaseParsingConfig):
+
     library: Literal[
         "fitz",
         "pymupdf4llm",
@@ -29,7 +48,18 @@ class PdfParsingConfig(BaseSettings):
         "unstructured",
         "pdf2image",
         "markitdown",
+        "gemini",
     ] = "pymupdf4llm"
+    gemini_config: Optional[GeminiConfig] = None
+
+    @root_validator(pre=True)
+    def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure GeminiConfig is set only when library is 'gemini'."""
+        if values.get("library") == "gemini":
+            values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
+        else:
+            values["gemini_config"] = None
+        return values
 
 
 class DocxParsingConfig(BaseSettings):
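The practical effect of the validator is worth spelling out: `gemini_config` is auto-populated exactly when `library == "gemini"` and forced to `None` otherwise. A minimal sketch of the expected behavior, with assertions inferred from the validator body above:

```python
from langroid.parsing.parser import GeminiConfig, PdfParsingConfig

cfg = PdfParsingConfig(library="gemini")
assert cfg.gemini_config is not None  # defaulted to GeminiConfig()
assert cfg.gemini_config.model_name == "gemini-2.0-flash"

cfg2 = PdfParsingConfig(library="fitz", gemini_config=GeminiConfig())
assert cfg2.gemini_config is None  # dropped for non-gemini libraries
```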
langroid/utils/system.py
CHANGED
@@ -14,7 +14,12 @@ from typing import Any, Literal
 
 logger = logging.getLogger(__name__)
 
-DELETION_ALLOWED_PATHS = [
+DELETION_ALLOWED_PATHS = [
+    ".qdrant",
+    ".chroma",
+    ".lancedb",
+    ".weaviate",
+]
 
 
 def pydantic_major_version() -> int:
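This hunk rewrites `DELETION_ALLOWED_PATHS` with one entry per line (the old single-line form is truncated by the diff viewer); the guard that consumes the list is not shown in this diff. A hypothetical sketch of how such an allow-list is typically enforced — `safe_delete` is an illustrative name, not langroid's actual helper:

```python
from pathlib import Path

DELETION_ALLOWED_PATHS = [".qdrant", ".chroma", ".lancedb", ".weaviate"]


def safe_delete(path: str) -> None:  # hypothetical helper, for illustration only
    if Path(path).name not in DELETION_ALLOWED_PATHS:
        raise PermissionError(f"refusing to delete non-allow-listed path: {path!r}")
    # ...perform the actual deletion here
```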
{langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.42.10
+Version: 0.43.1
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -86,6 +86,8 @@ Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
 Provides-Extra: arango
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
 Requires-Dist: python-arango<9.0.0,>=8.1.2; extra == 'arango'
+Provides-Extra: asyncio
+Requires-Dist: asyncio>=3.4.3; extra == 'asyncio'
 Provides-Extra: chainlit
 Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
 Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
{langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/RECORD
CHANGED
@@ -5,7 +5,7 @@ langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
 langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
 langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
-langroid/agent/chat_agent.py,sha256=
+langroid/agent/chat_agent.py,sha256=be7GlySBCuZ4jGQzk0FdVKlqhGeAuewfDywmHDACjh8,84924
 langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
 langroid/agent/openai_assistant.py,sha256=JkAcs02bIrgPNVvUWVR06VCthc5-ulla2QMBzux_q6o,34340
 langroid/agent/task.py,sha256=HB6N-Jn80HFqCf0ZYOC1v3Bn3oO7NLjShHQJJFwW0q4,90557
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
 langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=tov34uYB_2ecq7-G7P7CWSOv5alcfwkrrwfsnCCVdIk,49714
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -110,7 +110,7 @@ langroid/utils/logging.py,sha256=mwxHimq1wtVQ64PvDyfJJ7Upj-rjHLNHgx8EC2wClvo,402
 langroid/utils/object_registry.py,sha256=iPz9GHzvmCeVoidB3JdAMEKcxJEqTdUr0otQEexDZ5s,2100
 langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
 langroid/utils/pydantic_utils.py,sha256=R7Ps8VP56-eSo-LYHWllFo-SJ2zDmdItuuYpUq2gGJ8,20854
-langroid/utils/system.py,sha256=
+langroid/utils/system.py,sha256=q3QJtTSapIwNe8MMhGEM03wgxPLmZiD47_sF1pKx53I,8472
 langroid/utils/types.py,sha256=-BvyIf_LmAJ5jR9NC7S4CSVNEr3XayAaxJ5o0TiIej0,2992
 langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
 langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
 langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
 langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
-langroid-0.42.10.dist-info/METADATA,sha256=
-langroid-0.42.10.dist-info/WHEEL,sha256=
-langroid-0.42.10.dist-info/licenses/LICENSE,sha256=
-langroid-0.42.10.dist-info/RECORD,,
+langroid-0.43.1.dist-info/METADATA,sha256=AQaUq3J9kszROM1HO3-8s9us3eGpSt9yJy7SI8eznkU,61773
+langroid-0.43.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.43.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.43.1.dist-info/RECORD,,
{langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/WHEEL
File without changes
{langroid-0.42.10.dist-info → langroid-0.43.1.dist-info}/licenses/LICENSE
File without changes