langroid 0.50.12__py3-none-any.whl → 0.51.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import base64
3
4
  import itertools
4
5
  import logging
5
6
  import os
@@ -148,8 +149,8 @@ class DocumentParser(Parser):
148
149
  return UnstructuredPDFParser(source, config)
149
150
  elif config.pdf.library == "pdf2image":
150
151
  return ImagePdfParser(source, config)
151
- elif config.pdf.library == "gemini":
152
- return GeminiPdfParser(source, config)
152
+ elif config.pdf.library == "llm-pdf-parser":
153
+ return LLMPdfParser(source, config)
153
154
  elif config.pdf.library == "marker":
154
155
  return MarkerPdfParser(source, config)
155
156
  else:
@@ -993,13 +994,13 @@ class MarkitdownPPTXParser(DocumentParser):
993
994
  )
994
995
 
995
996
 
996
- class GeminiPdfParser(DocumentParser):
997
+ class LLMPdfParser(DocumentParser):
997
998
  """
998
- This class converts PDFs to Markdown using Gemini multimodal LLMs.
999
+ This class converts PDFs to Markdown using multimodal LLMs.
999
1000
 
1000
1001
  It extracts pages, converts them with the LLM (replacing images with
1001
1002
  detailed descriptions), and outputs Markdown page by page. The
1002
- conversion follows `GEMINI_SYSTEM_INSTRUCTION`. It employs
1003
+ conversion follows `LLM_PDF_MD_SYSTEM_INSTRUCTION`. It employs
1003
1004
  multiprocessing for speed, async requests with rate limiting, and
1004
1005
  handles errors.
1005
1006
 
@@ -1008,9 +1009,9 @@ class GeminiPdfParser(DocumentParser):
1008
1009
  """
1009
1010
 
1010
1011
  DEFAULT_MAX_TOKENS = 7000
1011
- OUTPUT_DIR = Path(".gemini_pdfparser") # Fixed output directory
1012
+ OUTPUT_DIR = Path(".llm_pdfparser") # Fixed output directory
1012
1013
 
1013
- GEMINI_SYSTEM_INSTRUCTION = """
1014
+ LLM_PDF_MD_SYSTEM_INSTRUCTION = """
1014
1015
  ### **Convert PDF to Markdown**
1015
1016
  1. **Text:**
1016
1017
  * Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
@@ -1035,11 +1036,11 @@ class GeminiPdfParser(DocumentParser):
1035
1036
 
1036
1037
  def __init__(self, source: Union[str, bytes], config: ParsingConfig):
1037
1038
  super().__init__(source, config)
1038
- if not config.pdf.gemini_config:
1039
+ if not config.pdf.llm_parser_config:
1039
1040
  raise ValueError(
1040
- "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
1041
+ "LLMPdfParser requires a llm-based config in pdf parsing config"
1041
1042
  )
1042
- self.model_name = config.pdf.gemini_config.model_name
1043
+ self.model_name = config.pdf.llm_parser_config.model_name
1043
1044
 
1044
1045
  # Ensure output directory exists
1045
1046
  self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
@@ -1058,7 +1059,9 @@ class GeminiPdfParser(DocumentParser):
1058
1059
  temp_file.close()
1059
1060
  self.output_filename = Path(temp_file.name)
1060
1061
 
1061
- self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS
1062
+ self.max_tokens = (
1063
+ config.pdf.llm_parser_config.max_tokens or self.DEFAULT_MAX_TOKENS
1064
+ )
1062
1065
 
1063
1066
  """
1064
1067
  If True, each PDF page is processed as a separate chunk,
@@ -1066,12 +1069,12 @@ class GeminiPdfParser(DocumentParser):
1066
1069
  grouped into chunks based on `max_token_limit` before being sent
1067
1070
  to the LLM.
1068
1071
  """
1069
- self.split_on_page = config.pdf.gemini_config.split_on_page or False
1072
+ self.split_on_page = config.pdf.llm_parser_config.split_on_page or False
1070
1073
 
1071
1074
  # Rate limiting parameters
1072
1075
  import asyncio
1073
1076
 
1074
- self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5
1077
+ self.requests_per_minute = config.pdf.llm_parser_config.requests_per_minute or 5
1075
1078
 
1076
1079
  """
1077
1080
  A semaphore to control the number of concurrent requests to the LLM,
@@ -1175,7 +1178,7 @@ class GeminiPdfParser(DocumentParser):
1175
1178
  "page_numbers": page_numbers, # List of page numbers in this chunk
1176
1179
  }
1177
1180
 
1178
- def _prepare_pdf_chunks_for_gemini(
1181
+ def _prepare_pdf_chunks_for_llm(
1179
1182
  self,
1180
1183
  num_workers: Optional[int] = None,
1181
1184
  max_tokens: int = DEFAULT_MAX_TOKENS,
@@ -1198,37 +1201,102 @@ class GeminiPdfParser(DocumentParser):
1198
1201
  pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
1199
1202
  return pdf_chunks
1200
1203
 
1201
- async def _send_chunk_to_gemini(
1202
- self, chunk: Dict[str, Any], gemini_api_key: str
1203
- ) -> str:
1204
+ @staticmethod
1205
+ def _page_num_str(page_numbers: Any) -> str:
1206
+ """
1207
+ Converts page numbers to a formatted string.
1208
+ """
1209
+ if isinstance(page_numbers, list):
1210
+ if len(page_numbers) == 0:
1211
+ return ""
1212
+ return str(page_numbers[0]) + "-" + str(page_numbers[-1])
1213
+ elif isinstance(page_numbers, int):
1214
+ return str(page_numbers)
1215
+ else:
1216
+ return str(page_numbers).replace(" ", "-")
1217
+
1218
+ async def _send_chunk_to_llm(self, chunk: Dict[str, Any]) -> str:
1204
1219
  """
1205
- Sends a PDF chunk to the Gemini API and returns the response text.
1220
+ Sends a PDF chunk to the LLM API and returns the response text.
1206
1221
  Uses retries with exponential backoff to handle transient failures.
1207
1222
  """
1208
1223
  import asyncio
1209
1224
  import logging
1210
1225
 
1211
- from google import genai
1212
- from google.genai import types
1226
+ from langroid.language_models.openai_gpt import OpenAIGPT, OpenAIGPTConfig
1213
1227
 
1214
1228
  async with self.semaphore: # Limit concurrent API requests
1215
1229
  for attempt in range(self.max_retries):
1216
1230
  try:
1217
- client = genai.Client(api_key=gemini_api_key)
1231
+ llm_config = OpenAIGPTConfig(
1232
+ chat_model=self.model_name,
1233
+ max_output_tokens=self.max_tokens,
1234
+ )
1235
+ llm = OpenAIGPT(config=llm_config)
1236
+ page_nums = self._page_num_str(chunk.get("page_numbers", "?"))
1237
+ base64_string = base64.b64encode(chunk["pdf_bytes"]).decode("utf-8")
1238
+ data_uri = f"data:application/pdf;base64,{base64_string}"
1239
+ if "gemini" in self.model_name.lower():
1240
+ file_content = dict(
1241
+ type="image_url",
1242
+ image_url=dict(url=data_uri),
1243
+ )
1244
+ elif "claude" in self.model_name.lower():
1245
+ # optimistically try this: some API proxies like litellm
1246
+ # support this, and others may not.
1247
+ file_content = dict(
1248
+ type="file",
1249
+ file=dict(
1250
+ file_data=data_uri,
1251
+ ),
1252
+ )
1253
+ else:
1254
+ # fallback: assume file upload is similar to OpenAI API
1255
+ file_content = dict(
1256
+ type="file",
1257
+ file=dict(
1258
+ filename=f"pages-{page_nums}.pdf",
1259
+ file_data=data_uri,
1260
+ ),
1261
+ )
1218
1262
 
1219
1263
  # Send the request with PDF content and system instructions
1220
- response = await client.aio.models.generate_content(
1221
- model=self.model_name,
1222
- contents=[
1223
- types.Part.from_bytes(
1224
- data=chunk["pdf_bytes"], mime_type="application/pdf"
1264
+ response = await llm.async_client.chat.completions.create( # type: ignore
1265
+ model=self.model_name.split("/")[-1],
1266
+ messages=[
1267
+ dict(
1268
+ role="system",
1269
+ content="""
1270
+ You are an expert pdf -> markdown converter.
1271
+ Do NOT use any triple backquotes when you present the
1272
+ markdown content,like ```markdown etc.
1273
+ FAITHFULLY CONVERT THE PDF TO MARKDOWN,
1274
+ retaining ALL content as you find it.
1275
+ """,
1276
+ ),
1277
+ dict( # type: ignore
1278
+ role="user",
1279
+ content=[
1280
+ dict(
1281
+ type="text",
1282
+ text=self.LLM_PDF_MD_SYSTEM_INSTRUCTION,
1283
+ ),
1284
+ file_content,
1285
+ ],
1225
1286
  ),
1226
- self.GEMINI_SYSTEM_INSTRUCTION,
1227
1287
  ],
1228
1288
  )
1229
1289
 
1230
1290
  # Return extracted text if available
1231
- return str(response.text) if response.text else ""
1291
+ return (
1292
+ ""
1293
+ if (
1294
+ response is None
1295
+ or not hasattr(response, "choices")
1296
+ or not isinstance(response.choices, list)
1297
+ )
1298
+ else (response.choices[0].message.content)
1299
+ )
1232
1300
 
1233
1301
  except Exception as e:
1234
1302
  # Log error with page numbers for debugging
@@ -1246,33 +1314,34 @@ class GeminiPdfParser(DocumentParser):
1246
1314
  await asyncio.sleep(delay)
1247
1315
  else:
1248
1316
  # Log failure after max retries
1317
+ page_nums = chunk.get("page_numbers", "Unknown")
1249
1318
  logging.error(
1250
- "Max retries reached for pages %s",
1251
- chunk.get("page_numbers", "Unknown"),
1319
+ f"""
1320
+ Max retries reached for pages {page_nums}.
1321
+ It is possible your LLM API provider for
1322
+ the model {self.model_name} does not support
1323
+ file uploads via an OpenAI-compatible API.
1324
+ """,
1252
1325
  )
1253
1326
  break
1254
-
1255
1327
  return "" # Return empty string if all retries fail
1256
1328
 
1257
- async def process_chunks(
1258
- self, chunks: List[Dict[str, Any]], api_key: str
1259
- ) -> List[str]:
1329
+ async def process_chunks(self, chunks: List[Dict[str, Any]]) -> List[str]:
1260
1330
  """
1261
- Processes PDF chunks by sending them to the Gemini API and
1331
+ Processes PDF chunks by sending them to the LLM API and
1262
1332
  collecting the results.
1263
1333
 
1264
1334
  Args:
1265
1335
  chunks: A list of dictionaries, where each dictionary represents
1266
1336
  a PDF chunk and contains the PDF data and page numbers.
1267
- api_key: The Gemini API key.
1268
1337
  """
1269
1338
  # To show nice progress bar
1270
1339
  from tqdm.asyncio import tqdm_asyncio
1271
1340
 
1272
- # Create a list of asynchronous tasks to send each chunk to Gemini.
1341
+ # Create a list of asynchronous tasks to send each chunk to the LLM.
1273
1342
  # Chunk in this case might be single page or group of pages returned
1274
1343
  # by prepare_pdf_chunks function
1275
- tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]
1344
+ tasks = [self._send_chunk_to_llm(chunk) for chunk in chunks]
1276
1345
 
1277
1346
  # Gather the results from all tasks, allowing exceptions to be returned.
1278
1347
  # tqdm_asyncio is a wrapper around asyncio.gather
@@ -1311,7 +1380,7 @@ class GeminiPdfParser(DocumentParser):
1311
1380
  def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
1312
1381
  """
1313
1382
  Iterates over the document pages, extracting content using the
1314
- Gemini API, saves them to a markdown file, and yields page numbers
1383
+ LLM API, saves them to a markdown file, and yields page numbers
1315
1384
  along with their corresponding content.
1316
1385
 
1317
1386
  Yields:
@@ -1319,14 +1388,8 @@ class GeminiPdfParser(DocumentParser):
1319
1388
  (int) and the page content (Any).
1320
1389
  """
1321
1390
  import asyncio
1322
- import os
1323
1391
 
1324
- # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
1325
1392
  load_dotenv()
1326
- gemini_api_key = os.getenv("GEMINI_API_KEY")
1327
- if not gemini_api_key:
1328
- raise ValueError("GEMINI_API_KEY not found in environment variables.")
1329
-
1330
1393
  try:
1331
1394
  # This involves extracting pages, grouping them according to the
1332
1395
  # `max_tokens` limit (if `split_on_page` is False), and
@@ -1335,18 +1398,16 @@ class GeminiPdfParser(DocumentParser):
1335
1398
  # PDF bytes and the associated page numbers or single page if
1336
1399
  # `split_on_page` is true
1337
1400
 
1338
- pdf_chunks = self._prepare_pdf_chunks_for_gemini(
1401
+ pdf_chunks = self._prepare_pdf_chunks_for_llm(
1339
1402
  num_workers=8,
1340
1403
  max_tokens=self.max_tokens,
1341
1404
  split_on_page=self.split_on_page,
1342
1405
  )
1343
1406
 
1344
1407
  # We asynchronously process each chunk, sending it
1345
- # to Gemini and retrieving the Markdown output. It handles rate
1408
+ # to the LLM and retrieving the Markdown output. It handles rate
1346
1409
  # limiting and retries.
1347
- markdown_results = asyncio.run(
1348
- self.process_chunks(pdf_chunks, gemini_api_key)
1349
- )
1410
+ markdown_results = asyncio.run(self.process_chunks(pdf_chunks))
1350
1411
 
1351
1412
  # This file serves as an intermediate storage location for the
1352
1413
  # complete Markdown output.
@@ -36,10 +36,10 @@ class BaseParsingConfig(BaseSettings):
36
36
  extra = "ignore" # Ignore unknown settings
37
37
 
38
38
 
39
- class GeminiConfig(BaseSettings):
40
- """Configuration for Gemini-based parsing."""
39
+ class LLMPdfParserConfig(BaseSettings):
40
+ """Configuration for LLM-based parsing."""
41
41
 
42
- model_name: str = "gemini-2.0-flash" # Default model
42
+ model_name: str = "gemini/gemini-2.0-flash" # Default model
43
43
  max_tokens: Optional[int] = None
44
44
  split_on_page: Optional[bool] = True
45
45
  requests_per_minute: Optional[int] = 5
@@ -60,10 +60,10 @@ class PdfParsingConfig(BaseParsingConfig):
60
60
  "unstructured",
61
61
  "pdf2image",
62
62
  "markitdown",
63
- "gemini",
63
+ "llm-pdf-parser",
64
64
  "marker",
65
65
  ] = "pymupdf4llm"
66
- gemini_config: Optional[GeminiConfig] = None
66
+ llm_parser_config: Optional[LLMPdfParserConfig] = None
67
67
  marker_config: Optional[MarkerConfig] = None
68
68
 
69
69
  @root_validator(pre=True)
@@ -71,10 +71,10 @@ class PdfParsingConfig(BaseParsingConfig):
71
71
  """Ensure correct config is set based on library selection."""
72
72
  library = values.get("library")
73
73
 
74
- if library == "gemini":
75
- values.setdefault("gemini_config", GeminiConfig())
74
+ if library == "llm-pdf-parser":
75
+ values.setdefault("llm_parser_config", LLMPdfParserConfig())
76
76
  else:
77
- values["gemini_config"] = None
77
+ values["llm_parser_config"] = None
78
78
 
79
79
  if library == "marker":
80
80
  values.setdefault("marker_config", MarkerConfig())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langroid
3
- Version: 0.50.12
3
+ Version: 0.51.1
4
4
  Summary: Harness LLMs with Multi-Agent Programming
5
5
  Author-email: Prasad Chalasani <pchalasani@gmail.com>
6
6
  License: MIT
@@ -82,11 +82,11 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
82
82
  langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
83
83
  langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
84
84
  langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
85
- langroid/parsing/document_parser.py,sha256=XihXwhp--Nxhb8xoh6wth_isJCGUROKiVr3rPDOJodU,54359
85
+ langroid/parsing/document_parser.py,sha256=7_pHu-_yQOETtDATv5VRdVSvac9kJRuZiwQ6EbJqJ_o,57403
86
86
  langroid/parsing/md_parser.py,sha256=JUgsUpCaeAuBndmtDaJR9HMZaje1gmtXtaLXJHst3i8,21340
87
87
  langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
88
88
  langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
89
- langroid/parsing/parser.py,sha256=YPE6X6efimz2bYbardrhHHKw7V1LZvq-vF0q5p5XzOk,15387
89
+ langroid/parsing/parser.py,sha256=Tbe1mQ7wp6GVx2xMWv1raIkpepTN0qNrqOxakWY6Zkc,15437
90
90
  langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
91
91
  langroid/parsing/repo_loader.py,sha256=NpysuyzRHvgL3F4BB_wGo5sCUnZ3FOlVCJmZ7CaUdbs,30202
92
92
  langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -129,7 +129,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
129
129
  langroid/vector_store/postgres.py,sha256=wHPtIi2qM4fhO4pMQr95pz1ZCe7dTb2hxl4VYspGZoA,16104
130
130
  langroid/vector_store/qdrantdb.py,sha256=O6dSBoDZ0jzfeVBd7LLvsXu083xs2fxXtPa9gGX3JX4,18443
131
131
  langroid/vector_store/weaviatedb.py,sha256=Yn8pg139gOy3zkaPfoTbMXEEBCiLiYa1MU5d_3UA1K4,11847
132
- langroid-0.50.12.dist-info/METADATA,sha256=b1vQBIkydfimg9r80ud7w07d7540XJAdhpegeqAPPTw,63642
133
- langroid-0.50.12.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
134
- langroid-0.50.12.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
135
- langroid-0.50.12.dist-info/RECORD,,
132
+ langroid-0.51.1.dist-info/METADATA,sha256=9E0M5JzLk_fuMOLH918i7fIBwWKMm1O6J3VY8DoG3NM,63641
133
+ langroid-0.51.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
134
+ langroid-0.51.1.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
135
+ langroid-0.51.1.dist-info/RECORD,,