langroid 0.42.9__py3-none-any.whl → 0.43.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
langroid/agent/base.py CHANGED
@@ -1148,7 +1148,9 @@ class Agent(ABC):
            and msg.function_call is None
        ):

-            tools = self.get_formatted_tool_messages(msg.content)
+            tools = self.get_formatted_tool_messages(
+                msg.content, from_llm=msg.metadata.sender == Entity.LLM
+            )
            msg.all_tool_messages = tools
            # filter for actually handle-able tools, and recipient is this agent
            my_tools = [t for t in tools if self._tool_recipient_match(t)]
@@ -1177,7 +1179,9 @@ class Agent(ABC):
        else:
            return my_tools

-    def get_formatted_tool_messages(self, input_str: str) -> List[ToolMessage]:
+    def get_formatted_tool_messages(
+        self, input_str: str, from_llm: bool = True
+    ) -> List[ToolMessage]:
        """
        Returns ToolMessage objects (tools) corresponding to
        tool-formatted substrings, if any.
@@ -1190,6 +1194,8 @@ class Agent(ABC):

        Args:
            input_str (str): input string, typically a message sent by an LLM
+            from_llm (bool): whether the input was generated by the LLM. If so,
+                we track malformed tool calls.

        Returns:
            List[ToolMessage]: list of ToolMessage objects
@@ -1203,7 +1209,7 @@ class Agent(ABC):
        if not is_json:
            return []

-        results = [self._get_one_tool_message(j, is_json) for j in substrings]
+        results = [self._get_one_tool_message(j, is_json, from_llm) for j in substrings]
        valid_results = [r for r in results if r is not None]
        # If any tool is correctly formed we do not set the flag
        if len(valid_results) > 0:
@@ -1219,6 +1225,7 @@ class Agent(ABC):
            return None
        tool_name = msg.function_call.name
        tool_msg = msg.function_call.arguments or {}
+        self.tool_error = False
        if tool_name not in self.llm_tools_handled:
            logger.warning(
                f"""
@@ -1230,10 +1237,12 @@ class Agent(ABC):
                or you need to enable this agent to handle this fn-call.
                """
            )
-            if tool_name not in self.all_llm_tools_known:
+            if (
+                tool_name not in self.all_llm_tools_known
+                and msg.metadata.sender == Entity.LLM
+            ):
                self.tool_error = True
            return None
-        self.tool_error = False
        tool_class = self.llm_tools_map[tool_name]
        tool_msg.update(dict(request=tool_name))
        tool = tool_class.parse_obj(tool_msg)
@@ -1272,8 +1281,9 @@ class Agent(ABC):
                tool = tool_class.parse_obj(tool_msg)
                tool.id = tc.id or ""
                tools.append(tool)
-        # When no tool is valid, set the recovery flag
-        self.tool_error = all_errors
+        # When no tool is valid and the message was produced
+        # by the LLM, set the recovery flag
+        self.tool_error = all_errors and msg.metadata.sender == Entity.LLM
        return tools

    def tool_validation_error(self, ve: ValidationError) -> str:
@@ -1508,7 +1518,7 @@ class Agent(ABC):
        return None

    def _get_one_tool_message(
-        self, tool_candidate_str: str, is_json: bool = True
+        self, tool_candidate_str: str, is_json: bool = True, from_llm: bool = True
    ) -> Optional[ToolMessage]:
        """
        Parse the tool_candidate_str into ANY ToolMessage KNOWN to agent --
@@ -1545,7 +1555,7 @@ class Agent(ABC):
        # }

        if not isinstance(maybe_tool_dict, dict):
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        properties = maybe_tool_dict.get("properties")
@@ -1593,23 +1603,23 @@ class Agent(ABC):
            if len(candidate_tools) == 1:
                return candidate_tools[0]
            else:
-                self.tool_error = True
+                self.tool_error = from_llm
                return None

        if not isinstance(request, str) or request not in self.all_llm_tools_known:
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        message_class = self.llm_tools_map.get(request)
        if message_class is None:
            logger.warning(f"No message class found for request '{request}'")
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        try:
            message = message_class.parse_obj(maybe_tool_dict)
        except ValidationError as ve:
-            self.tool_error = True
+            self.tool_error = from_llm
            raise ve
        return message

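Taken together, the base.py changes gate the strict-recovery flag on the message's sender: malformed tool text only sets `tool_error` when the LLM itself produced it. A minimal self-contained sketch of that behavior (illustrative only; `Msg` and `MiniAgent` are hypothetical stand-ins, not langroid classes):

```python
from dataclasses import dataclass, field

@dataclass
class Msg:
    content: str
    sender: str  # "LLM" or "USER"; stands in for langroid's Entity enum

@dataclass
class MiniAgent:
    known_tools: set = field(default_factory=lambda: {"search"})
    tool_error: bool = False

    def get_tools(self, msg: Msg) -> list:
        from_llm = msg.sender == "LLM"
        name = msg.content.strip()          # pretend content is a tool name
        if name not in self.known_tools:
            self.tool_error = from_llm      # gated by sender, as in 0.43.0
            return []
        return [name]

agent = MiniAgent()
agent.get_tools(Msg("serach", sender="USER"))   # malformed tool from a user
assert agent.tool_error is False                # no recovery triggered
agent.get_tools(Msg("serach", sender="LLM"))    # same mistake from the LLM
assert agent.tool_error is True                 # recovery triggered
```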
langroid/agent/chat_agent.py CHANGED
@@ -1096,7 +1096,10 @@ class ChatAgent(Agent):
            else:
                # We will trigger the strict recovery mechanism to force
                # the LLM to correct its output, allowing us to parse
-                self.tool_error = True
+                if isinstance(msg, ChatDocument):
+                    self.tool_error = msg.metadata.sender == Entity.LLM
+                else:
+                    self.tool_error = True

                raise ve

@@ -1265,6 +1268,7 @@ class ChatAgent(Agent):
            and self._json_schema_available()
            and self.config.strict_recovery
        ):
+            self.tool_error = False
            AnyTool = self._get_any_tool_message()
            if AnyTool is None:
                return None
@@ -1352,6 +1356,7 @@ class ChatAgent(Agent):
            and self._json_schema_available()
            and self.config.strict_recovery
        ):
+            self.tool_error = False
            AnyTool = self._get_any_tool_message()
            self.set_output_format(
                AnyTool,
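The two `self.tool_error = False` additions clear the flag at the moment strict recovery begins, so a completed recovery round cannot leave a stale error behind that would re-trigger recovery on the next turn. A rough sketch of the pattern (hypothetical names, not the langroid API):

```python
class Recoverer:
    """Hypothetical stand-in for the ChatAgent strict-recovery flow."""

    def __init__(self) -> None:
        self.tool_error = False

    def strict_recovery(self, retry_llm_call):
        # Clear the stale flag on entry, as chat_agent.py now does, so
        # only a *new* malformed output can set it again.
        self.tool_error = False
        return retry_llm_call()

r = Recoverer()
r.tool_error = True                               # malformed output was seen
out = r.strict_recovery(lambda: '{"request": "search"}')
assert out and r.tool_error is False              # flag no longer stuck on True
```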
langroid/agent/task.py CHANGED
@@ -1572,7 +1572,10 @@ class Task:
            response_fn = self._entity_responder_async_map[cast(Entity, e)]
            result = await response_fn(self.pending_message)
            # update result.tool_messages if any
-            if isinstance(result, ChatDocument):
+            if (
+                isinstance(result, ChatDocument)
+                and result.metadata.sender == Entity.LLM
+            ):
                self.agent.try_get_tool_messages(result)

        result_chat_doc = self.agent.to_ChatDocument(
langroid/language_models/openai_gpt.py CHANGED
@@ -85,9 +85,6 @@ GLHF_BASE_URL = "https://glhf.chat/api/openai/v1"
OLLAMA_API_KEY = "ollama"
DUMMY_API_KEY = "xxx"

-VLLM_API_KEY = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
-LLAMACPP_API_KEY = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
-

openai_chat_model_pref_list = [
    OpenAIChatModel.GPT4o,
@@ -421,6 +418,9 @@ class OpenAIGPT(LanguageModel):
        self.supports_json_schema: bool = self.config.supports_json_schema or False
        self.supports_strict_tools: bool = self.config.supports_strict_tools or False

+        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
+        self.api_key = config.api_key
+
        # if model name starts with "litellm",
        # set the actual model name by stripping the "litellm/" prefix
        # and set the litellm flag to True
@@ -449,12 +449,14 @@ class OpenAIGPT(LanguageModel):

            # use api_base from config if set, else fall back on OLLAMA_BASE_URL
            self.api_base = self.config.api_base or OLLAMA_BASE_URL
-            self.api_key = OLLAMA_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = OLLAMA_API_KEY
            self.config.chat_model = self.config.chat_model.replace("ollama/", "")
        elif self.config.chat_model.startswith("vllm/"):
            self.supports_json_schema = True
            self.config.chat_model = self.config.chat_model.replace("vllm/", "")
-            self.api_key = VLLM_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
            self.api_base = self.config.api_base or "http://localhost:8000/v1"
            if not self.api_base.startswith("http"):
                self.api_base = "http://" + self.api_base
@@ -465,7 +467,8 @@ class OpenAIGPT(LanguageModel):
            self.api_base = self.config.chat_model.split("/", 1)[1]
            if not self.api_base.startswith("http"):
                self.api_base = "http://" + self.api_base
-            self.api_key = LLAMACPP_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
        else:
            self.api_base = self.config.api_base
            # If api_base is unset we use OpenAI's endpoint, which supports
@@ -487,11 +490,6 @@ class OpenAIGPT(LanguageModel):
        if self.config.use_completion_for_chat:
            self.config.use_chat_for_completion = False

-        self.api_key = config.api_key
-        if self.is_openai_completion_model() or self.is_openai_chat_model():
-            if self.api_key == DUMMY_API_KEY:
-                self.api_key = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
-
        self.is_groq = self.config.chat_model.startswith("groq/")
        self.is_cerebras = self.config.chat_model.startswith("cerebras/")
        self.is_gemini = self.is_gemini_model()
@@ -502,7 +500,7 @@ class OpenAIGPT(LanguageModel):
        if self.is_groq:
            # use groq-specific client
            self.config.chat_model = self.config.chat_model.replace("groq/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GROQ_API_KEY", DUMMY_API_KEY)
            self.client = Groq(
                api_key=self.api_key,
@@ -513,7 +511,7 @@ class OpenAIGPT(LanguageModel):
        elif self.is_cerebras:
            # use cerebras-specific client
            self.config.chat_model = self.config.chat_model.replace("cerebras/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("CEREBRAS_API_KEY", DUMMY_API_KEY)
            self.client = Cerebras(
                api_key=self.api_key,
@@ -526,25 +524,25 @@ class OpenAIGPT(LanguageModel):
        # in these cases, there's no specific client: OpenAI python client suffices
        if self.is_gemini:
            self.config.chat_model = self.config.chat_model.replace("gemini/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GEMINI_API_KEY", DUMMY_API_KEY)
            self.api_base = GEMINI_BASE_URL
        elif self.is_glhf:
            self.config.chat_model = self.config.chat_model.replace("glhf/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GLHF_API_KEY", DUMMY_API_KEY)
            self.api_base = GLHF_BASE_URL
        elif self.is_openrouter:
            self.config.chat_model = self.config.chat_model.replace(
                "openrouter/", ""
            )
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("OPENROUTER_API_KEY", DUMMY_API_KEY)
            self.api_base = OPENROUTER_BASE_URL
        elif self.is_deepseek:
            self.config.chat_model = self.config.chat_model.replace("deepseek/", "")
            self.api_base = DEEPSEEK_BASE_URL
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("DEEPSEEK_API_KEY", DUMMY_API_KEY)

        self.client = OpenAI(
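The net effect of these openai_gpt.py changes is a new precedence order for API keys: an `api_key` set explicitly in the config is kept even for provider-prefixed models, and the provider-specific environment variable is consulted only when the configured key still equals the ambient `OPENAI_API_KEY` default. A condensed sketch of that order (not the actual class; `resolve_api_key` is a hypothetical helper):

```python
import os

DUMMY_API_KEY = "xxx"  # placeholder langroid uses when no key is found

def resolve_api_key(config_key: str, provider_env_var: str) -> str:
    """Sketch of the 0.43.0 resolution order for provider-prefixed models."""
    openai_key = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
    api_key = config_key  # 1. an explicitly configured key wins ...
    if api_key == openai_key:
        # 2. ... otherwise fall back to the provider env var
        #    (e.g. GROQ_API_KEY, GEMINI_API_KEY) ...
        # 3. ... and finally to the dummy placeholder.
        api_key = os.getenv(provider_env_var, DUMMY_API_KEY)
    return api_key

# e.g. for a "groq/..." model:
key = resolve_api_key(os.getenv("OPENAI_API_KEY", DUMMY_API_KEY), "GROQ_API_KEY")
```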
langroid/parsing/document_parser.py CHANGED
@@ -9,7 +9,9 @@ from enum import Enum
from io import BytesIO
from itertools import accumulate
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union
+
+from dotenv import load_dotenv

from langroid.exceptions import LangroidImportError
from langroid.utils.object_registry import ObjectRegistry
@@ -163,6 +165,8 @@ class DocumentParser(Parser):
            return UnstructuredPDFParser(source, config)
        elif config.pdf.library == "pdf2image":
            return ImagePdfParser(source, config)
+        elif config.pdf.library == "gemini":
+            return GeminiPdfParser(source, config)
        else:
            raise ValueError(
                f"Unsupported PDF library specified: {config.pdf.library}"
@@ -415,13 +419,15 @@ class DocumentParser(Parser):
        # that it needs to be combined with the next chunk.
        while len(split) > self.config.chunk_size:
            # pretty formatting of pages (e.g. 1-3, 4, 5-7)
-            pg = "-".join([pages[0], pages[-1]])
+            p_0 = int(pages[0])
+            p_n = int(pages[-1])
+            page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
            text = self.tokenizer.decode(split[: self.config.chunk_size])
            docs.append(
                Document(
                    content=text,
                    metadata=DocMetaData(
-                        source=f"{self.source} pages {pg}",
+                        source=f"{self.source} {page_str}",
                        is_chunk=True,
                        id=common_id,
                    ),
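The replacement above also fixes the degenerate single-page label. A quick worked example of the new formatting logic, taken directly from the diff:

```python
# Single-page chunk: previously rendered as "pages 4-4".
pages = ["4", "4"]
p_0, p_n = int(pages[0]), int(pages[-1])
page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
assert page_str == "page 4"

# Multi-page chunk: unchanged behavior.
pages = ["1", "2", "3"]
p_0, p_n = int(pages[0]), int(pages[-1])
page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
assert page_str == "pages 1-3"
```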
@@ -952,3 +958,409 @@ class MarkitdownPPTXParser(DocumentParser):
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
+
+
+class GeminiPdfParser(DocumentParser):
+    """
+    This class converts PDFs to Markdown using Gemini multimodal LLMs.
+
+    It extracts pages, converts them with the LLM (replacing images with
+    detailed descriptions), and outputs Markdown page by page. The
+    conversion follows `GEMINI_SYSTEM_INSTRUCTION`. It employs
+    multiprocessing for speed, async requests with rate limiting, and
+    handles errors.
+
+    It supports page-by-page splitting or chunking multiple pages into
+    one, respecting page boundaries and a `max_token_limit`.
+    """
+
+    DEFAULT_MAX_TOKENS = 7000
+    OUTPUT_DIR = Path(".gemini_pdfparser")  # Fixed output directory
+
+    GEMINI_SYSTEM_INSTRUCTION = """
+    ### **Convert PDF to Markdown**
+    1. **Text:**
+        * Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
+        * **Remove running heads (page numbers, headers/footers).**
+        * Keep section and chapter titles; discard repeated page headers.
+    2. **Images:** Replace with **detailed, creative descriptions**
+       optimized for clarity and understanding.
+    3. **Tables:** Convert to Markdown tables with proper structure.
+    4. **Math:** Use LaTeX (`$...$` inline, `$$...$$` block).
+    5. **Code:** Wrap in fenced blocks without specifying a language:
+
+        ```
+        code
+        ```
+    6. **Clean Output:**
+        * No system messages, metadata, or artifacts or ```markdown``` identifier.
+        * Do **not** include introductory or explanatory messages
+          like "Here is your output."
+        * Ensure formatting is **consistent and structured**
+          for feeding into a markdown parser.
+    """.strip()
+
+    def __init__(self, source: Union[str, bytes], config: ParsingConfig):
+        super().__init__(source, config)
+        if not config.pdf.gemini_config:
+            raise ValueError(
+                "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
+            )
+        self.model_name = config.pdf.gemini_config.model_name
+
+        # Ensure output directory exists
+        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+        prefix = (
+            Path(source).stem + "_"
+            if isinstance(source, str) and Path(source).exists()
+            else "output_"
+        )
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".md",
+            prefix=prefix,
+            dir=str(self.OUTPUT_DIR),
+            delete=False,
+        )
+        temp_file.close()
+        self.output_filename = Path(temp_file.name)
+
+        self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS
+
+        """
+        If True, each PDF page is processed as a separate chunk,
+        resulting in one LLM request per page. If False, pages are
+        grouped into chunks based on `max_token_limit` before being sent
+        to the LLM.
+        """
+        self.split_on_page = config.pdf.gemini_config.split_on_page or False
+
+        # Rate limiting parameters
+        import asyncio
+
+        self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5
+
+        """
+        A semaphore to control the number of concurrent requests to the LLM,
+        preventing rate limit errors. A semaphore slot is acquired before
+        making an LLM request and released after the request is complete.
+        """
+        self.semaphore = asyncio.Semaphore(self.requests_per_minute)
+        self.retry_delay = 5  # seconds, for exponential backoff
+        self.max_retries = 3
+
+    def _extract_page(self, page_num: int) -> Dict[str, Any]:
+        """
+        Extracts a single page and estimates token count.
+        Opens the PDF from self.doc_bytes (a BytesIO object).
+        """
+        import fitz
+
+        try:
+            # Always open the document from in-memory bytes.
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            new_pdf = fitz.open()
+            new_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
+            pdf_bytes = new_pdf.write()
+            text = doc[page_num].get_text("text")
+            token_count = len(text) // 4 if text else len(pdf_bytes) // 4
+
+            return {
+                "page_numbers": page_num + 1,
+                "pdf_bytes": pdf_bytes,
+                "token_count": token_count,
+            }
+        except Exception as e:
+            raise ValueError(f"Error processing PDF document: {e}") from e
+
+    def _extract_pdf_pages_parallel(
+        self, num_workers: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
+        """Parallel PDF page extraction using self.doc_bytes."""
+        from multiprocessing import Pool, cpu_count
+
+        import fitz
+        from tqdm import tqdm
+
+        try:
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            total_pages = len(doc)
+        except Exception as e:
+            raise ValueError(f"Error opening PDF document: {e}") from e
+
+        num_workers = num_workers or cpu_count()
+        with Pool(num_workers) as pool:
+            with tqdm(total=total_pages, desc="Extracting pages", unit="page") as pbar:
+                results = []
+                for result in pool.imap(self._extract_page, range(total_pages)):
+                    results.append(result)
+                    pbar.update(1)
+
+        return results
+
+    def _group_pages_by_token_limit(
+        self, pages: List[Dict[str, Any]], max_tokens: int = DEFAULT_MAX_TOKENS
+    ) -> List[List[Dict[str, Any]]]:
+        """Groups pages into chunks where each chunk is approximately `max_tokens`."""
+        chunks: List[List[Dict[str, Any]]] = []
+        current_chunk: List[Dict[str, Any]] = []
+        current_tokens = 0
+
+        for page in pages:
+            if current_tokens + page["token_count"] > max_tokens and current_chunk:
+                chunks.append(current_chunk)
+                current_chunk = []
+                current_tokens = 0
+
+            current_chunk.append(page)
+            current_tokens += page["token_count"]
+
+        if current_chunk:  # Add remaining pages
+            chunks.append(current_chunk)
+
+        return chunks
+
+    def _merge_pages_into_pdf_with_metadata(
+        self, page_group: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Merges grouped pages into a single binary chunk so that
+        it does not exceed max token limit
+        """
+        import fitz
+
+        merged_pdf = fitz.open()
+        page_numbers = []
+
+        for page in page_group:
+            temp_pdf = fitz.open("pdf", page["pdf_bytes"])
+            merged_pdf.insert_pdf(temp_pdf)
+            page_numbers.append(page["page_numbers"])
+
+        return {
+            "pdf_bytes": merged_pdf.write(),  # Binary PDF data
+            "page_numbers": page_numbers,  # List of page numbers in this chunk
+        }
+
+    def _prepare_pdf_chunks_for_gemini(
+        self,
+        num_workers: Optional[int] = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        split_on_page: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Extracts, groups, and merges PDF pages into chunks with embedded page markers.
+        """
+        from multiprocessing import Pool
+
+        pages = self._extract_pdf_pages_parallel(num_workers)
+
+        if split_on_page:
+            # Each page becomes its own chunk
+            return pages
+        else:
+            # Group pages based on token limit
+            chunks = self._group_pages_by_token_limit(pages, max_tokens)
+            with Pool(num_workers) as pool:
+                pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
+            return pdf_chunks
+
+    async def _send_chunk_to_gemini(
+        self, chunk: Dict[str, Any], gemini_api_key: str
+    ) -> str:
+        """
+        Sends a PDF chunk to the Gemini API and returns the response text.
+        Uses retries with exponential backoff to handle transient failures.
+        """
+        import asyncio
+        import logging
+
+        from google import genai
+        from google.genai import types
+
+        async with self.semaphore:  # Limit concurrent API requests
+            for attempt in range(self.max_retries):
+                try:
+                    client = genai.Client(api_key=gemini_api_key)
+
+                    # Send the request with PDF content and system instructions
+                    response = await client.aio.models.generate_content(
+                        model=self.model_name,
+                        contents=[
+                            types.Part.from_bytes(
+                                data=chunk["pdf_bytes"], mime_type="application/pdf"
+                            ),
+                            self.GEMINI_SYSTEM_INSTRUCTION,
+                        ],
+                    )
+
+                    # Return extracted text if available
+                    return str(response.text) if response.text else ""
+
+                except Exception as e:
+                    # Log error with page numbers for debugging
+                    logging.error(
+                        "Attempt %d failed for pages %s: %s",
+                        attempt + 1,
+                        chunk.get("page_numbers", "Unknown"),
+                        e,
+                    )
+
+                    if attempt < self.max_retries - 1:
+                        # Apply exponential backoff before retrying
+                        delay = self.retry_delay * (2**attempt)
+                        logging.info("Retrying in %s sec...", delay)
+                        await asyncio.sleep(delay)
+                    else:
+                        # Log failure after max retries
+                        logging.error(
+                            "Max retries reached for pages %s",
+                            chunk.get("page_numbers", "Unknown"),
+                        )
+                        break
+
+        return ""  # Return empty string if all retries fail
+
+    async def process_chunks(
+        self, chunks: List[Dict[str, Any]], api_key: str
+    ) -> List[str]:
+        """
+        Processes PDF chunks by sending them to the Gemini API and
+        collecting the results.
+
+        Args:
+            chunks: A list of dictionaries, where each dictionary represents
+                a PDF chunk and contains the PDF data and page numbers.
+            api_key: The Gemini API key.
+        """
+        # To show nice progress bar
+        from tqdm.asyncio import tqdm_asyncio
+
+        # Create a list of asynchronous tasks to send each chunk to Gemini.
+        # Chunk in this case might be single page or group of pages returned
+        # by prepare_pdf_chunks function
+        tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]
+
+        # Gather the results from all tasks, allowing exceptions to be returned.
+        # tqdm_asyncio is wrapper around asyncio.gather
+        gathered_results = await tqdm_asyncio.gather(
+            *tasks, desc="Processing chunks(pages)", unit="chunk"
+        )
+        results = []
+        for i, result in enumerate(gathered_results):
+            chunk = chunks[i]  # Get the corresponding chunk.
+
+            if isinstance(result, Exception):
+                # Handle exceptions that occurred during chunk processing.
+                logging.error(
+                    "Failed to process chunk %s: %s",
+                    chunk.get("page_numbers", "Unknown"),
+                    result,
+                )
+                results.append(
+                    "<!----Error: Could not process chunk %s---->"
+                    % chunk.get("page_numbers", "Unknown")
+                )
+            else:
+                # Process successful results and append page/chunk markers.
+                markdown = str(result)
+                if self.split_on_page:
+                    results.append(
+                        markdown + f"<!----Page-{chunk['page_numbers']}---->"
+                    )
+                else:
+                    results.append(
+                        markdown + f"<!----Chunk-{chunk['page_numbers']}---->"
+                    )
+
+        return results  # Return the list of results.
+
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Iterates over the document pages, extracting content using the
+        Gemini API, saves them to a markdown file, and yields page numbers
+        along with their corresponding content.
+
+        Yields:
+            A generator of tuples, where each tuple contains the page number
+            (int) and the page content (Any).
+        """
+        import asyncio
+        import os
+
+        # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
+        load_dotenv()
+        gemini_api_key = os.getenv("GEMINI_API_KEY")
+        if not gemini_api_key:
+            raise ValueError("GEMINI_API_KEY not found in environment variables.")
+
+        try:
+            # This involves extracting pages, grouping them according to the
+            # `max_tokens` limit (if `split_on_page` is False), and
+            # merging pages into larger PDF chunks. The result
+            # is a list of dictionaries, where each dictionary contains the
+            # PDF bytes and the associated page numbers or single page if
+            # `split_on_page` is true
+
+            pdf_chunks = self._prepare_pdf_chunks_for_gemini(
+                num_workers=8,
+                max_tokens=self.max_tokens,
+                split_on_page=self.split_on_page,
+            )
+
+            # We asynchronously process each chunk, sending it
+            # to Gemini and retrieving the Markdown output. It handles rate
+            # limiting and retries.
+            markdown_results = asyncio.run(
+                self.process_chunks(pdf_chunks, gemini_api_key)
+            )
+
+            # This file serves as an intermediate storage location for the
+            # complete Markdown output.
+            with open(self.output_filename, "w", encoding="utf-8") as outfile:
+                outfile.write("\n\n".join(markdown_results))
+
+            # Read the full Markdown content from the temporary file.
+            with open(self.output_filename, "r", encoding="utf-8") as infile:
+                full_markdown = infile.read()
+
+            # The splitting is based on the `split_on_page` setting. If True,
+            # the Markdown is split using the "Page-" marker. Otherwise, it's
+            # split using the "Chunk-" marker.
+            if self.split_on_page:
+                pages = full_markdown.split("<!----Page-")
+            else:
+                pages = full_markdown.split("<!----Chunk-")
+
+            # Remove the first element if it's empty (due to the split).
+            if pages and pages[0] == "":
+                pages = pages[1:]
+
+            # Iterate over the pages or chunks and yield their content.
+            for i, page in enumerate(pages):
+                # Check for errors during processing.
+                if "<!----Error:" in page:
+                    page_content = page
+                    logging.warning(f"Page {i}: Error processing chunk.")
+                else:
+                    # Extract the actual page content by removing the marker.
+                    page_content = (
+                        page.split("---->", 1)[1]
+                        if len(page.split("---->", 1)) > 1
+                        else page
+                    )
+
+                # Yield the page number and content.
+                yield i, page_content
+
+        except Exception as e:
+            raise ValueError(f"Error processing document: {e}") from e
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get a Document object from a given markdown page.
+        """
+        return Document(
+            content=page,
+            metadata=DocMetaData(source=self.source),
+        )
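A plausible end-to-end use of the new parser, assuming the factory method and chunking helper on `DocumentParser` keep their existing names (`create`, `get_doc_chunks`) and that `GEMINI_API_KEY` is available in the environment or a `.env` file; treat this as a sketch rather than documented API:

```python
from langroid.parsing.parser import GeminiConfig, ParsingConfig, PdfParsingConfig
from langroid.parsing.document_parser import DocumentParser

config = ParsingConfig(
    pdf=PdfParsingConfig(
        library="gemini",  # routes to GeminiPdfParser in the factory above
        gemini_config=GeminiConfig(
            model_name="gemini-2.0-flash",
            split_on_page=True,       # one LLM request per page
            requests_per_minute=5,    # concurrency cap via the semaphore
        ),
    )
)
parser = DocumentParser.create("paper.pdf", config)
docs = parser.get_doc_chunks()  # Markdown-converted, chunked Documents
```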
langroid/parsing/parser.py CHANGED
@@ -1,13 +1,13 @@
import logging
import re
from enum import Enum
-from typing import Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional

import tiktoken

from langroid.mytypes import Document
from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings
+from langroid.pydantic_v1 import BaseSettings, root_validator
from langroid.utils.object_registry import ObjectRegistry

logger = logging.getLogger(__name__)
@@ -20,7 +20,26 @@ class Splitter(str, Enum):
    SIMPLE = "simple"


-class PdfParsingConfig(BaseSettings):
+class BaseParsingConfig(BaseSettings):
+    """Base class for document parsing configurations."""
+
+    library: str
+
+    class Config:
+        extra = "ignore"  # Ignore unknown settings
+
+
+class GeminiConfig(BaseSettings):
+    """Configuration for Gemini-based parsing."""
+
+    model_name: str = "gemini-2.0-flash"  # Default model
+    max_tokens: Optional[int] = None
+    split_on_page: Optional[bool] = True
+    requests_per_minute: Optional[int] = 5
+
+
+class PdfParsingConfig(BaseParsingConfig):
+
    library: Literal[
        "fitz",
        "pymupdf4llm",
@@ -29,7 +48,18 @@ class PdfParsingConfig(BaseSettings):
        "unstructured",
        "pdf2image",
        "markitdown",
+        "gemini",
    ] = "pymupdf4llm"
+    gemini_config: Optional[GeminiConfig] = None
+
+    @root_validator(pre=True)
+    def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure GeminiConfig is set only when library is 'gemini'."""
+        if values.get("library") == "gemini":
+            values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
+        else:
+            values["gemini_config"] = None
+        return values


class DocxParsingConfig(BaseSettings):
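Because the validator runs with `pre=True`, callers never need to construct `GeminiConfig` by hand for the common case; it is created or cleared from the `library` value alone. A quick check of the intended behavior (a sketch using the classes defined above):

```python
from langroid.parsing.parser import PdfParsingConfig

cfg = PdfParsingConfig(library="gemini")
assert cfg.gemini_config is not None                      # auto-created
assert cfg.gemini_config.model_name == "gemini-2.0-flash" # default model

cfg = PdfParsingConfig(library="fitz")
assert cfg.gemini_config is None        # cleared for non-gemini libraries
```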
langroid/utils/system.py CHANGED
@@ -14,7 +14,12 @@ from typing import Any, Literal

logger = logging.getLogger(__name__)

-DELETION_ALLOWED_PATHS = [".qdrant", ".chroma", ".lancedb", ".weaviate"]
+DELETION_ALLOWED_PATHS = [
+    ".qdrant",
+    ".chroma",
+    ".lancedb",
+    ".weaviate",
+]


def pydantic_major_version() -> int:
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: langroid
-Version: 0.42.9
+Version: 0.43.0
Summary: Harness LLMs with Multi-Agent Programming
Author-email: Prasad Chalasani <pchalasani@gmail.com>
License: MIT
@@ -86,6 +86,8 @@ Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
Provides-Extra: arango
Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
Requires-Dist: python-arango<9.0.0,>=8.1.2; extra == 'arango'
+Provides-Extra: asyncio
+Requires-Dist: asyncio>=3.4.3; extra == 'asyncio'
Provides-Extra: chainlit
Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/RECORD CHANGED
@@ -3,12 +3,12 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=k5kJGTpo2CcnVl2cEM0luBQ__a7C9put1aH5-wd3hQ8,78212
+langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
-langroid/agent/chat_agent.py,sha256=hUu13nYhhr6ph01Sln8y_WuOIpcd38icN6p22h6IiDY,84211
+langroid/agent/chat_agent.py,sha256=yuuEWVFLIN71XUpxdbhwZxEKAbOWG7zAV3ofYX4lCWg,84443
langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
langroid/agent/openai_assistant.py,sha256=JkAcs02bIrgPNVvUWVR06VCthc5-ulla2QMBzux_q6o,34340
-langroid/agent/task.py,sha256=Mi1QZgbRWvKZKEqkh5157LdUFjPKq7EF77yEeqU7fGE,90468
+langroid/agent/task.py,sha256=HB6N-Jn80HFqCf0ZYOC1v3Bn3oO7NLjShHQJJFwW0q4,90557
langroid/agent/tool_message.py,sha256=BhjP-_TfQ2tgxuY4Yo_JHLOwwt0mJ4BwjPnREvEY4vk,14744
langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a5uO9E,15054
langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,7 +72,7 @@ langroid/language_models/base.py,sha256=is4l3x858tdPHbrJU2jxJXe2j9PCGb9kk_c5nyfS
langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
langroid/language_models/model_info.py,sha256=_EidEMIgAMx0RuELAf5Ans0yiE1QllybZALw5o-1HJg,12265
-langroid/language_models/openai_gpt.py,sha256=yuxbOTZp2TuhTdy88NmdhCvqcCSs1ls5j9Cn81yQQ6M,77402
+langroid/language_models/openai_gpt.py,sha256=lOQcExZO5Tja35Xi4F2HcG8pE-2LEnGrHwLTXLOOagk,77367
langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=NKmN_HjwNdfUjTbXhpyK_Wjay3QYEA26ZnewmbO6moA,33632
+langroid/parsing/document_parser.py,sha256=tov34uYB_2ecq7-G7P7CWSOv5alcfwkrrwfsnCCVdIk,49714
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=moJKI5Cn_Pxd7xbNrY220dqQu-0FeEWUI7ogeq63Kec,12842
+langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -110,7 +110,7 @@ langroid/utils/logging.py,sha256=mwxHimq1wtVQ64PvDyfJJ7Upj-rjHLNHgx8EC2wClvo,402
langroid/utils/object_registry.py,sha256=iPz9GHzvmCeVoidB3JdAMEKcxJEqTdUr0otQEexDZ5s,2100
langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
langroid/utils/pydantic_utils.py,sha256=R7Ps8VP56-eSo-LYHWllFo-SJ2zDmdItuuYpUq2gGJ8,20854
-langroid/utils/system.py,sha256=cJqDgOf9mM82l1GyUeQQdEYAwepYXQwtpJU8Xrz0-MA,8453
+langroid/utils/system.py,sha256=q3QJtTSapIwNe8MMhGEM03wgxPLmZiD47_sF1pKx53I,8472
langroid/utils/types.py,sha256=-BvyIf_LmAJ5jR9NC7S4CSVNEr3XayAaxJ5o0TiIej0,2992
langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
-langroid-0.42.9.dist-info/METADATA,sha256=Uzl-1rTMbTbk-xKzqgJq4gNiV-hsWZyuFNHhsTs4UEQ,61699
-langroid-0.42.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.42.9.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.42.9.dist-info/RECORD,,
+langroid-0.43.0.dist-info/METADATA,sha256=3BipLtBKwh-Ob9F-PRnmRPJIYPGgAdm_xzP57fJEi6E,61773
+langroid-0.43.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.43.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.43.0.dist-info/RECORD,,