langroid 0.42.9__py3-none-any.whl → 0.43.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langroid/agent/base.py +23 -13
- langroid/agent/chat_agent.py +6 -1
- langroid/agent/task.py +4 -1
- langroid/language_models/openai_gpt.py +15 -17
- langroid/parsing/document_parser.py +415 -3
- langroid/parsing/parser.py +33 -3
- langroid/utils/system.py +6 -1
- {langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/METADATA +3 -1
- {langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/RECORD +11 -11
- {langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/WHEEL +0 -0
- {langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/licenses/LICENSE +0 -0
langroid/agent/base.py
CHANGED

```diff
@@ -1148,7 +1148,9 @@ class Agent(ABC):
             and msg.function_call is None
         ):
 
-            tools = self.get_formatted_tool_messages(msg.content)
+            tools = self.get_formatted_tool_messages(
+                msg.content, from_llm=msg.metadata.sender == Entity.LLM
+            )
             msg.all_tool_messages = tools
             # filter for actually handle-able tools, and recipient is this agent
             my_tools = [t for t in tools if self._tool_recipient_match(t)]
@@ -1177,7 +1179,9 @@ class Agent(ABC):
         else:
             return my_tools
 
-    def get_formatted_tool_messages(self, input_str: str) -> List[ToolMessage]:
+    def get_formatted_tool_messages(
+        self, input_str: str, from_llm: bool = True
+    ) -> List[ToolMessage]:
         """
         Returns ToolMessage objects (tools) corresponding to
         tool-formatted substrings, if any.
@@ -1190,6 +1194,8 @@ class Agent(ABC):
 
         Args:
             input_str (str): input string, typically a message sent by an LLM
+            from_llm (bool): whether the input was generated by the LLM. If so,
+                we track malformed tool calls.
 
         Returns:
             List[ToolMessage]: list of ToolMessage objects
@@ -1203,7 +1209,7 @@ class Agent(ABC):
         if not is_json:
             return []
 
-        results = [self._get_one_tool_message(j, is_json) for j in substrings]
+        results = [self._get_one_tool_message(j, is_json, from_llm) for j in substrings]
         valid_results = [r for r in results if r is not None]
         # If any tool is correctly formed we do not set the flag
         if len(valid_results) > 0:
@@ -1219,6 +1225,7 @@ class Agent(ABC):
             return None
         tool_name = msg.function_call.name
         tool_msg = msg.function_call.arguments or {}
+        self.tool_error = False
         if tool_name not in self.llm_tools_handled:
             logger.warning(
                 f"""
@@ -1230,10 +1237,12 @@ class Agent(ABC):
                 or you need to enable this agent to handle this fn-call.
                 """
             )
-            if tool_name not in self.all_llm_tools_known:
+            if (
+                tool_name not in self.all_llm_tools_known
+                and msg.metadata.sender == Entity.LLM
+            ):
                 self.tool_error = True
             return None
-        self.tool_error = False
         tool_class = self.llm_tools_map[tool_name]
         tool_msg.update(dict(request=tool_name))
         tool = tool_class.parse_obj(tool_msg)
@@ -1272,8 +1281,9 @@ class Agent(ABC):
             tool = tool_class.parse_obj(tool_msg)
             tool.id = tc.id or ""
             tools.append(tool)
-        # When no tool is valid
-        self.tool_error = all_errors
+        # When no tool is valid and the message was produced
+        # by the LLM, set the recovery flag
+        self.tool_error = all_errors and msg.metadata.sender == Entity.LLM
         return tools
 
     def tool_validation_error(self, ve: ValidationError) -> str:
@@ -1508,7 +1518,7 @@ class Agent(ABC):
         return None
 
     def _get_one_tool_message(
-        self, tool_candidate_str: str, is_json: bool = True
+        self, tool_candidate_str: str, is_json: bool = True, from_llm: bool = True
     ) -> Optional[ToolMessage]:
         """
         Parse the tool_candidate_str into ANY ToolMessage KNOWN to agent --
@@ -1545,7 +1555,7 @@ class Agent(ABC):
         # }
 
         if not isinstance(maybe_tool_dict, dict):
-            self.tool_error = True
+            self.tool_error = from_llm
             return None
 
         properties = maybe_tool_dict.get("properties")
@@ -1593,23 +1603,23 @@ class Agent(ABC):
             if len(candidate_tools) == 1:
                 return candidate_tools[0]
             else:
-                self.tool_error = True
+                self.tool_error = from_llm
                 return None
 
         if not isinstance(request, str) or request not in self.all_llm_tools_known:
-            self.tool_error = True
+            self.tool_error = from_llm
            return None
 
        message_class = self.llm_tools_map.get(request)
        if message_class is None:
            logger.warning(f"No message class found for request '{request}'")
-            self.tool_error = True
+            self.tool_error = from_llm
            return None
 
        try:
            message = message_class.parse_obj(maybe_tool_dict)
        except ValidationError as ve:
-            self.tool_error = True
+            self.tool_error = from_llm
            raise ve
        return message
```
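The net effect of these hunks: `tool_error` is no longer raised for any malformed tool text, only for text whose sender is the LLM. A minimal sketch of that gating pattern, using a hypothetical stand-in class rather than langroid's actual `Agent` (only the `from_llm` parameter and the `Entity.LLM` comparison come from the diff above):

```python
from enum import Enum


class Entity(str, Enum):
    LLM = "LLM"
    USER = "User"


class ToolErrorTracker:
    """Hypothetical stand-in for Agent's tool_error bookkeeping."""

    def __init__(self) -> None:
        self.tool_error = False

    def parse_tool(self, text: str, from_llm: bool = True) -> None:
        # Mirrors `self.tool_error = from_llm` in the diff: a malformed
        # tool string raises the flag only when it came from the LLM.
        looks_like_json = text.lstrip().startswith("{")  # toy validation
        if not looks_like_json:
            self.tool_error = from_llm


tracker = ToolErrorTracker()
tracker.parse_tool("not a tool", from_llm=False)  # e.g. user-typed text
assert tracker.tool_error is False  # user input never triggers recovery

tracker.parse_tool("not a tool", from_llm=True)  # malformed LLM output
assert tracker.tool_error is True  # only LLM output triggers strict recovery
```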
langroid/agent/chat_agent.py
CHANGED

```diff
@@ -1096,7 +1096,10 @@ class ChatAgent(Agent):
         else:
             # We will trigger the strict recovery mechanism to force
             # the LLM to correct its output, allowing us to parse
-            self.tool_error = True
+            if isinstance(msg, ChatDocument):
+                self.tool_error = msg.metadata.sender == Entity.LLM
+            else:
+                self.tool_error = True
 
             raise ve
 
@@ -1265,6 +1268,7 @@ class ChatAgent(Agent):
             and self._json_schema_available()
             and self.config.strict_recovery
         ):
+            self.tool_error = False
             AnyTool = self._get_any_tool_message()
             if AnyTool is None:
                 return None
@@ -1352,6 +1356,7 @@ class ChatAgent(Agent):
             and self._json_schema_available()
             and self.config.strict_recovery
         ):
+            self.tool_error = False
             AnyTool = self._get_any_tool_message()
             self.set_output_format(
                 AnyTool,
```
langroid/agent/task.py
CHANGED

```diff
@@ -1572,7 +1572,10 @@ class Task:
         response_fn = self._entity_responder_async_map[cast(Entity, e)]
         result = await response_fn(self.pending_message)
         # update result.tool_messages if any
-        if isinstance(result, ChatDocument):
+        if (
+            isinstance(result, ChatDocument)
+            and result.metadata.sender == Entity.LLM
+        ):
             self.agent.try_get_tool_messages(result)
 
         result_chat_doc = self.agent.to_ChatDocument(
```
langroid/language_models/openai_gpt.py
CHANGED

```diff
@@ -85,9 +85,6 @@ GLHF_BASE_URL = "https://glhf.chat/api/openai/v1"
 OLLAMA_API_KEY = "ollama"
 DUMMY_API_KEY = "xxx"
 
-VLLM_API_KEY = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
-LLAMACPP_API_KEY = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
-
 
 openai_chat_model_pref_list = [
     OpenAIChatModel.GPT4o,
@@ -421,6 +418,9 @@ class OpenAIGPT(LanguageModel):
         self.supports_json_schema: bool = self.config.supports_json_schema or False
         self.supports_strict_tools: bool = self.config.supports_strict_tools or False
 
+        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
+        self.api_key = config.api_key
+
         # if model name starts with "litellm",
         # set the actual model name by stripping the "litellm/" prefix
         # and set the litellm flag to True
@@ -449,12 +449,14 @@ class OpenAIGPT(LanguageModel):
 
             # use api_base from config if set, else fall back on OLLAMA_BASE_URL
             self.api_base = self.config.api_base or OLLAMA_BASE_URL
-            self.api_key = OLLAMA_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = OLLAMA_API_KEY
             self.config.chat_model = self.config.chat_model.replace("ollama/", "")
         elif self.config.chat_model.startswith("vllm/"):
             self.supports_json_schema = True
             self.config.chat_model = self.config.chat_model.replace("vllm/", "")
-            self.api_key = VLLM_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
             self.api_base = self.config.api_base or "http://localhost:8000/v1"
             if not self.api_base.startswith("http"):
                 self.api_base = "http://" + self.api_base
@@ -465,7 +467,8 @@ class OpenAIGPT(LanguageModel):
             self.api_base = self.config.chat_model.split("/", 1)[1]
             if not self.api_base.startswith("http"):
                 self.api_base = "http://" + self.api_base
-            self.api_key = LLAMACPP_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
         else:
             self.api_base = self.config.api_base
             # If api_base is unset we use OpenAI's endpoint, which supports
@@ -487,11 +490,6 @@ class OpenAIGPT(LanguageModel):
         if self.config.use_completion_for_chat:
             self.config.use_chat_for_completion = False
 
-        self.api_key = config.api_key
-        if self.is_openai_completion_model() or self.is_openai_chat_model():
-            if self.api_key == DUMMY_API_KEY:
-                self.api_key = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
-
         self.is_groq = self.config.chat_model.startswith("groq/")
         self.is_cerebras = self.config.chat_model.startswith("cerebras/")
         self.is_gemini = self.is_gemini_model()
@@ -502,7 +500,7 @@ class OpenAIGPT(LanguageModel):
         if self.is_groq:
             # use groq-specific client
             self.config.chat_model = self.config.chat_model.replace("groq/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("GROQ_API_KEY", DUMMY_API_KEY)
             self.client = Groq(
                 api_key=self.api_key,
@@ -513,7 +511,7 @@ class OpenAIGPT(LanguageModel):
         elif self.is_cerebras:
             # use cerebras-specific client
             self.config.chat_model = self.config.chat_model.replace("cerebras/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("CEREBRAS_API_KEY", DUMMY_API_KEY)
             self.client = Cerebras(
                 api_key=self.api_key,
@@ -526,25 +524,25 @@ class OpenAIGPT(LanguageModel):
         # in these cases, there's no specific client: OpenAI python client suffices
         if self.is_gemini:
             self.config.chat_model = self.config.chat_model.replace("gemini/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("GEMINI_API_KEY", DUMMY_API_KEY)
             self.api_base = GEMINI_BASE_URL
         elif self.is_glhf:
             self.config.chat_model = self.config.chat_model.replace("glhf/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("GLHF_API_KEY", DUMMY_API_KEY)
             self.api_base = GLHF_BASE_URL
         elif self.is_openrouter:
             self.config.chat_model = self.config.chat_model.replace(
                 "openrouter/", ""
             )
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("OPENROUTER_API_KEY", DUMMY_API_KEY)
             self.api_base = OPENROUTER_BASE_URL
         elif self.is_deepseek:
             self.config.chat_model = self.config.chat_model.replace("deepseek/", "")
             self.api_base = DEEPSEEK_BASE_URL
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                 self.api_key = os.getenv("DEEPSEEK_API_KEY", DUMMY_API_KEY)
 
         self.client = OpenAI(
```
langroid/parsing/document_parser.py
CHANGED

````diff
@@ -9,7 +9,9 @@ from enum import Enum
 from io import BytesIO
 from itertools import accumulate
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union
+
+from dotenv import load_dotenv
 
 from langroid.exceptions import LangroidImportError
 from langroid.utils.object_registry import ObjectRegistry
@@ -163,6 +165,8 @@ class DocumentParser(Parser):
             return UnstructuredPDFParser(source, config)
         elif config.pdf.library == "pdf2image":
             return ImagePdfParser(source, config)
+        elif config.pdf.library == "gemini":
+            return GeminiPdfParser(source, config)
         else:
             raise ValueError(
                 f"Unsupported PDF library specified: {config.pdf.library}"
@@ -415,13 +419,15 @@ class DocumentParser(Parser):
         # that it needs to be combined with the next chunk.
         while len(split) > self.config.chunk_size:
             # pretty formatting of pages (e.g. 1-3, 4, 5-7)
-
+            p_0 = int(pages[0])
+            p_n = int(pages[-1])
+            page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
             text = self.tokenizer.decode(split[: self.config.chunk_size])
             docs.append(
                 Document(
                     content=text,
                     metadata=DocMetaData(
-                        source=f"{self.source} pages {pages[0]}-{pages[-1]}",
+                        source=f"{self.source} {page_str}",
                         is_chunk=True,
                         id=common_id,
                     ),
@@ -952,3 +958,409 @@ class MarkitdownPPTXParser(DocumentParser):
             content=self.fix_text(md_content),
             metadata=DocMetaData(source=self.source),
         )
+
+
+class GeminiPdfParser(DocumentParser):
+    """
+    This class converts PDFs to Markdown using Gemini multimodal LLMs.
+
+    It extracts pages, converts them with the LLM (replacing images with
+    detailed descriptions), and outputs Markdown page by page. The
+    conversion follows `GEMINI_SYSTEM_INSTRUCTION`. It employs
+    multiprocessing for speed, async requests with rate limiting, and
+    handles errors.
+
+    It supports page-by-page splitting or chunking multiple pages into
+    one, respecting page boundaries and a `max_token_limit`.
+    """
+
+    DEFAULT_MAX_TOKENS = 7000
+    OUTPUT_DIR = Path(".gemini_pdfparser")  # Fixed output directory
+
+    GEMINI_SYSTEM_INSTRUCTION = """
+    ### **Convert PDF to Markdown**
+    1. **Text:**
+        * Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
+        * **Remove running heads (page numbers, headers/footers).**
+        * Keep section and chapter titles; discard repeated page headers.
+    2. **Images:** Replace with **detailed, creative descriptions**
+       optimized for clarity and understanding.
+    3. **Tables:** Convert to Markdown tables with proper structure.
+    4. **Math:** Use LaTeX ($...$ inline, $$...$$ block).
+    5. **Code:** Wrap in fenced blocks without specifying a language:
+
+        ```
+        code
+        ```
+    6. **Clean Output:**
+        * No system messages, metadata, or artifacts or ```markdown``` identifier.
+        * Do **not** include introductory or explanatory messages
+          like "Here is your output."
+        * Ensure formatting is **consistent and structured**
+          for feeding into a markdown parser.
+    """.strip()
+
+    def __init__(self, source: Union[str, bytes], config: ParsingConfig):
+        super().__init__(source, config)
+        if not config.pdf.gemini_config:
+            raise ValueError(
+                "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
+            )
+        self.model_name = config.pdf.gemini_config.model_name
+
+        # Ensure output directory exists
+        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+        prefix = (
+            Path(source).stem + "_"
+            if isinstance(source, str) and Path(source).exists()
+            else "output_"
+        )
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".md",
+            prefix=prefix,
+            dir=str(self.OUTPUT_DIR),
+            delete=False,
+        )
+        temp_file.close()
+        self.output_filename = Path(temp_file.name)
+
+        self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS
+
+        """
+        If True, each PDF page is processed as a separate chunk,
+        resulting in one LLM request per page. If False, pages are
+        grouped into chunks based on `max_token_limit` before being sent
+        to the LLM.
+        """
+        self.split_on_page = config.pdf.gemini_config.split_on_page or False
+
+        # Rate limiting parameters
+        import asyncio
+
+        self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5
+
+        """
+        A semaphore to control the number of concurrent requests to the LLM,
+        preventing rate limit errors. A semaphore slot is acquired before
+        making an LLM request and released after the request is complete.
+        """
+        self.semaphore = asyncio.Semaphore(self.requests_per_minute)
+        self.retry_delay = 5  # seconds, for exponential backoff
+        self.max_retries = 3
+
+    def _extract_page(self, page_num: int) -> Dict[str, Any]:
+        """
+        Extracts a single page and estimates token count.
+        Opens the PDF from self.doc_bytes (a BytesIO object).
+        """
+        import fitz
+
+        try:
+            # Always open the document from in-memory bytes.
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            new_pdf = fitz.open()
+            new_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
+            pdf_bytes = new_pdf.write()
+            text = doc[page_num].get_text("text")
+            token_count = len(text) // 4 if text else len(pdf_bytes) // 4
+
+            return {
+                "page_numbers": page_num + 1,
+                "pdf_bytes": pdf_bytes,
+                "token_count": token_count,
+            }
+        except Exception as e:
+            raise ValueError(f"Error processing PDF document: {e}") from e
+
+    def _extract_pdf_pages_parallel(
+        self, num_workers: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
+        """Parallel PDF page extraction using self.doc_bytes."""
+        from multiprocessing import Pool, cpu_count
+
+        import fitz
+        from tqdm import tqdm
+
+        try:
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            total_pages = len(doc)
+        except Exception as e:
+            raise ValueError(f"Error opening PDF document: {e}") from e
+
+        num_workers = num_workers or cpu_count()
+        with Pool(num_workers) as pool:
+            with tqdm(total=total_pages, desc="Extracting pages", unit="page") as pbar:
+                results = []
+                for result in pool.imap(self._extract_page, range(total_pages)):
+                    results.append(result)
+                    pbar.update(1)
+
+        return results
+
+    def _group_pages_by_token_limit(
+        self, pages: List[Dict[str, Any]], max_tokens: int = DEFAULT_MAX_TOKENS
+    ) -> List[List[Dict[str, Any]]]:
+        """Groups pages into chunks where each chunk is approximately `max_tokens`."""
+        chunks: List[List[Dict[str, Any]]] = []
+        current_chunk: List[Dict[str, Any]] = []
+        current_tokens = 0
+
+        for page in pages:
+            if current_tokens + page["token_count"] > max_tokens and current_chunk:
+                chunks.append(current_chunk)
+                current_chunk = []
+                current_tokens = 0
+
+            current_chunk.append(page)
+            current_tokens += page["token_count"]
+
+        if current_chunk:  # Add remaining pages
+            chunks.append(current_chunk)
+
+        return chunks
+
+    def _merge_pages_into_pdf_with_metadata(
+        self, page_group: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Merges grouped pages into a single binary chunk so that
+        it does not exceed max token limit
+        """
+        import fitz
+
+        merged_pdf = fitz.open()
+        page_numbers = []
+
+        for page in page_group:
+            temp_pdf = fitz.open("pdf", page["pdf_bytes"])
+            merged_pdf.insert_pdf(temp_pdf)
+            page_numbers.append(page["page_numbers"])
+
+        return {
+            "pdf_bytes": merged_pdf.write(),  # Binary PDF data
+            "page_numbers": page_numbers,  # List of page numbers in this chunk
+        }
+
+    def _prepare_pdf_chunks_for_gemini(
+        self,
+        num_workers: Optional[int] = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        split_on_page: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Extracts, groups, and merges PDF pages into chunks with embedded page markers.
+        """
+        from multiprocessing import Pool
+
+        pages = self._extract_pdf_pages_parallel(num_workers)
+
+        if split_on_page:
+            # Each page becomes its own chunk
+            return pages
+        else:
+            # Group pages based on token limit
+            chunks = self._group_pages_by_token_limit(pages, max_tokens)
+            with Pool(num_workers) as pool:
+                pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
+            return pdf_chunks
+
+    async def _send_chunk_to_gemini(
+        self, chunk: Dict[str, Any], gemini_api_key: str
+    ) -> str:
+        """
+        Sends a PDF chunk to the Gemini API and returns the response text.
+        Uses retries with exponential backoff to handle transient failures.
+        """
+        import asyncio
+        import logging
+
+        from google import genai
+        from google.genai import types
+
+        async with self.semaphore:  # Limit concurrent API requests
+            for attempt in range(self.max_retries):
+                try:
+                    client = genai.Client(api_key=gemini_api_key)
+
+                    # Send the request with PDF content and system instructions
+                    response = await client.aio.models.generate_content(
+                        model=self.model_name,
+                        contents=[
+                            types.Part.from_bytes(
+                                data=chunk["pdf_bytes"], mime_type="application/pdf"
+                            ),
+                            self.GEMINI_SYSTEM_INSTRUCTION,
+                        ],
+                    )
+
+                    # Return extracted text if available
+                    return str(response.text) if response.text else ""
+
+                except Exception as e:
+                    # Log error with page numbers for debugging
+                    logging.error(
+                        "Attempt %d failed for pages %s: %s",
+                        attempt + 1,
+                        chunk.get("page_numbers", "Unknown"),
+                        e,
+                    )
+
+                    if attempt < self.max_retries - 1:
+                        # Apply exponential backoff before retrying
+                        delay = self.retry_delay * (2**attempt)
+                        logging.info("Retrying in %s sec...", delay)
+                        await asyncio.sleep(delay)
+                    else:
+                        # Log failure after max retries
+                        logging.error(
+                            "Max retries reached for pages %s",
+                            chunk.get("page_numbers", "Unknown"),
+                        )
+                        break
+
+        return ""  # Return empty string if all retries fail
+
+    async def process_chunks(
+        self, chunks: List[Dict[str, Any]], api_key: str
+    ) -> List[str]:
+        """
+        Processes PDF chunks by sending them to the Gemini API and
+        collecting the results.
+
+        Args:
+            chunks: A list of dictionaries, where each dictionary represents
+                a PDF chunk and contains the PDF data and page numbers.
+            api_key: The Gemini API key.
+        """
+        # To show nice progress bar
+        from tqdm.asyncio import tqdm_asyncio
+
+        # Create a list of asynchronous tasks to send each chunk to Gemini.
+        # Chunk in this case might be single page or group of pages returned
+        # by prepare_pdf_chunks function
+        tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]
+
+        # Gather the results from all tasks, allowing exceptions to be returned.
+        # tqdm_asyncio is wrapper around asyncio.gather
+        gathered_results = await tqdm_asyncio.gather(
+            *tasks, desc="Processing chunks(pages)", unit="chunk"
+        )
+        results = []
+        for i, result in enumerate(gathered_results):
+            chunk = chunks[i]  # Get the corresponding chunk.
+
+            if isinstance(result, Exception):
+                # Handle exceptions that occurred during chunk processing.
+                logging.error(
+                    "Failed to process chunk %s: %s",
+                    chunk.get("page_numbers", "Unknown"),
+                    result,
+                )
+                results.append(
+                    "<!----Error: Could not process chunk %s---->"
+                    % chunk.get("page_numbers", "Unknown")
+                )
+            else:
+                # Process successful results and append page/chunk markers.
+                markdown = str(result)
+                if self.split_on_page:
+                    results.append(
+                        markdown + f"<!----Page-{chunk['page_numbers']}---->"
+                    )
+                else:
+                    results.append(
+                        markdown + f"<!----Chunk-{chunk['page_numbers']}---->"
+                    )
+
+        return results  # Return the list of results.
+
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Iterates over the document pages, extracting content using the
+        Gemini API, saves them to a markdown file, and yields page numbers
+        along with their corresponding content.
+
+        Yields:
+            A generator of tuples, where each tuple contains the page number
+            (int) and the page content (Any).
+        """
+        import asyncio
+        import os
+
+        # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
+        load_dotenv()
+        gemini_api_key = os.getenv("GEMINI_API_KEY")
+        if not gemini_api_key:
+            raise ValueError("GEMINI_API_KEY not found in environment variables.")
+
+        try:
+            # This involves extracting pages, grouping them according to the
+            # `max_tokens` limit (if `split_on_page` is False), and
+            # merging pages into larger PDF chunks. The result
+            # is a list of dictionaries, where each dictionary contains the
+            # PDF bytes and the associated page numbers or single page if
+            # `split_on_page` is true
+
+            pdf_chunks = self._prepare_pdf_chunks_for_gemini(
+                num_workers=8,
+                max_tokens=self.max_tokens,
+                split_on_page=self.split_on_page,
+            )
+
+            # We asynchronously process each chunk, sending it
+            # to Gemini and retrieving the Markdown output. It handles rate
+            # limiting and retries.
+            markdown_results = asyncio.run(
+                self.process_chunks(pdf_chunks, gemini_api_key)
+            )
+
+            # This file serves as an intermediate storage location for the
+            # complete Markdown output.
+            with open(self.output_filename, "w", encoding="utf-8") as outfile:
+                outfile.write("\n\n".join(markdown_results))
+
+            # Read the full Markdown content from the temporary file.
+            with open(self.output_filename, "r", encoding="utf-8") as infile:
+                full_markdown = infile.read()
+
+            # The splitting is based on the `split_on_page` setting. If True,
+            # the Markdown is split using the "Page-" marker. Otherwise, it's
+            # split using the "Chunk-" marker.
+            if self.split_on_page:
+                pages = full_markdown.split("<!----Page-")
+            else:
+                pages = full_markdown.split("<!----Chunk-")
+
+            # Remove the first element if it's empty (due to the split).
+            if pages and pages[0] == "":
+                pages = pages[1:]
+
+            # Iterate over the pages or chunks and yield their content.
+            for i, page in enumerate(pages):
+                # Check for errors during processing.
+                if "<!----Error:" in page:
+                    page_content = page
+                    logging.warning(f"Page {i}: Error processing chunk.")
+                else:
+                    # Extract the actual page content by removing the marker.
+                    page_content = (
+                        page.split("---->", 1)[1]
+                        if len(page.split("---->", 1)) > 1
+                        else page
+                    )
+
+                # Yield the page number and content.
+                yield i, page_content
+
+        except Exception as e:
+            raise ValueError(f"Error processing document: {e}") from e
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get a Document object from a given markdown page.
+        """
+        return Document(
+            content=page,
+            metadata=DocMetaData(source=self.source),
+        )
````
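A hedged usage sketch for the new parser, based only on the factory dispatch and config fields visible in this diff (`sample.pdf` is a placeholder; this assumes `GEMINI_API_KEY` is set in the environment or a `.env` file, and that the `pymupdf` (fitz), `tqdm`, and `google-genai` dependencies are installed):

```python
from langroid.parsing.document_parser import DocumentParser
from langroid.parsing.parser import GeminiConfig, ParsingConfig, PdfParsingConfig

config = ParsingConfig(
    pdf=PdfParsingConfig(
        library="gemini",
        gemini_config=GeminiConfig(
            model_name="gemini-2.0-flash",
            split_on_page=True,      # one LLM request (and one chunk) per page
            requests_per_minute=5,   # also sizes the concurrency semaphore
        ),
    )
)

parser = DocumentParser.create("sample.pdf", config)
for doc in parser.get_doc_chunks():
    print(doc.metadata.source, doc.content[:80])
```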
langroid/parsing/parser.py
CHANGED

```diff
@@ -1,13 +1,13 @@
 import logging
 import re
 from enum import Enum
-from typing import Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional
 
 import tiktoken
 
 from langroid.mytypes import Document
 from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings
+from langroid.pydantic_v1 import BaseSettings, root_validator
 from langroid.utils.object_registry import ObjectRegistry
 
 logger = logging.getLogger(__name__)
@@ -20,7 +20,26 @@ class Splitter(str, Enum):
     SIMPLE = "simple"
 
 
-class PdfParsingConfig(BaseSettings):
+class BaseParsingConfig(BaseSettings):
+    """Base class for document parsing configurations."""
+
+    library: str
+
+    class Config:
+        extra = "ignore"  # Ignore unknown settings
+
+
+class GeminiConfig(BaseSettings):
+    """Configuration for Gemini-based parsing."""
+
+    model_name: str = "gemini-2.0-flash"  # Default model
+    max_tokens: Optional[int] = None
+    split_on_page: Optional[bool] = True
+    requests_per_minute: Optional[int] = 5
+
+
+class PdfParsingConfig(BaseParsingConfig):
+
     library: Literal[
         "fitz",
         "pymupdf4llm",
@@ -29,7 +48,18 @@ class PdfParsingConfig(BaseSettings):
         "unstructured",
         "pdf2image",
         "markitdown",
+        "gemini",
     ] = "pymupdf4llm"
+    gemini_config: Optional[GeminiConfig] = None
+
+    @root_validator(pre=True)
+    def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure GeminiConfig is set only when library is 'gemini'."""
+        if values.get("library") == "gemini":
+            values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
+        else:
+            values["gemini_config"] = None
+        return values
 
 
 class DocxParsingConfig(BaseSettings):
```
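Because the validator runs with `pre=True`, choosing the `gemini` library is enough to get a default `GeminiConfig`, while any other library drops one that was passed explicitly. A short sketch of that behavior, assuming this version of the package is installed:

```python
from langroid.parsing.parser import GeminiConfig, PdfParsingConfig

cfg = PdfParsingConfig(library="gemini")
assert isinstance(cfg.gemini_config, GeminiConfig)  # auto-created
assert cfg.gemini_config.model_name == "gemini-2.0-flash"

# Any other library resets gemini_config to None, even if supplied.
cfg = PdfParsingConfig(library="fitz", gemini_config=GeminiConfig())
assert cfg.gemini_config is None
```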
langroid/utils/system.py
CHANGED

```diff
@@ -14,7 +14,12 @@ from typing import Any, Literal
 
 logger = logging.getLogger(__name__)
 
-DELETION_ALLOWED_PATHS = [".qdrant", ".chroma", ".lancedb"]
+DELETION_ALLOWED_PATHS = [
+    ".qdrant",
+    ".chroma",
+    ".lancedb",
+    ".weaviate",
+]
 
 
 def pydantic_major_version() -> int:
```
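The allow-list gains a `.weaviate` entry, matching the weaviate vector store already shipped in `langroid/vector_store/weaviatedb.py`. A sketch of the kind of guard such an allow-list supports; `safe_rmdir` is illustrative, not langroid's actual helper:

```python
import shutil
from pathlib import Path

DELETION_ALLOWED_PATHS = [".qdrant", ".chroma", ".lancedb", ".weaviate"]


def safe_rmdir(path: str) -> None:
    """Delete a directory only if its basename is on the allow-list."""
    if Path(path).name not in DELETION_ALLOWED_PATHS:
        raise ValueError(f"Refusing to delete non-allow-listed path: {path}")
    shutil.rmtree(path, ignore_errors=True)


safe_rmdir(".weaviate")  # permitted after this change
```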
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/METADATA
CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langroid
-Version: 0.42.9
+Version: 0.43.0
 Summary: Harness LLMs with Multi-Agent Programming
 Author-email: Prasad Chalasani <pchalasani@gmail.com>
 License: MIT
@@ -86,6 +86,8 @@ Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
 Provides-Extra: arango
 Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
 Requires-Dist: python-arango<9.0.0,>=8.1.2; extra == 'arango'
+Provides-Extra: asyncio
+Requires-Dist: asyncio>=3.4.3; extra == 'asyncio'
 Provides-Extra: chainlit
 Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
 Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
```
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/RECORD
CHANGED

```diff
@@ -3,12 +3,12 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
 langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
 langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=
+langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
 langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
-langroid/agent/chat_agent.py,sha256=
+langroid/agent/chat_agent.py,sha256=yuuEWVFLIN71XUpxdbhwZxEKAbOWG7zAV3ofYX4lCWg,84443
 langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
 langroid/agent/openai_assistant.py,sha256=JkAcs02bIrgPNVvUWVR06VCthc5-ulla2QMBzux_q6o,34340
-langroid/agent/task.py,sha256=
+langroid/agent/task.py,sha256=HB6N-Jn80HFqCf0ZYOC1v3Bn3oO7NLjShHQJJFwW0q4,90557
 langroid/agent/tool_message.py,sha256=BhjP-_TfQ2tgxuY4Yo_JHLOwwt0mJ4BwjPnREvEY4vk,14744
 langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a5uO9E,15054
 langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,7 +72,7 @@ langroid/language_models/base.py,sha256=is4l3x858tdPHbrJU2jxJXe2j9PCGb9kk_c5nyfS
 langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
 langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
 langroid/language_models/model_info.py,sha256=_EidEMIgAMx0RuELAf5Ans0yiE1QllybZALw5o-1HJg,12265
-langroid/language_models/openai_gpt.py,sha256=
+langroid/language_models/openai_gpt.py,sha256=lOQcExZO5Tja35Xi4F2HcG8pE-2LEnGrHwLTXLOOagk,77367
 langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
 langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
 langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
 langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
 langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
 langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=
+langroid/parsing/document_parser.py,sha256=tov34uYB_2ecq7-G7P7CWSOv5alcfwkrrwfsnCCVdIk,49714
 langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
 langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=
+langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
 langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
 langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
 langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -110,7 +110,7 @@ langroid/utils/logging.py,sha256=mwxHimq1wtVQ64PvDyfJJ7Upj-rjHLNHgx8EC2wClvo,402
 langroid/utils/object_registry.py,sha256=iPz9GHzvmCeVoidB3JdAMEKcxJEqTdUr0otQEexDZ5s,2100
 langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
 langroid/utils/pydantic_utils.py,sha256=R7Ps8VP56-eSo-LYHWllFo-SJ2zDmdItuuYpUq2gGJ8,20854
-langroid/utils/system.py,sha256=
+langroid/utils/system.py,sha256=q3QJtTSapIwNe8MMhGEM03wgxPLmZiD47_sF1pKx53I,8472
 langroid/utils/types.py,sha256=-BvyIf_LmAJ5jR9NC7S4CSVNEr3XayAaxJ5o0TiIej0,2992
 langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
 langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
 langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
 langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
 langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
-langroid-0.42.9.dist-info/METADATA,sha256=
-langroid-0.42.9.dist-info/WHEEL,sha256=
-langroid-0.42.9.dist-info/licenses/LICENSE,sha256=
-langroid-0.42.9.dist-info/RECORD,,
+langroid-0.43.0.dist-info/METADATA,sha256=3BipLtBKwh-Ob9F-PRnmRPJIYPGgAdm_xzP57fJEi6E,61773
+langroid-0.43.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.43.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.43.0.dist-info/RECORD,,
```
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/WHEEL
File without changes

{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/licenses/LICENSE
File without changes