langroid 0.50.12__tar.gz → 0.51.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langroid-0.50.12 → langroid-0.51.0}/PKG-INFO +1 -1
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/document_parser.py +94 -48
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/parser.py +8 -8
- {langroid-0.50.12 → langroid-0.51.0}/pyproject.toml +1 -1
- {langroid-0.50.12 → langroid-0.51.0}/.gitignore +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/LICENSE +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/README.md +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/batch.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/callbacks/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/callbacks/chainlit.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/chat_document.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/openai_assistant.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/arangodb/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/arangodb/arangodb_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/arangodb/system_messages.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/arangodb/tools.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/arangodb/utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/doc_chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/doc_chat_task.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_doc_chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_rag/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_rag/critic_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_rag/lance_rag_task.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_rag/query_planner_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_tools.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/neo4j/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/neo4j/csv_kg_chat.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/neo4j/neo4j_chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/neo4j/system_messages.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/neo4j/tools.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/relevance_extractor_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/retriever_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/sql_chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/description_extractors.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/populate_metadata.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/system_message.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/tools.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/table_chat_agent.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/task.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tool_message.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/duckduckgo_search_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/exa_search_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/file_tools.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/google_search_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/metaphor_search_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/orchestration.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/recipient_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/retrieval_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/rewind_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/segment_extract_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/tools/tavily_search_tool.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/agent/xml_tool_message.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/cachedb/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/cachedb/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/cachedb/momento_cachedb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/cachedb/redis_cachedb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/models.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/embeddings.proto +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/embeddings_pb2.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/embeddings_pb2.pyi +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/remote_embeds.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/exceptions.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/azure_openai.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/config.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/mock_lm.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/model_info.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/openai_gpt.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/hf_formatter.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/llama2_formatter.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/mytypes.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/agent_chats.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/code_parser.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/md_parser.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/para_sentence_split.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/parse_json.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/pdf_utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/repo_loader.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/routing.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/search.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/spider.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/table_loader.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/url_loader.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/urls.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/parsing/web_search.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/prompts/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/prompts/dialog.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/prompts/prompts_config.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/prompts/templates.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/py.typed +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/pydantic_v1/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/pydantic_v1/main.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/algorithms/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/algorithms/graph.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/configuration.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/constants.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/git_utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/globals.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/logging.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/object_registry.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/output/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/output/citations.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/output/printing.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/output/status.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/pandas_utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/pydantic_utils.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/system.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/utils/types.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/__init__.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/base.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/chromadb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/lancedb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/meilisearch.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/pineconedb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/postgres.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/qdrantdb.py +0 -0
- {langroid-0.50.12 → langroid-0.51.0}/langroid/vector_store/weaviatedb.py +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import base64
|
3
4
|
import itertools
|
4
5
|
import logging
|
5
6
|
import os
|
@@ -148,8 +149,8 @@ class DocumentParser(Parser):
|
|
148
149
|
return UnstructuredPDFParser(source, config)
|
149
150
|
elif config.pdf.library == "pdf2image":
|
150
151
|
return ImagePdfParser(source, config)
|
151
|
-
elif config.pdf.library == "
|
152
|
-
return
|
152
|
+
elif config.pdf.library == "llm-pdf-parser":
|
153
|
+
return LLMPdfParser(source, config)
|
153
154
|
elif config.pdf.library == "marker":
|
154
155
|
return MarkerPdfParser(source, config)
|
155
156
|
else:
|
@@ -993,13 +994,13 @@ class MarkitdownPPTXParser(DocumentParser):
|
|
993
994
|
)
|
994
995
|
|
995
996
|
|
996
|
-
class
|
997
|
+
class LLMPdfParser(DocumentParser):
|
997
998
|
"""
|
998
|
-
This class converts PDFs to Markdown using
|
999
|
+
This class converts PDFs to Markdown using multimodal LLMs.
|
999
1000
|
|
1000
1001
|
It extracts pages, converts them with the LLM (replacing images with
|
1001
1002
|
detailed descriptions), and outputs Markdown page by page. The
|
1002
|
-
conversion follows `
|
1003
|
+
conversion follows `LLM_PDF_MD_SYSTEM_INSTRUCTION`. It employs
|
1003
1004
|
multiprocessing for speed, async requests with rate limiting, and
|
1004
1005
|
handles errors.
|
1005
1006
|
|
@@ -1008,9 +1009,9 @@ class GeminiPdfParser(DocumentParser):
|
|
1008
1009
|
"""
|
1009
1010
|
|
1010
1011
|
DEFAULT_MAX_TOKENS = 7000
|
1011
|
-
OUTPUT_DIR = Path(".
|
1012
|
+
OUTPUT_DIR = Path(".llm_pdfparser") # Fixed output directory
|
1012
1013
|
|
1013
|
-
|
1014
|
+
LLM_PDF_MD_SYSTEM_INSTRUCTION = """
|
1014
1015
|
### **Convert PDF to Markdown**
|
1015
1016
|
1. **Text:**
|
1016
1017
|
* Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
|
@@ -1035,11 +1036,11 @@ class GeminiPdfParser(DocumentParser):
|
|
1035
1036
|
|
1036
1037
|
def __init__(self, source: Union[str, bytes], config: ParsingConfig):
|
1037
1038
|
super().__init__(source, config)
|
1038
|
-
if not config.pdf.
|
1039
|
+
if not config.pdf.llm_parser_config:
|
1039
1040
|
raise ValueError(
|
1040
|
-
"
|
1041
|
+
"LLMPdfParser requires a llm-based config in pdf parsing config"
|
1041
1042
|
)
|
1042
|
-
self.model_name = config.pdf.
|
1043
|
+
self.model_name = config.pdf.llm_parser_config.model_name
|
1043
1044
|
|
1044
1045
|
# Ensure output directory exists
|
1045
1046
|
self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
@@ -1058,7 +1059,9 @@ class GeminiPdfParser(DocumentParser):
|
|
1058
1059
|
temp_file.close()
|
1059
1060
|
self.output_filename = Path(temp_file.name)
|
1060
1061
|
|
1061
|
-
self.max_tokens =
|
1062
|
+
self.max_tokens = (
|
1063
|
+
config.pdf.llm_parser_config.max_tokens or self.DEFAULT_MAX_TOKENS
|
1064
|
+
)
|
1062
1065
|
|
1063
1066
|
"""
|
1064
1067
|
If True, each PDF page is processed as a separate chunk,
|
@@ -1066,12 +1069,12 @@ class GeminiPdfParser(DocumentParser):
|
|
1066
1069
|
grouped into chunks based on `max_token_limit` before being sent
|
1067
1070
|
to the LLM.
|
1068
1071
|
"""
|
1069
|
-
self.split_on_page = config.pdf.
|
1072
|
+
self.split_on_page = config.pdf.llm_parser_config.split_on_page or False
|
1070
1073
|
|
1071
1074
|
# Rate limiting parameters
|
1072
1075
|
import asyncio
|
1073
1076
|
|
1074
|
-
self.requests_per_minute = config.pdf.
|
1077
|
+
self.requests_per_minute = config.pdf.llm_parser_config.requests_per_minute or 5
|
1075
1078
|
|
1076
1079
|
"""
|
1077
1080
|
A semaphore to control the number of concurrent requests to the LLM,
|
@@ -1175,7 +1178,7 @@ class GeminiPdfParser(DocumentParser):
|
|
1175
1178
|
"page_numbers": page_numbers, # List of page numbers in this chunk
|
1176
1179
|
}
|
1177
1180
|
|
1178
|
-
def
|
1181
|
+
def _prepare_pdf_chunks_for_llm(
|
1179
1182
|
self,
|
1180
1183
|
num_workers: Optional[int] = None,
|
1181
1184
|
max_tokens: int = DEFAULT_MAX_TOKENS,
|
@@ -1198,37 +1201,92 @@ class GeminiPdfParser(DocumentParser):
|
|
1198
1201
|
pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
|
1199
1202
|
return pdf_chunks
|
1200
1203
|
|
1201
|
-
async def
|
1202
|
-
self, chunk: Dict[str, Any], gemini_api_key: str
|
1203
|
-
) -> str:
|
1204
|
+
async def _send_chunk_to_llm(self, chunk: Dict[str, Any]) -> str:
|
1204
1205
|
"""
|
1205
|
-
Sends a PDF chunk to the
|
1206
|
+
Sends a PDF chunk to the LLM API and returns the response text.
|
1206
1207
|
Uses retries with exponential backoff to handle transient failures.
|
1207
1208
|
"""
|
1208
1209
|
import asyncio
|
1209
1210
|
import logging
|
1210
1211
|
|
1211
|
-
from
|
1212
|
-
from google.genai import types
|
1212
|
+
from langroid.language_models.openai_gpt import OpenAIGPT, OpenAIGPTConfig
|
1213
1213
|
|
1214
1214
|
async with self.semaphore: # Limit concurrent API requests
|
1215
1215
|
for attempt in range(self.max_retries):
|
1216
1216
|
try:
|
1217
|
-
|
1217
|
+
llm_config = OpenAIGPTConfig(
|
1218
|
+
chat_model=self.model_name,
|
1219
|
+
max_output_tokens=self.max_tokens,
|
1220
|
+
)
|
1221
|
+
llm = OpenAIGPT(config=llm_config)
|
1222
|
+
base64_string = base64.b64encode(chunk["pdf_bytes"]).decode("utf-8")
|
1223
|
+
data_uri = f"data:application/pdf;base64,{base64_string}"
|
1224
|
+
if "gemini" in self.model_name.lower():
|
1225
|
+
file_content = dict(
|
1226
|
+
type="image_url",
|
1227
|
+
image_url=dict(url=data_uri),
|
1228
|
+
)
|
1229
|
+
elif "claude" in self.model_name.lower() and llm.is_litellm_proxy:
|
1230
|
+
file_content = dict(
|
1231
|
+
type="file",
|
1232
|
+
file=dict(
|
1233
|
+
file_data=data_uri,
|
1234
|
+
),
|
1235
|
+
)
|
1236
|
+
else:
|
1237
|
+
if not llm.is_openai_chat_model():
|
1238
|
+
logger.warning(
|
1239
|
+
f"""
|
1240
|
+
File uploads may not be supported for this model
|
1241
|
+
{self.model_name}. But attempting to
|
1242
|
+
use OpenAI-like file upload.
|
1243
|
+
""",
|
1244
|
+
)
|
1245
|
+
file_content = dict(
|
1246
|
+
type="file",
|
1247
|
+
file=dict(
|
1248
|
+
filename="dummy.pdf",
|
1249
|
+
file_data=data_uri,
|
1250
|
+
),
|
1251
|
+
)
|
1218
1252
|
|
1219
1253
|
# Send the request with PDF content and system instructions
|
1220
|
-
response = await
|
1221
|
-
model=self.model_name,
|
1222
|
-
|
1223
|
-
|
1224
|
-
|
1254
|
+
response = await llm.async_client.chat.completions.create( # type: ignore
|
1255
|
+
model=self.model_name.split("/")[-1],
|
1256
|
+
messages=[
|
1257
|
+
dict(
|
1258
|
+
role="system",
|
1259
|
+
content="""
|
1260
|
+
You are an expert pdf -> markdown converter.
|
1261
|
+
Do NOT use any triple backquotes when you present the
|
1262
|
+
markdown content,like ```markdown etc.
|
1263
|
+
FAITHFULLY CONVERT THE PDF TO MARKDOWN,
|
1264
|
+
retaining ALL content as you find it.
|
1265
|
+
""",
|
1266
|
+
),
|
1267
|
+
dict( # type: ignore
|
1268
|
+
role="user",
|
1269
|
+
content=[
|
1270
|
+
dict(
|
1271
|
+
type="text",
|
1272
|
+
text=self.LLM_PDF_MD_SYSTEM_INSTRUCTION,
|
1273
|
+
),
|
1274
|
+
file_content,
|
1275
|
+
],
|
1225
1276
|
),
|
1226
|
-
self.GEMINI_SYSTEM_INSTRUCTION,
|
1227
1277
|
],
|
1228
1278
|
)
|
1229
1279
|
|
1230
1280
|
# Return extracted text if available
|
1231
|
-
return
|
1281
|
+
return (
|
1282
|
+
""
|
1283
|
+
if (
|
1284
|
+
response is None
|
1285
|
+
or not hasattr(response, "choices")
|
1286
|
+
or not isinstance(response.choices, list)
|
1287
|
+
)
|
1288
|
+
else (response.choices[0].message.content)
|
1289
|
+
)
|
1232
1290
|
|
1233
1291
|
except Exception as e:
|
1234
1292
|
# Log error with page numbers for debugging
|
@@ -1251,28 +1309,24 @@ class GeminiPdfParser(DocumentParser):
|
|
1251
1309
|
chunk.get("page_numbers", "Unknown"),
|
1252
1310
|
)
|
1253
1311
|
break
|
1254
|
-
|
1255
1312
|
return "" # Return empty string if all retries fail
|
1256
1313
|
|
1257
|
-
async def process_chunks(
|
1258
|
-
self, chunks: List[Dict[str, Any]], api_key: str
|
1259
|
-
) -> List[str]:
|
1314
|
+
async def process_chunks(self, chunks: List[Dict[str, Any]]) -> List[str]:
|
1260
1315
|
"""
|
1261
|
-
Processes PDF chunks by sending them to the
|
1316
|
+
Processes PDF chunks by sending them to the LLM API and
|
1262
1317
|
collecting the results.
|
1263
1318
|
|
1264
1319
|
Args:
|
1265
1320
|
chunks: A list of dictionaries, where each dictionary represents
|
1266
1321
|
a PDF chunk and contains the PDF data and page numbers.
|
1267
|
-
api_key: The Gemini API key.
|
1268
1322
|
"""
|
1269
1323
|
# To show nice progress bar
|
1270
1324
|
from tqdm.asyncio import tqdm_asyncio
|
1271
1325
|
|
1272
|
-
# Create a list of asynchronous tasks to send each chunk to
|
1326
|
+
# Create a list of asynchronous tasks to send each chunk to the LLM.
|
1273
1327
|
# Chunk in this case might be single page or group of pages returned
|
1274
1328
|
# by prepare_pdf_chunks function
|
1275
|
-
tasks = [self.
|
1329
|
+
tasks = [self._send_chunk_to_llm(chunk) for chunk in chunks]
|
1276
1330
|
|
1277
1331
|
# Gather the results from all tasks, allowing exceptions to be returned.
|
1278
1332
|
# tqdm_asyncio is wrapper around asyncio.gather
|
@@ -1311,7 +1365,7 @@ class GeminiPdfParser(DocumentParser):
|
|
1311
1365
|
def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
|
1312
1366
|
"""
|
1313
1367
|
Iterates over the document pages, extracting content using the
|
1314
|
-
|
1368
|
+
LLM API, saves them to a markdown file, and yields page numbers
|
1315
1369
|
along with their corresponding content.
|
1316
1370
|
|
1317
1371
|
Yields:
|
@@ -1319,14 +1373,8 @@ class GeminiPdfParser(DocumentParser):
|
|
1319
1373
|
(int) and the page content (Any).
|
1320
1374
|
"""
|
1321
1375
|
import asyncio
|
1322
|
-
import os
|
1323
1376
|
|
1324
|
-
# Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
|
1325
1377
|
load_dotenv()
|
1326
|
-
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
1327
|
-
if not gemini_api_key:
|
1328
|
-
raise ValueError("GEMINI_API_KEY not found in environment variables.")
|
1329
|
-
|
1330
1378
|
try:
|
1331
1379
|
# This involves extracting pages, grouping them according to the
|
1332
1380
|
# `max_tokens` limit (if `split_on_page` is False), and
|
@@ -1335,18 +1383,16 @@ class GeminiPdfParser(DocumentParser):
|
|
1335
1383
|
# PDF bytes and the associated page numbers or single page if
|
1336
1384
|
# `split_on_page` is true
|
1337
1385
|
|
1338
|
-
pdf_chunks = self.
|
1386
|
+
pdf_chunks = self._prepare_pdf_chunks_for_llm(
|
1339
1387
|
num_workers=8,
|
1340
1388
|
max_tokens=self.max_tokens,
|
1341
1389
|
split_on_page=self.split_on_page,
|
1342
1390
|
)
|
1343
1391
|
|
1344
1392
|
# We asynchronously processes each chunk, sending it
|
1345
|
-
# to
|
1393
|
+
# to the LLM and retrieving the Markdown output. It handles rate
|
1346
1394
|
# limiting and retries.
|
1347
|
-
markdown_results = asyncio.run(
|
1348
|
-
self.process_chunks(pdf_chunks, gemini_api_key)
|
1349
|
-
)
|
1395
|
+
markdown_results = asyncio.run(self.process_chunks(pdf_chunks))
|
1350
1396
|
|
1351
1397
|
# This file serves as an intermediate storage location for the
|
1352
1398
|
# complete Markdown output.
|
@@ -36,10 +36,10 @@ class BaseParsingConfig(BaseSettings):
|
|
36
36
|
extra = "ignore" # Ignore unknown settings
|
37
37
|
|
38
38
|
|
39
|
-
class
|
40
|
-
"""Configuration for
|
39
|
+
class LLMPdfParserConfig(BaseSettings):
|
40
|
+
"""Configuration for LLM-based parsing."""
|
41
41
|
|
42
|
-
model_name: str = "gemini-2.0-flash" # Default model
|
42
|
+
model_name: str = "gemini/gemini-2.0-flash" # Default model
|
43
43
|
max_tokens: Optional[int] = None
|
44
44
|
split_on_page: Optional[bool] = True
|
45
45
|
requests_per_minute: Optional[int] = 5
|
@@ -60,10 +60,10 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
60
60
|
"unstructured",
|
61
61
|
"pdf2image",
|
62
62
|
"markitdown",
|
63
|
-
"
|
63
|
+
"llm-pdf-parser",
|
64
64
|
"marker",
|
65
65
|
] = "pymupdf4llm"
|
66
|
-
|
66
|
+
llm_parser_config: Optional[LLMPdfParserConfig] = None
|
67
67
|
marker_config: Optional[MarkerConfig] = None
|
68
68
|
|
69
69
|
@root_validator(pre=True)
|
@@ -71,10 +71,10 @@ class PdfParsingConfig(BaseParsingConfig):
|
|
71
71
|
"""Ensure correct config is set based on library selection."""
|
72
72
|
library = values.get("library")
|
73
73
|
|
74
|
-
if library == "
|
75
|
-
values.setdefault("
|
74
|
+
if library == "llm-pdf-parser":
|
75
|
+
values.setdefault("llm_parser_config", LLMPdfParserConfig())
|
76
76
|
else:
|
77
|
-
values["
|
77
|
+
values["llm_parser_config"] = None
|
78
78
|
|
79
79
|
if library == "marker":
|
80
80
|
values.setdefault("marker_config", MarkerConfig())
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/lance_rag/query_planner_agent.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.50.12 → langroid-0.51.0}/langroid/agent/special/sql/utils/description_extractors.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.50.12 → langroid-0.51.0}/langroid/embedding_models/protoc/embeddings_pb2_grpc.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/hf_formatter.py
RENAMED
File without changes
|
{langroid-0.50.12 → langroid-0.51.0}/langroid/language_models/prompt_formatter/llama2_formatter.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|