langroid 0.42.9__py3-none-any.whl → 0.43.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
langroid/agent/base.py CHANGED
@@ -1148,7 +1148,9 @@ class Agent(ABC):
            and msg.function_call is None
        ):

-            tools = self.get_formatted_tool_messages(msg.content)
+            tools = self.get_formatted_tool_messages(
+                msg.content, from_llm=msg.metadata.sender == Entity.LLM
+            )
            msg.all_tool_messages = tools
            # filter for actually handle-able tools, and recipient is this agent
            my_tools = [t for t in tools if self._tool_recipient_match(t)]
@@ -1177,7 +1179,9 @@ class Agent(ABC):
        else:
            return my_tools

-    def get_formatted_tool_messages(self, input_str: str) -> List[ToolMessage]:
+    def get_formatted_tool_messages(
+        self, input_str: str, from_llm: bool = True
+    ) -> List[ToolMessage]:
        """
        Returns ToolMessage objects (tools) corresponding to
        tool-formatted substrings, if any.
@@ -1190,6 +1194,8 @@ class Agent(ABC):

        Args:
            input_str (str): input string, typically a message sent by an LLM
+            from_llm (bool): whether the input was generated by the LLM. If so,
+                we track malformed tool calls.

        Returns:
            List[ToolMessage]: list of ToolMessage objects
@@ -1203,7 +1209,7 @@ class Agent(ABC):
        if not is_json:
            return []

-        results = [self._get_one_tool_message(j, is_json) for j in substrings]
+        results = [self._get_one_tool_message(j, is_json, from_llm) for j in substrings]
        valid_results = [r for r in results if r is not None]
        # If any tool is correctly formed we do not set the flag
        if len(valid_results) > 0:
@@ -1219,6 +1225,7 @@ class Agent(ABC):
            return None
        tool_name = msg.function_call.name
        tool_msg = msg.function_call.arguments or {}
+        self.tool_error = False
        if tool_name not in self.llm_tools_handled:
            logger.warning(
                f"""
@@ -1230,10 +1237,12 @@ class Agent(ABC):
                or you need to enable this agent to handle this fn-call.
                """
            )
-            if tool_name not in self.all_llm_tools_known:
+            if (
+                tool_name not in self.all_llm_tools_known
+                and msg.metadata.sender == Entity.LLM
+            ):
                self.tool_error = True
            return None
-        self.tool_error = False
        tool_class = self.llm_tools_map[tool_name]
        tool_msg.update(dict(request=tool_name))
        tool = tool_class.parse_obj(tool_msg)
@@ -1272,8 +1281,9 @@ class Agent(ABC):
                tool = tool_class.parse_obj(tool_msg)
                tool.id = tc.id or ""
                tools.append(tool)
-        # When no tool is valid, set the recovery flag
-        self.tool_error = all_errors
+        # When no tool is valid and the message was produced
+        # by the LLM, set the recovery flag
+        self.tool_error = all_errors and msg.metadata.sender == Entity.LLM
        return tools

    def tool_validation_error(self, ve: ValidationError) -> str:
@@ -1508,7 +1518,7 @@ class Agent(ABC):
        return None

    def _get_one_tool_message(
-        self, tool_candidate_str: str, is_json: bool = True
+        self, tool_candidate_str: str, is_json: bool = True, from_llm: bool = True
    ) -> Optional[ToolMessage]:
        """
        Parse the tool_candidate_str into ANY ToolMessage KNOWN to agent --
@@ -1545,7 +1555,7 @@ class Agent(ABC):
        # }

        if not isinstance(maybe_tool_dict, dict):
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        properties = maybe_tool_dict.get("properties")
@@ -1593,23 +1603,23 @@ class Agent(ABC):
            if len(candidate_tools) == 1:
                return candidate_tools[0]
            else:
-                self.tool_error = True
+                self.tool_error = from_llm
                return None

        if not isinstance(request, str) or request not in self.all_llm_tools_known:
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        message_class = self.llm_tools_map.get(request)
        if message_class is None:
            logger.warning(f"No message class found for request '{request}'")
-            self.tool_error = True
+            self.tool_error = from_llm
            return None

        try:
            message = message_class.parse_obj(maybe_tool_dict)
        except ValidationError as ve:
-            self.tool_error = True
+            self.tool_error = from_llm
            raise ve
        return message

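Taken together, the base.py changes gate the strict-recovery flag on the message's sender: malformed tool text only sets `tool_error` when the LLM itself produced it. A minimal self-contained sketch of that behavior (illustrative only; `Msg` and `MiniAgent` are hypothetical stand-ins, not langroid classes):

```python
from dataclasses import dataclass, field

@dataclass
class Msg:
    content: str
    sender: str  # "LLM" or "USER"; stands in for langroid's Entity enum

@dataclass
class MiniAgent:
    known_tools: set = field(default_factory=lambda: {"search"})
    tool_error: bool = False

    def get_tools(self, msg: Msg) -> list:
        from_llm = msg.sender == "LLM"
        name = msg.content.strip()          # pretend content is a tool name
        if name not in self.known_tools:
            self.tool_error = from_llm      # gated by sender, as in 0.43.0
            return []
        return [name]

agent = MiniAgent()
agent.get_tools(Msg("serach", sender="USER"))   # malformed tool from a user
assert agent.tool_error is False                # no recovery triggered
agent.get_tools(Msg("serach", sender="LLM"))    # same mistake from the LLM
assert agent.tool_error is True                 # recovery triggered
```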
langroid/agent/chat_agent.py CHANGED
@@ -1096,7 +1096,10 @@ class ChatAgent(Agent):
            else:
                # We will trigger the strict recovery mechanism to force
                # the LLM to correct its output, allowing us to parse
-                self.tool_error = True
+                if isinstance(msg, ChatDocument):
+                    self.tool_error = msg.metadata.sender == Entity.LLM
+                else:
+                    self.tool_error = True

                raise ve

@@ -1265,6 +1268,7 @@ class ChatAgent(Agent):
            and self._json_schema_available()
            and self.config.strict_recovery
        ):
+            self.tool_error = False
            AnyTool = self._get_any_tool_message()
            if AnyTool is None:
                return None
@@ -1352,6 +1356,7 @@ class ChatAgent(Agent):
            and self._json_schema_available()
            and self.config.strict_recovery
        ):
+            self.tool_error = False
            AnyTool = self._get_any_tool_message()
            self.set_output_format(
                AnyTool,
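The two `self.tool_error = False` additions clear the flag at the moment strict recovery begins, so a completed recovery round cannot leave a stale error behind that would re-trigger recovery on the next turn. A rough sketch of the pattern (hypothetical names, not the langroid API):

```python
class Recoverer:
    """Hypothetical stand-in for the ChatAgent strict-recovery flow."""

    def __init__(self) -> None:
        self.tool_error = False

    def strict_recovery(self, retry_llm_call):
        # Clear the stale flag on entry, as chat_agent.py now does, so
        # only a *new* malformed output can set it again.
        self.tool_error = False
        return retry_llm_call()

r = Recoverer()
r.tool_error = True                               # malformed output was seen
out = r.strict_recovery(lambda: '{"request": "search"}')
assert out and r.tool_error is False              # flag no longer stuck on True
```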
langroid/agent/task.py CHANGED
@@ -1572,7 +1572,10 @@ class Task:
            response_fn = self._entity_responder_async_map[cast(Entity, e)]
            result = await response_fn(self.pending_message)
            # update result.tool_messages if any
-            if isinstance(result, ChatDocument):
+            if (
+                isinstance(result, ChatDocument)
+                and result.metadata.sender == Entity.LLM
+            ):
                self.agent.try_get_tool_messages(result)

        result_chat_doc = self.agent.to_ChatDocument(
langroid/language_models/openai_gpt.py CHANGED
@@ -85,9 +85,6 @@ GLHF_BASE_URL = "https://glhf.chat/api/openai/v1"
OLLAMA_API_KEY = "ollama"
DUMMY_API_KEY = "xxx"

-VLLM_API_KEY = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
-LLAMACPP_API_KEY = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
-

openai_chat_model_pref_list = [
    OpenAIChatModel.GPT4o,
@@ -421,6 +418,9 @@ class OpenAIGPT(LanguageModel):
        self.supports_json_schema: bool = self.config.supports_json_schema or False
        self.supports_strict_tools: bool = self.config.supports_strict_tools or False

+        OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
+        self.api_key = config.api_key
+
        # if model name starts with "litellm",
        # set the actual model name by stripping the "litellm/" prefix
        # and set the litellm flag to True
@@ -449,12 +449,14 @@ class OpenAIGPT(LanguageModel):

            # use api_base from config if set, else fall back on OLLAMA_BASE_URL
            self.api_base = self.config.api_base or OLLAMA_BASE_URL
-            self.api_key = OLLAMA_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = OLLAMA_API_KEY
            self.config.chat_model = self.config.chat_model.replace("ollama/", "")
        elif self.config.chat_model.startswith("vllm/"):
            self.supports_json_schema = True
            self.config.chat_model = self.config.chat_model.replace("vllm/", "")
-            self.api_key = VLLM_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("VLLM_API_KEY", DUMMY_API_KEY)
            self.api_base = self.config.api_base or "http://localhost:8000/v1"
            if not self.api_base.startswith("http"):
                self.api_base = "http://" + self.api_base
@@ -465,7 +467,8 @@ class OpenAIGPT(LanguageModel):
            self.api_base = self.config.chat_model.split("/", 1)[1]
            if not self.api_base.startswith("http"):
                self.api_base = "http://" + self.api_base
-            self.api_key = LLAMACPP_API_KEY
+            if self.api_key == OPENAI_API_KEY:
+                self.api_key = os.environ.get("LLAMA_API_KEY", DUMMY_API_KEY)
        else:
            self.api_base = self.config.api_base
            # If api_base is unset we use OpenAI's endpoint, which supports
@@ -487,11 +490,6 @@ class OpenAIGPT(LanguageModel):
        if self.config.use_completion_for_chat:
            self.config.use_chat_for_completion = False

-        self.api_key = config.api_key
-        if self.is_openai_completion_model() or self.is_openai_chat_model():
-            if self.api_key == DUMMY_API_KEY:
-                self.api_key = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
-
        self.is_groq = self.config.chat_model.startswith("groq/")
        self.is_cerebras = self.config.chat_model.startswith("cerebras/")
        self.is_gemini = self.is_gemini_model()
@@ -502,7 +500,7 @@ class OpenAIGPT(LanguageModel):
        if self.is_groq:
            # use groq-specific client
            self.config.chat_model = self.config.chat_model.replace("groq/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GROQ_API_KEY", DUMMY_API_KEY)
            self.client = Groq(
                api_key=self.api_key,
@@ -513,7 +511,7 @@ class OpenAIGPT(LanguageModel):
        elif self.is_cerebras:
            # use cerebras-specific client
            self.config.chat_model = self.config.chat_model.replace("cerebras/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("CEREBRAS_API_KEY", DUMMY_API_KEY)
            self.client = Cerebras(
                api_key=self.api_key,
@@ -526,25 +524,25 @@ class OpenAIGPT(LanguageModel):
        # in these cases, there's no specific client: OpenAI python client suffices
        if self.is_gemini:
            self.config.chat_model = self.config.chat_model.replace("gemini/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GEMINI_API_KEY", DUMMY_API_KEY)
            self.api_base = GEMINI_BASE_URL
        elif self.is_glhf:
            self.config.chat_model = self.config.chat_model.replace("glhf/", "")
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("GLHF_API_KEY", DUMMY_API_KEY)
            self.api_base = GLHF_BASE_URL
        elif self.is_openrouter:
            self.config.chat_model = self.config.chat_model.replace(
                "openrouter/", ""
            )
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("OPENROUTER_API_KEY", DUMMY_API_KEY)
            self.api_base = OPENROUTER_BASE_URL
        elif self.is_deepseek:
            self.config.chat_model = self.config.chat_model.replace("deepseek/", "")
            self.api_base = DEEPSEEK_BASE_URL
-            if self.api_key == DUMMY_API_KEY:
+            if self.api_key == OPENAI_API_KEY:
                self.api_key = os.getenv("DEEPSEEK_API_KEY", DUMMY_API_KEY)

        self.client = OpenAI(
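The net effect of these openai_gpt.py changes is a new precedence order for API keys: an `api_key` set explicitly in the config is kept even for provider-prefixed models, and the provider-specific environment variable is consulted only when the configured key still equals the ambient `OPENAI_API_KEY` default. A condensed sketch of that order (not the actual class; `resolve_api_key` is a hypothetical helper):

```python
import os

DUMMY_API_KEY = "xxx"  # placeholder langroid uses when no key is found

def resolve_api_key(config_key: str, provider_env_var: str) -> str:
    """Sketch of the 0.43.0 resolution order for provider-prefixed models."""
    openai_key = os.getenv("OPENAI_API_KEY", DUMMY_API_KEY)
    api_key = config_key  # 1. an explicitly configured key wins ...
    if api_key == openai_key:
        # 2. ... otherwise fall back to the provider env var
        #    (e.g. GROQ_API_KEY, GEMINI_API_KEY) ...
        # 3. ... and finally to the dummy placeholder.
        api_key = os.getenv(provider_env_var, DUMMY_API_KEY)
    return api_key

# e.g. for a "groq/..." model:
key = resolve_api_key(os.getenv("OPENAI_API_KEY", DUMMY_API_KEY), "GROQ_API_KEY")
```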
langroid/parsing/document_parser.py CHANGED
@@ -9,7 +9,9 @@ from enum import Enum
from io import BytesIO
from itertools import accumulate
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, Generator, List, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union
+
+from dotenv import load_dotenv

from langroid.exceptions import LangroidImportError
from langroid.utils.object_registry import ObjectRegistry
@@ -163,6 +165,8 @@ class DocumentParser(Parser):
            return UnstructuredPDFParser(source, config)
        elif config.pdf.library == "pdf2image":
            return ImagePdfParser(source, config)
+        elif config.pdf.library == "gemini":
+            return GeminiPdfParser(source, config)
        else:
            raise ValueError(
                f"Unsupported PDF library specified: {config.pdf.library}"
@@ -415,13 +419,15 @@ class DocumentParser(Parser):
        # that it needs to be combined with the next chunk.
        while len(split) > self.config.chunk_size:
            # pretty formatting of pages (e.g. 1-3, 4, 5-7)
-            pg = "-".join([pages[0], pages[-1]])
+            p_0 = int(pages[0])
+            p_n = int(pages[-1])
+            page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
            text = self.tokenizer.decode(split[: self.config.chunk_size])
            docs.append(
                Document(
                    content=text,
                    metadata=DocMetaData(
-                        source=f"{self.source} pages {pg}",
+                        source=f"{self.source} {page_str}",
                        is_chunk=True,
                        id=common_id,
                    ),
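The replacement above also fixes the degenerate single-page label. A quick worked example of the new formatting logic, taken directly from the diff:

```python
# Single-page chunk: previously rendered as "pages 4-4".
pages = ["4", "4"]
p_0, p_n = int(pages[0]), int(pages[-1])
page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
assert page_str == "page 4"

# Multi-page chunk: unchanged behavior.
pages = ["1", "2", "3"]
p_0, p_n = int(pages[0]), int(pages[-1])
page_str = f"pages {p_0}-{p_n}" if p_0 != p_n else f"page {p_0}"
assert page_str == "pages 1-3"
```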
@@ -952,3 +958,409 @@ class MarkitdownPPTXParser(DocumentParser):
            content=self.fix_text(md_content),
            metadata=DocMetaData(source=self.source),
        )
+
+
+class GeminiPdfParser(DocumentParser):
+    """
+    This class converts PDFs to Markdown using Gemini multimodal LLMs.
+
+    It extracts pages, converts them with the LLM (replacing images with
+    detailed descriptions), and outputs Markdown page by page. The
+    conversion follows `GEMINI_SYSTEM_INSTRUCTION`. It employs
+    multiprocessing for speed, async requests with rate limiting, and
+    handles errors.
+
+    It supports page-by-page splitting or chunking multiple pages into
+    one, respecting page boundaries and a `max_token_limit`.
+    """
+
+    DEFAULT_MAX_TOKENS = 7000
+    OUTPUT_DIR = Path(".gemini_pdfparser")  # Fixed output directory
+
+    GEMINI_SYSTEM_INSTRUCTION = """
+    ### **Convert PDF to Markdown**
+    1. **Text:**
+        * Preserve structure, formatting (**bold**, *italic*), lists, and indentation.
+        * **Remove running heads (page numbers, headers/footers).**
+        * Keep section and chapter titles; discard repeated page headers.
+    2. **Images:** Replace with **detailed, creative descriptions**
+       optimized for clarity and understanding.
+    3. **Tables:** Convert to Markdown tables with proper structure.
+    4. **Math:** Use LaTeX (`$...$` inline, `$$...$$` block).
+    5. **Code:** Wrap in fenced blocks without specifying a language:
+
+        ```
+        code
+        ```
+    6. **Clean Output:**
+        * No system messages, metadata, or artifacts or ```markdown``` identifier.
+        * Do **not** include introductory or explanatory messages
+          like "Here is your output."
+        * Ensure formatting is **consistent and structured**
+          for feeding into a markdown parser.
+    """.strip()
+
+    def __init__(self, source: Union[str, bytes], config: ParsingConfig):
+        super().__init__(source, config)
+        if not config.pdf.gemini_config:
+            raise ValueError(
+                "GeminiPdfParser requires a Gemini-based config in pdf parsing config"
+            )
+        self.model_name = config.pdf.gemini_config.model_name
+
+        # Ensure output directory exists
+        self.OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+        prefix = (
+            Path(source).stem + "_"
+            if isinstance(source, str) and Path(source).exists()
+            else "output_"
+        )
+        temp_file = tempfile.NamedTemporaryFile(
+            suffix=".md",
+            prefix=prefix,
+            dir=str(self.OUTPUT_DIR),
+            delete=False,
+        )
+        temp_file.close()
+        self.output_filename = Path(temp_file.name)
+
+        self.max_tokens = config.pdf.gemini_config.max_tokens or self.DEFAULT_MAX_TOKENS
+
+        """
+        If True, each PDF page is processed as a separate chunk,
+        resulting in one LLM request per page. If False, pages are
+        grouped into chunks based on `max_token_limit` before being sent
+        to the LLM.
+        """
+        self.split_on_page = config.pdf.gemini_config.split_on_page or False
+
+        # Rate limiting parameters
+        import asyncio
+
+        self.requests_per_minute = config.pdf.gemini_config.requests_per_minute or 5
+
+        """
+        A semaphore to control the number of concurrent requests to the LLM,
+        preventing rate limit errors. A semaphore slot is acquired before
+        making an LLM request and released after the request is complete.
+        """
+        self.semaphore = asyncio.Semaphore(self.requests_per_minute)
+        self.retry_delay = 5  # seconds, for exponential backoff
+        self.max_retries = 3
+
+    def _extract_page(self, page_num: int) -> Dict[str, Any]:
+        """
+        Extracts a single page and estimates token count.
+        Opens the PDF from self.doc_bytes (a BytesIO object).
+        """
+        import fitz
+
+        try:
+            # Always open the document from in-memory bytes.
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            new_pdf = fitz.open()
+            new_pdf.insert_pdf(doc, from_page=page_num, to_page=page_num)
+            pdf_bytes = new_pdf.write()
+            text = doc[page_num].get_text("text")
+            token_count = len(text) // 4 if text else len(pdf_bytes) // 4
+
+            return {
+                "page_numbers": page_num + 1,
+                "pdf_bytes": pdf_bytes,
+                "token_count": token_count,
+            }
+        except Exception as e:
+            raise ValueError(f"Error processing PDF document: {e}") from e
+
+    def _extract_pdf_pages_parallel(
+        self, num_workers: Optional[int] = None
+    ) -> List[Dict[str, Any]]:
+        """Parallel PDF page extraction using self.doc_bytes."""
+        from multiprocessing import Pool, cpu_count
+
+        import fitz
+        from tqdm import tqdm
+
+        try:
+            doc = fitz.open(stream=self.doc_bytes.getvalue(), filetype="pdf")
+            total_pages = len(doc)
+        except Exception as e:
+            raise ValueError(f"Error opening PDF document: {e}") from e
+
+        num_workers = num_workers or cpu_count()
+        with Pool(num_workers) as pool:
+            with tqdm(total=total_pages, desc="Extracting pages", unit="page") as pbar:
+                results = []
+                for result in pool.imap(self._extract_page, range(total_pages)):
+                    results.append(result)
+                    pbar.update(1)
+
+        return results
+
+    def _group_pages_by_token_limit(
+        self, pages: List[Dict[str, Any]], max_tokens: int = DEFAULT_MAX_TOKENS
+    ) -> List[List[Dict[str, Any]]]:
+        """Groups pages into chunks where each chunk is approximately `max_tokens`."""
+        chunks: List[List[Dict[str, Any]]] = []
+        current_chunk: List[Dict[str, Any]] = []
+        current_tokens = 0
+
+        for page in pages:
+            if current_tokens + page["token_count"] > max_tokens and current_chunk:
+                chunks.append(current_chunk)
+                current_chunk = []
+                current_tokens = 0
+
+            current_chunk.append(page)
+            current_tokens += page["token_count"]
+
+        if current_chunk:  # Add remaining pages
+            chunks.append(current_chunk)
+
+        return chunks
+
+    def _merge_pages_into_pdf_with_metadata(
+        self, page_group: List[Dict[str, Any]]
+    ) -> Dict[str, Any]:
+        """
+        Merges grouped pages into a single binary chunk so that
+        it does not exceed max token limit
+        """
+        import fitz
+
+        merged_pdf = fitz.open()
+        page_numbers = []
+
+        for page in page_group:
+            temp_pdf = fitz.open("pdf", page["pdf_bytes"])
+            merged_pdf.insert_pdf(temp_pdf)
+            page_numbers.append(page["page_numbers"])
+
+        return {
+            "pdf_bytes": merged_pdf.write(),  # Binary PDF data
+            "page_numbers": page_numbers,  # List of page numbers in this chunk
+        }
+
+    def _prepare_pdf_chunks_for_gemini(
+        self,
+        num_workers: Optional[int] = None,
+        max_tokens: int = DEFAULT_MAX_TOKENS,
+        split_on_page: bool = False,
+    ) -> List[Dict[str, Any]]:
+        """
+        Extracts, groups, and merges PDF pages into chunks with embedded page markers.
+        """
+        from multiprocessing import Pool
+
+        pages = self._extract_pdf_pages_parallel(num_workers)
+
+        if split_on_page:
+            # Each page becomes its own chunk
+            return pages
+        else:
+            # Group pages based on token limit
+            chunks = self._group_pages_by_token_limit(pages, max_tokens)
+            with Pool(num_workers) as pool:
+                pdf_chunks = pool.map(self._merge_pages_into_pdf_with_metadata, chunks)
+            return pdf_chunks
+
+    async def _send_chunk_to_gemini(
+        self, chunk: Dict[str, Any], gemini_api_key: str
+    ) -> str:
+        """
+        Sends a PDF chunk to the Gemini API and returns the response text.
+        Uses retries with exponential backoff to handle transient failures.
+        """
+        import asyncio
+        import logging
+
+        from google import genai
+        from google.genai import types
+
+        async with self.semaphore:  # Limit concurrent API requests
+            for attempt in range(self.max_retries):
+                try:
+                    client = genai.Client(api_key=gemini_api_key)
+
+                    # Send the request with PDF content and system instructions
+                    response = await client.aio.models.generate_content(
+                        model=self.model_name,
+                        contents=[
+                            types.Part.from_bytes(
+                                data=chunk["pdf_bytes"], mime_type="application/pdf"
+                            ),
+                            self.GEMINI_SYSTEM_INSTRUCTION,
+                        ],
+                    )
+
+                    # Return extracted text if available
+                    return str(response.text) if response.text else ""
+
+                except Exception as e:
+                    # Log error with page numbers for debugging
+                    logging.error(
+                        "Attempt %d failed for pages %s: %s",
+                        attempt + 1,
+                        chunk.get("page_numbers", "Unknown"),
+                        e,
+                    )
+
+                    if attempt < self.max_retries - 1:
+                        # Apply exponential backoff before retrying
+                        delay = self.retry_delay * (2**attempt)
+                        logging.info("Retrying in %s sec...", delay)
+                        await asyncio.sleep(delay)
+                    else:
+                        # Log failure after max retries
+                        logging.error(
+                            "Max retries reached for pages %s",
+                            chunk.get("page_numbers", "Unknown"),
+                        )
+                        break
+
+        return ""  # Return empty string if all retries fail
+
+    async def process_chunks(
+        self, chunks: List[Dict[str, Any]], api_key: str
+    ) -> List[str]:
+        """
+        Processes PDF chunks by sending them to the Gemini API and
+        collecting the results.
+
+        Args:
+            chunks: A list of dictionaries, where each dictionary represents
+                a PDF chunk and contains the PDF data and page numbers.
+            api_key: The Gemini API key.
+        """
+        # To show nice progress bar
+        from tqdm.asyncio import tqdm_asyncio
+
+        # Create a list of asynchronous tasks to send each chunk to Gemini.
+        # Chunk in this case might be single page or group of pages returned
+        # by prepare_pdf_chunks function
+        tasks = [self._send_chunk_to_gemini(chunk, api_key) for chunk in chunks]
+
+        # Gather the results from all tasks, allowing exceptions to be returned.
+        # tqdm_asyncio is wrapper around asyncio.gather
+        gathered_results = await tqdm_asyncio.gather(
+            *tasks, desc="Processing chunks(pages)", unit="chunk"
+        )
+        results = []
+        for i, result in enumerate(gathered_results):
+            chunk = chunks[i]  # Get the corresponding chunk.
+
+            if isinstance(result, Exception):
+                # Handle exceptions that occurred during chunk processing.
+                logging.error(
+                    "Failed to process chunk %s: %s",
+                    chunk.get("page_numbers", "Unknown"),
+                    result,
+                )
+                results.append(
+                    "<!----Error: Could not process chunk %s---->"
+                    % chunk.get("page_numbers", "Unknown")
+                )
+            else:
+                # Process successful results and append page/chunk markers.
+                markdown = str(result)
+                if self.split_on_page:
+                    results.append(
+                        markdown + f"<!----Page-{chunk['page_numbers']}---->"
+                    )
+                else:
+                    results.append(
+                        markdown + f"<!----Chunk-{chunk['page_numbers']}---->"
+                    )
+
+        return results  # Return the list of results.
+
+    def iterate_pages(self) -> Generator[Tuple[int, Any], None, None]:
+        """
+        Iterates over the document pages, extracting content using the
+        Gemini API, saves them to a markdown file, and yields page numbers
+        along with their corresponding content.
+
+        Yields:
+            A generator of tuples, where each tuple contains the page number
+            (int) and the page content (Any).
+        """
+        import asyncio
+        import os
+
+        # Load environment variables (e.g., GEMINI_API_KEY) from a .env file.
+        load_dotenv()
+        gemini_api_key = os.getenv("GEMINI_API_KEY")
+        if not gemini_api_key:
+            raise ValueError("GEMINI_API_KEY not found in environment variables.")
+
+        try:
+            # This involves extracting pages, grouping them according to the
+            # `max_tokens` limit (if `split_on_page` is False), and
+            # merging pages into larger PDF chunks. The result
+            # is a list of dictionaries, where each dictionary contains the
+            # PDF bytes and the associated page numbers or single page if
+            # `split_on_page` is true
+
+            pdf_chunks = self._prepare_pdf_chunks_for_gemini(
+                num_workers=8,
+                max_tokens=self.max_tokens,
+                split_on_page=self.split_on_page,
+            )
+
+            # We asynchronously process each chunk, sending it
+            # to Gemini and retrieving the Markdown output. It handles rate
+            # limiting and retries.
+            markdown_results = asyncio.run(
+                self.process_chunks(pdf_chunks, gemini_api_key)
+            )
+
+            # This file serves as an intermediate storage location for the
+            # complete Markdown output.
+            with open(self.output_filename, "w", encoding="utf-8") as outfile:
+                outfile.write("\n\n".join(markdown_results))
+
+            # Read the full Markdown content from the temporary file.
+            with open(self.output_filename, "r", encoding="utf-8") as infile:
+                full_markdown = infile.read()
+
+            # The splitting is based on the `split_on_page` setting. If True,
+            # the Markdown is split using the "Page-" marker. Otherwise, it's
+            # split using the "Chunk-" marker.
+            if self.split_on_page:
+                pages = full_markdown.split("<!----Page-")
+            else:
+                pages = full_markdown.split("<!----Chunk-")
+
+            # Remove the first element if it's empty (due to the split).
+            if pages and pages[0] == "":
+                pages = pages[1:]
+
+            # Iterate over the pages or chunks and yield their content.
+            for i, page in enumerate(pages):
+                # Check for errors during processing.
+                if "<!----Error:" in page:
+                    page_content = page
+                    logging.warning(f"Page {i}: Error processing chunk.")
+                else:
+                    # Extract the actual page content by removing the marker.
+                    page_content = (
+                        page.split("---->", 1)[1]
+                        if len(page.split("---->", 1)) > 1
+                        else page
+                    )
+
+                # Yield the page number and content.
+                yield i, page_content
+
+        except Exception as e:
+            raise ValueError(f"Error processing document: {e}") from e
+
+    def get_document_from_page(self, page: str) -> Document:
+        """
+        Get a Document object from a given markdown page.
+        """
+        return Document(
+            content=page,
+            metadata=DocMetaData(source=self.source),
+        )
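A plausible end-to-end use of the new parser, assuming the factory method and chunking helper on `DocumentParser` keep their existing names (`create`, `get_doc_chunks`) and that `GEMINI_API_KEY` is available in the environment or a `.env` file; treat this as a sketch rather than documented API:

```python
from langroid.parsing.parser import GeminiConfig, ParsingConfig, PdfParsingConfig
from langroid.parsing.document_parser import DocumentParser

config = ParsingConfig(
    pdf=PdfParsingConfig(
        library="gemini",  # routes to GeminiPdfParser in the factory above
        gemini_config=GeminiConfig(
            model_name="gemini-2.0-flash",
            split_on_page=True,       # one LLM request per page
            requests_per_minute=5,    # concurrency cap via the semaphore
        ),
    )
)
parser = DocumentParser.create("paper.pdf", config)
docs = parser.get_doc_chunks()  # Markdown-converted, chunked Documents
```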
langroid/parsing/parser.py CHANGED
@@ -1,13 +1,13 @@
import logging
import re
from enum import Enum
-from typing import Dict, List, Literal
+from typing import Any, Dict, List, Literal, Optional

import tiktoken

from langroid.mytypes import Document
from langroid.parsing.para_sentence_split import create_chunks, remove_extra_whitespace
-from langroid.pydantic_v1 import BaseSettings
+from langroid.pydantic_v1 import BaseSettings, root_validator
from langroid.utils.object_registry import ObjectRegistry

logger = logging.getLogger(__name__)
@@ -20,7 +20,26 @@ class Splitter(str, Enum):
    SIMPLE = "simple"


-class PdfParsingConfig(BaseSettings):
+class BaseParsingConfig(BaseSettings):
+    """Base class for document parsing configurations."""
+
+    library: str
+
+    class Config:
+        extra = "ignore"  # Ignore unknown settings
+
+
+class GeminiConfig(BaseSettings):
+    """Configuration for Gemini-based parsing."""
+
+    model_name: str = "gemini-2.0-flash"  # Default model
+    max_tokens: Optional[int] = None
+    split_on_page: Optional[bool] = True
+    requests_per_minute: Optional[int] = 5
+
+
+class PdfParsingConfig(BaseParsingConfig):
+
    library: Literal[
        "fitz",
        "pymupdf4llm",
@@ -29,7 +48,18 @@ class PdfParsingConfig(BaseSettings):
        "unstructured",
        "pdf2image",
        "markitdown",
+        "gemini",
    ] = "pymupdf4llm"
+    gemini_config: Optional[GeminiConfig] = None
+
+    @root_validator(pre=True)
+    def enable_gemini_config(cls, values: Dict[str, Any]) -> Dict[str, Any]:
+        """Ensure GeminiConfig is set only when library is 'gemini'."""
+        if values.get("library") == "gemini":
+            values["gemini_config"] = values.get("gemini_config") or GeminiConfig()
+        else:
+            values["gemini_config"] = None
+        return values


class DocxParsingConfig(BaseSettings):
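Because the validator runs with `pre=True`, callers never need to construct `GeminiConfig` by hand for the common case; it is created or cleared from the `library` value alone. A quick check of the intended behavior (a sketch using the classes defined above):

```python
from langroid.parsing.parser import PdfParsingConfig

cfg = PdfParsingConfig(library="gemini")
assert cfg.gemini_config is not None                      # auto-created
assert cfg.gemini_config.model_name == "gemini-2.0-flash" # default model

cfg = PdfParsingConfig(library="fitz")
assert cfg.gemini_config is None        # cleared for non-gemini libraries
```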
langroid/utils/system.py CHANGED
@@ -14,7 +14,12 @@ from typing import Any, Literal

logger = logging.getLogger(__name__)

-DELETION_ALLOWED_PATHS = [".qdrant", ".chroma", ".lancedb", ".weaviate"]
+DELETION_ALLOWED_PATHS = [
+    ".qdrant",
+    ".chroma",
+    ".lancedb",
+    ".weaviate",
+]


def pydantic_major_version() -> int:
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: langroid
-Version: 0.42.9
+Version: 0.43.0
Summary: Harness LLMs with Multi-Agent Programming
Author-email: Prasad Chalasani <pchalasani@gmail.com>
License: MIT
@@ -86,6 +86,8 @@ Requires-Dist: weaviate-client>=4.9.6; extra == 'all'
Provides-Extra: arango
Requires-Dist: arango-datasets<2.0.0,>=1.2.2; extra == 'arango'
Requires-Dist: python-arango<9.0.0,>=8.1.2; extra == 'arango'
+Provides-Extra: asyncio
+Requires-Dist: asyncio>=3.4.3; extra == 'asyncio'
Provides-Extra: chainlit
Requires-Dist: chainlit<3.0.0,>=2.0.1; extra == 'chainlit'
Requires-Dist: python-socketio<6.0.0,>=5.11.0; extra == 'chainlit'
{langroid-0.42.9.dist-info → langroid-0.43.0.dist-info}/RECORD CHANGED
@@ -3,12 +3,12 @@ langroid/exceptions.py,sha256=OPjece_8cwg94DLPcOGA1ddzy5bGh65pxzcHMnssTz8,2995
langroid/mytypes.py,sha256=FXSH62MUCeMCJP-66RVmbNaHCDLMxllEShZ-xEeTn9A,2833
langroid/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
langroid/agent/__init__.py,sha256=ll0Cubd2DZ-fsCMl7e10hf9ZjFGKzphfBco396IKITY,786
-langroid/agent/base.py,sha256=k5kJGTpo2CcnVl2cEM0luBQ__a7C9put1aH5-wd3hQ8,78212
+langroid/agent/base.py,sha256=0szJ5ZxNSmobFO5805ur2cqKfD6vUP4ooN76Z5qAeyw,78677
langroid/agent/batch.py,sha256=vi1r5i1-vN80WfqHDSwjEym_KfGsqPGUtwktmiK1nuk,20635
-langroid/agent/chat_agent.py,sha256=hUu13nYhhr6ph01Sln8y_WuOIpcd38icN6p22h6IiDY,84211
+langroid/agent/chat_agent.py,sha256=yuuEWVFLIN71XUpxdbhwZxEKAbOWG7zAV3ofYX4lCWg,84443
langroid/agent/chat_document.py,sha256=xzMtrPbaW-Y-BnF7kuhr2dorsD-D5rMWzfOqJ8HAoo8,17885
langroid/agent/openai_assistant.py,sha256=JkAcs02bIrgPNVvUWVR06VCthc5-ulla2QMBzux_q6o,34340
-langroid/agent/task.py,sha256=Mi1QZgbRWvKZKEqkh5157LdUFjPKq7EF77yEeqU7fGE,90468
+langroid/agent/task.py,sha256=HB6N-Jn80HFqCf0ZYOC1v3Bn3oO7NLjShHQJJFwW0q4,90557
langroid/agent/tool_message.py,sha256=BhjP-_TfQ2tgxuY4Yo_JHLOwwt0mJ4BwjPnREvEY4vk,14744
langroid/agent/xml_tool_message.py,sha256=6SshYZJKIfi4mkE-gIoSwjkEYekQ8GwcSiCv7a5uO9E,15054
langroid/agent/callbacks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -72,7 +72,7 @@ langroid/language_models/base.py,sha256=is4l3x858tdPHbrJU2jxJXe2j9PCGb9kk_c5nyfS
langroid/language_models/config.py,sha256=9Q8wk5a7RQr8LGMT_0WkpjY8S4ywK06SalVRjXlfCiI,378
langroid/language_models/mock_lm.py,sha256=5BgHKDVRWFbUwDT_PFgTZXz9-k8wJSA2e3PZmyDgQ1k,4022
langroid/language_models/model_info.py,sha256=_EidEMIgAMx0RuELAf5Ans0yiE1QllybZALw5o-1HJg,12265
-langroid/language_models/openai_gpt.py,sha256=yuxbOTZp2TuhTdy88NmdhCvqcCSs1ls5j9Cn81yQQ6M,77402
+langroid/language_models/openai_gpt.py,sha256=lOQcExZO5Tja35Xi4F2HcG8pE-2LEnGrHwLTXLOOagk,77367
langroid/language_models/utils.py,sha256=L4_CbihDMTGcsg0TOG1Yd5JFEto46--h7CX_14m89sQ,5016
langroid/language_models/prompt_formatter/__init__.py,sha256=2-5cdE24XoFDhifOLl8yiscohil1ogbP1ECkYdBlBsk,372
langroid/language_models/prompt_formatter/base.py,sha256=eDS1sgRNZVnoajwV_ZIha6cba5Dt8xjgzdRbPITwx3Q,1221
@@ -81,10 +81,10 @@ langroid/language_models/prompt_formatter/llama2_formatter.py,sha256=YdcO88qyBeu
langroid/parsing/__init__.py,sha256=2oUWJJAxIavq9Wtw5RGlkXLq3GF3zgXeVLLW4j7yeb8,1138
langroid/parsing/agent_chats.py,sha256=sbZRV9ujdM5QXvvuHVjIi2ysYSYlap-uqfMMUKulrW0,1068
langroid/parsing/code_parser.py,sha256=5ze0MBytrGGkU69pA_bJDjRm6QZz_QYfPcIwkagUa7U,3796
-langroid/parsing/document_parser.py,sha256=NKmN_HjwNdfUjTbXhpyK_Wjay3QYEA26ZnewmbO6moA,33632
+langroid/parsing/document_parser.py,sha256=tov34uYB_2ecq7-G7P7CWSOv5alcfwkrrwfsnCCVdIk,49714
langroid/parsing/para_sentence_split.py,sha256=AJBzZojP3zpB-_IMiiHismhqcvkrVBQ3ZINoQyx_bE4,2000
langroid/parsing/parse_json.py,sha256=aADo38bAHQhC8on4aWZZzVzSDy-dK35vRLZsFI2ewh8,4756
-langroid/parsing/parser.py,sha256=moJKI5Cn_Pxd7xbNrY220dqQu-0FeEWUI7ogeq63Kec,12842
+langroid/parsing/parser.py,sha256=8MDoKQO60RGXod9E5jMj-k90QNhdim4blVJB9L0rrSA,13789
langroid/parsing/pdf_utils.py,sha256=rmNJ9UzuBgXTAYwj1TtRJcD8h53x7cizhgyYHKO88I4,1513
langroid/parsing/repo_loader.py,sha256=3GjvPJS6Vf5L6gV2zOU8s-Tf1oq_fZm-IB_RL_7CTsY,29373
langroid/parsing/routing.py,sha256=-FcnlqldzL4ZoxuDwXjQPNHgBe9F9-F4R6q7b_z9CvI,1232
@@ -110,7 +110,7 @@ langroid/utils/logging.py,sha256=mwxHimq1wtVQ64PvDyfJJ7Upj-rjHLNHgx8EC2wClvo,402
langroid/utils/object_registry.py,sha256=iPz9GHzvmCeVoidB3JdAMEKcxJEqTdUr0otQEexDZ5s,2100
langroid/utils/pandas_utils.py,sha256=UctS986Jtl_MvU5rA7-GfrjEHXP7MNu8ePhepv0bTn0,755
langroid/utils/pydantic_utils.py,sha256=R7Ps8VP56-eSo-LYHWllFo-SJ2zDmdItuuYpUq2gGJ8,20854
-langroid/utils/system.py,sha256=cJqDgOf9mM82l1GyUeQQdEYAwepYXQwtpJU8Xrz0-MA,8453
+langroid/utils/system.py,sha256=q3QJtTSapIwNe8MMhGEM03wgxPLmZiD47_sF1pKx53I,8472
langroid/utils/types.py,sha256=-BvyIf_LmAJ5jR9NC7S4CSVNEr3XayAaxJ5o0TiIej0,2992
langroid/utils/algorithms/__init__.py,sha256=WylYoZymA0fnzpB4vrsH_0n7WsoLhmuZq8qxsOCjUpM,41
langroid/utils/algorithms/graph.py,sha256=JbdpPnUOhw4-D6O7ou101JLA3xPCD0Lr3qaPoFCaRfo,2866
@@ -127,7 +127,7 @@ langroid/vector_store/pineconedb.py,sha256=otxXZNaBKb9f_H75HTaU3lMHiaR2NUp5MqwLZ
langroid/vector_store/postgres.py,sha256=DQHd6dt-OcV_QVNm-ymn28rlTfhI6hqgcpLTPCsm0jI,15990
langroid/vector_store/qdrantdb.py,sha256=v7TAsIoj_vxeKDYS9tpwJLBZA8fuTweTYxHo0X_uawM,17949
langroid/vector_store/weaviatedb.py,sha256=tjlqEtkwrhykelt-nbr2WIuHWJBuSAGjZuG6gsAMBsc,11753
-langroid-0.42.9.dist-info/METADATA,sha256=Uzl-1rTMbTbk-xKzqgJq4gNiV-hsWZyuFNHhsTs4UEQ,61699
-langroid-0.42.9.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-langroid-0.42.9.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
-langroid-0.42.9.dist-info/RECORD,,
+langroid-0.43.0.dist-info/METADATA,sha256=3BipLtBKwh-Ob9F-PRnmRPJIYPGgAdm_xzP57fJEi6E,61773
+langroid-0.43.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+langroid-0.43.0.dist-info/licenses/LICENSE,sha256=EgVbvA6VSYgUlvC3RvPKehSg7MFaxWDsFuzLOsPPfJg,1065
+langroid-0.43.0.dist-info/RECORD,,