pembot 0.0.3-py2.py3-none-any.whl → 0.0.5-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pembot might be problematic.
- pembot/.git/COMMIT_EDITMSG +1 -1
- pembot/.git/index +0 -0
- pembot/.git/logs/HEAD +1 -0
- pembot/.git/logs/refs/heads/main +1 -0
- pembot/.git/logs/refs/remotes/origin/main +1 -0
- pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
- pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
- pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
- pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
- pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
- pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
- pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
- pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
- pembot/.git/refs/heads/main +1 -1
- pembot/.git/refs/remotes/origin/main +1 -1
- pembot/AnyToText/convertor.py +250 -146
- pembot/__init__.py +1 -1
- pembot/config/config.yaml +1 -1
- pembot/main.py +26 -8
- pembot/pdf2markdown/extract.py +266 -309
- pembot/query.py +15 -9
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/METADATA +1 -1
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/RECORD +25 -17
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/WHEEL +0 -0
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/licenses/LICENSE +0 -0
pembot/pdf2markdown/extract.py
CHANGED
@@ -1,11 +1,10 @@
-import fitz
+import fitz
 import pdfplumber
 import re
 import yaml
 # import pytesseract
 import numpy as np
-from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
-# VisionEncoderDecoderModel, ViTImageProcessor,
+from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
 from typing import Literal, final
 import torch
 from PIL import Image
@@ -16,28 +15,26 @@ import warnings
 from pathlib import Path
 from abc import ABC, abstractmethod
 import argparse
-from PIL import Image
 import io
-from
-
-
-
-model = AutoModelForImageTextToText.from_pretrained(
-    model_path,
-    torch_dtype="auto",
-    device_map="auto",
-    attn_implementation="flash_attention_2"
-)
-model.eval()
+from google import genai
+from google.genai import types
+import mimetypes
 
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-processor = AutoProcessor.from_pretrained(model_path)
 
 
 warnings.filterwarnings("ignore")
 
-
-
+config= {}
+try:
+    with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+except FileNotFoundError:
+    config= {
+        'OUTPUT_DIR': '.',
+        'PAGE_DELIMITER': '____NEXT PAGE____'
+    }
+except Exception as e:
+    print("unhandled while opening default config in pdf2markdown: ", e)
 
 
 class PDFExtractor(ABC):
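
One behavioral note on the new module-level loader: the relative "config/config.yaml" path resolves against the process's current working directory, not the installed package, so most installs will take the FileNotFoundError branch. A minimal standalone sketch of the same fallback pattern (path and default keys as in the diff):

from pathlib import Path
import yaml

config = {}
try:
    # Relative path: resolved against the current working directory,
    # not the package's install location.
    with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    # Defaults shipped in the diff.
    config = {"OUTPUT_DIR": ".", "PAGE_DELIMITER": "____NEXT PAGE____"}

print(config.get("OUTPUT_DIR", "."), config.get("PAGE_DELIMITER", ""))
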
@@ -74,9 +71,31 @@ class MarkdownPDFExtractor(PDFExtractor):
 
     BULLET_POINTS = "•◦▪▫●○"
 
-    def __init__(self, pdf_path, output_path= config
+    def __init__(self, pdf_path, output_path= config.get("OUTPUT_DIR", '.'), page_delimiter= config.get("PAGE_DELIMITER", ''), model_name: str | None= None):
         super().__init__(pdf_path)
 
+        if model_name is None:
+            self.MODEL_NAME= "gemini-2.5-flash"
+        else:
+            self.MODEL_NAME= model_name
+
+        if "gemini" in self.MODEL_NAME:
+            self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
+        else:
+            model_path = "nanonets/Nanonets-OCR-s"
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_path,
+                torch_dtype="auto",
+                device_map="auto",
+                attn_implementation="flash_attention_2"
+            )
+            self.model.eval()
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            self.processor = AutoProcessor.from_pretrained(model_path)
+            self.setup_image_captioning()
+
+
+
         self.markdown_content= ""
         self.pdf_filename = Path(pdf_path).stem
         self.output_path= output_path
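
A hypothetical usage sketch of the new constructor (the PDF path and key value are placeholders; GEMINI_API_KEY is the variable the diff reads):

import os
from pembot.pdf2markdown.extract import MarkdownPDFExtractor

os.environ.setdefault("GEMINI_API_KEY", "<your-key>")  # read by the "gemini" branch

# Default model name is "gemini-2.5-flash", so OCR is routed through the Gemini API.
extractor = MarkdownPDFExtractor("sample.pdf")

# Any name without "gemini" selects the local branch, which downloads
# nanonets/Nanonets-OCR-s and requires flash_attention_2 support.
local = MarkdownPDFExtractor("sample.pdf", model_name="nanonets/Nanonets-OCR-s")
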
@@ -87,26 +106,26 @@ class MarkdownPDFExtractor(PDFExtractor):
         self.page_delimiter= page_delimiter
         Path(output_path).mkdir(parents=True, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+    def setup_image_captioning(self):
+        """Set up the image captioning model."""
+        try:
+            self.model = VisionEncoderDecoderModel.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.feature_extractor = ViTImageProcessor.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.model.to(self.device)
+            self.logger.info("Image captioning model set up successfully.")
+        except Exception as e:
+            self.logger.error(f"Error setting up image captioning model: {e}")
+            self.logger.exception(traceback.format_exc())
 
     def extract(self):
         try:
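
setup_image_captioning() wires up the nlpconnect/vit-gpt2-image-captioning stack (and, per the diff, reassigns the self.model and self.tokenizer set earlier in __init__'s non-Gemini branch). A minimal standalone sketch of the same captioning flow, with "photo.png" as a placeholder input:

import torch
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Same checkpoint name as in the diff.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

image = Image.open("photo.png").convert("RGB")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
generated_ids = model.generate(pixel_values, max_length=30)
caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(caption.strip())
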
@@ -123,282 +142,197 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return "", []
 
-    def extract_markdown_by_blocks(self):
-        """Main method to extract markdown from PDF."""
-        try:
-            doc = fitz.open(self.pdf_path)
-            markdown_content = ""
-            markdown_pages = []
-            tables = self.extract_tables()
-            table_index = 0
-            list_counter = 0
-            in_code_block = False
-            code_block_content = ""
-            code_block_lang = None
-            prev_line = ""
-
-            for page_num, page in enumerate(doc):
-                self.logger.info(f"Processing page {page_num + 1}")
-                page_content = ""
-                blocks = page.get_text("dict")["blocks"]
-                page_height = page.rect.height
-                links = self.extract_links(page)
-
-                if len(page.get_images()) > 0 and len(page.get_images()) <= 128:
-                    for block in blocks:
-                        if block["type"] == 0:  # Text
-                            page_content += self.process_text_block(
-                                block,
-                                page_height,
-                                links,
-                                list_counter,
-                                in_code_block,
-                                code_block_content,
-                                code_block_lang,
-                                prev_line,
-                            )
-                        elif block["type"] == 1:  # Image
-                            page_content += self.process_image_block(page, block)
-
-                else:
-                    for block in blocks:
-                        if block["type"] == 0:  # Text
-                            page_content += self.process_text_block(
-                                block,
-                                page_height,
-                                links,
-                                list_counter,
-                                in_code_block,
-                                code_block_content,
-                                code_block_lang,
-                                prev_line,
-                            )
-
-                # Insert tables at their approximate positions
-                while (
-                    table_index < len(tables)
-                    and tables[table_index]["page"] == page.number
-                ):
-                    page_content += (
-                        "\n\n"
-                        + self.table_to_markdown(tables[table_index]["content"])
-                        + "\n\n"
-                    )
-                    table_index += 1
-
-                markdown_pages.append(self.post_process_markdown(page_content))
-                markdown_content += page_content + config["PAGE_DELIMITER"]
 
-
-            return markdown_content, markdown_pages
-        except Exception as e:
-            self.logger.error(f"Error extracting markdown: {e}")
-            self.logger.exception(traceback.format_exc())
-            return "", []
-
-
-    def ocr_page_with_nanonets_s(self, pil_image, model, processor, max_new_tokens: int | None = None):
+    def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
         prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
         if max_new_tokens is None:
             max_new_tokens= 4096
 
-
-
-
-
-
-
-
-
-
-
-
-
+        if 'gemini' in self.MODEL_NAME:
+
+            image_format = pil_image.format
+            dummy_filename = f"dummy.{image_format.lower()}"
+            mime_type, _ = mimetypes.guess_type(dummy_filename)
+            response= self.gclient.models.generate_content(
+                model= self.MODEL_NAME,
+                contents=[
+                    types.Part.from_bytes(
+                        data=img_bytes.getvalue(),
+                        mime_type= mime_type
+                    ),
+                    prompt
+                ]
+            )
+            # print("response :", response)
+            return response.text
+        else:
+            image = pil_image
+            messages = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt},
+                ]},
+            ]
+            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
+            inputs = inputs.to(self.model.device)
 
-
-
+            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+            generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
 
-
-
+            output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            return output_text[0]
 
 
 
     def extract_markdown(self):
-
-
-
-
+        """
+        Extracts all possible content from a PDF, prioritizing searchable text,
+        then OCR for embedded images, and finally full-page OCR for scanned pages.
+        Avoids redundant OCR where possible.
+
+        Returns:
+            tuple: A tuple containing:
+                - str: The concatenated markdown content of all pages.
+                - list: A list of strings, where each string is the comprehensive markdown
+                        for a corresponding page.
+        """
+        all_pages_markdown = []
+        full_document_markdown = []  # Changed to list of lines/blocks to handle insertions better
+
+        try:
+            doc = fitz.open(self.pdf_path)
+            self.logger.info(f"Opened PDF: {self.pdf_path}")
+
+            tables = self.extract_tables()
+            table_index = 0
+
+            # State variables for process_text_block that might need to persist across blocks
+            # Re-initialize for each new document, but allow state management within process_text_block for lines
+            list_counter = 0
+            in_code_block = False
+            code_block_content = ""
+            code_block_lang = None
+            prev_line = ""
+
+            for page_num, page in enumerate(doc):
+                current_page_markdown_blocks = []  # Collect markdown blocks for the current page
+                page_has_searchable_text = False
+                page_has_embedded_images = False
+
+                self.logger.info(f"\nProcessing page {page_num + 1}...")
+
+                blocks = page.get_text('dict')['blocks']
+                page_height = page.rect.height
+                links = self.extract_links(page)
+
+                # Phase 1: Process text blocks and embedded image blocks
+                for block_num, block in enumerate(blocks):
+                    if block['type'] == 0:  # Text block
+                        page_has_searchable_text = True
+                        processed_text = self.process_text_block(
+                            block,
+                            page_height,
+                            links,
+                            list_counter,
+                            in_code_block,
+                            code_block_content,
+                            code_block_lang,
+                            prev_line,
+                        )
+                        if processed_text.strip():
+                            current_page_markdown_blocks.append(processed_text)
+
+                    elif block['type'] == 1:  # Image block
+                        page_has_embedded_images = True
+                        self.logger.info(f"  Found embedded image block (Page {page_num+1}, Block {block_num+1})")
+                        img_data = block['image']
+
+                        try:
+                            image_bytes= io.BytesIO(img_data)
+                            pil_image = Image.open(image_bytes)
+                            ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
+                                pil_image, image_bytes, max_new_tokens=15000
+                            )
 
-
-
-
-
+                            if ocr_text_from_block_image.strip():
+                                self.logger.info("    OCR found text in embedded image block.")
+                                current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_block_image.strip()}\n\n")
+                            else:
+                                self.logger.info(f"    No OCR text from embedded image block. Adding generic placeholder.")
+                                current_page_markdown_blocks.append("\n\n\n\n")  # Consider saving images
+                        except Exception as e:
+                            self.logger.error(f"    Error processing embedded image block for OCR: {e}")
+                            current_page_markdown_blocks.append("\n\n\n\n")
+
+
+                # Insert tables at their approximate positions (after blocks are processed for the page)
+                # You might need more sophisticated logic here if table positions are granular
+                while (
+                    table_index < len(tables)
+                    and tables[table_index]["page"] == page.number
+                ):
+                    current_page_markdown_blocks.append(
+                        self.table_to_markdown(tables[table_index]["content"])
+                    )
+                    table_index += 1
 
-
-
-
-
+                # Phase 2: Full-page OCR if the page seems to be a scanned image or lacks sufficient searchable text
+                # We prioritize actual searchable text and embedded image OCR.
+                # Only if very little or no text was found, we resort to full-page OCR.
+                combined_current_page_text_length = len("".join(current_page_markdown_blocks).strip())
 
-
-
+                # A heuristic: if almost no searchable text and no significant OCR from embedded images
+                if not page_has_searchable_text and combined_current_page_text_length < 100:  # Threshold for considering "minimal text"
+                    self.logger.info(f"  Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
+                    try:
+                        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
+                        img_bytes = pix.tobytes("png")
+                        image_bytestream= io.BytesIO(img_bytes)
+                        pil_image = Image.open(image_bytestream)
 
-
-
-
-
-            tables = self.extract_tables()
-            table_index = 0
-            list_counter = 0
-            in_code_block = False
-            code_block_content = ""
-            code_block_lang = None
-            prev_line = ""
-
-            for page_num, page in enumerate(doc):
-                page_text_content = []
-                page_has_searchable_text = False
-
-                logging.info(f"\nProcessing page {page_num + 1}...")
-
-                # --- Phase 1: Extract text from direct text blocks and process embedded images ---
-                blocks = page.get_text('dict')['blocks']
-                text_blocks_content = []
-                image_block_text_content = []
-
-                page_height = page.rect.height
-                links = self.extract_links(page)
-
-                for block_num, block in enumerate(blocks):
-                    if block['type'] == 0:  # Text block
-                        page_has_searchable_text = True
-                        text_blocks_content.append(self.process_text_block(
-                            block,
-                            page_height,
-                            links,
-                            list_counter,
-                            in_code_block,
-                            code_block_content,
-                            code_block_lang,
-                            prev_line,
-                        ))
-
-                        # for line in block['lines']:
-                        #     for span in line['spans']:
-                        #         text_blocks_content.append(span['text'])
-                    elif block['type'] == 1:  # Image block
-                        logging.info(f"  Found embedded image block (Page {page_num+1}, Block {block_num+1})")
-                        img_data = block['image']
-                        img_ext = block['ext']
+                        ocr_text_from_page = self.ocr_page_with_nanonets_s(
+                            pil_image, image_bytestream, max_new_tokens=15000
+                        )
 
-
-
-
-
-
-
-
-
-
+                        if ocr_text_from_page.strip():
+                            self.logger.info(f"  Successfully extracted text via full-page OCR for page {page_num + 1}.")
+                            # If full-page OCR yields significant content and other methods didn't,
+                            # replace or augment. Here, we'll replace to avoid double-counting if it's primarily scanned.
+                            # You might choose to append if you want to combine (e.g., if there's header text + scanned body)
+                            if combined_current_page_text_length < 50:  # If almost nothing was found before, replace
+                                current_page_markdown_blocks = [ocr_text_from_page.strip()]
+                            else:  # Otherwise, augment (append)
+                                current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_page.strip()}\n\n")
                         else:
-
-                            # caption = self.caption_image(pil_image)
-                            # if caption:
-                            #     logging.info(f"  No OCR text, using caption for embedded image block.")
-                            #     image_block_text_content.append(caption)
-                            # else:
-                            #     logging.info(f"  No OCR text and no caption for embedded image block.")
-
-                            # a) captioning sucks, b) no need
-                            image_block_text_content.append("An Image")
-
-                        # except pytesseract.TesseractNotFoundError:
-                        #     logging.warning("  Tesseract-OCR not found. Skipping OCR for embedded image block.")
-                        #     caption = self.process_image_block(page, block)
-                        #     if caption: image_block_text_content.append(caption)
-
-                        #     image_block_text_content.append("An Image")
+                            self.logger.info(f"  Full-page OCR yielded no text for page {page_num+1}.")
                    except Exception as e:
-
-
-
-                            image_block_text_content.append("An Image")
-
-
-                # Insert tables at their approximate positions
-                while (
-                    table_index < len(tables)
-                    and tables[table_index]["page"] == page.number
-                ):
-                    page_text_content += (
-                        "\n\n"
-                        + self.table_to_markdown(tables[table_index]["content"])
-                        + "\n\n"
-                    )
-                    table_index += 1
-
-                # Add content from text blocks
-                if text_blocks_content:
-                    page_text_content.append(" ".join(text_blocks_content))
-
-                # Add content from image blocks
-                if image_block_text_content:
-                    page_text_content.append("\n".join(image_block_text_content))
-
-
-                # --- Phase 2: OCR the entire page IF it seems to be a scanned image ---
-                # We check if page_has_searchable_text is False or if the amount of text
-                # is very small, suggesting it might be mostly a scanned page.
-                # A threshold of 50 characters is arbitrary; adjust as needed.
-                current_text_len = len(" ".join(page_text_content).strip())
-
-                if not page_has_searchable_text or current_text_len < 50:
-                    logging.info(f"  Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
-                    try:
-                        # Render the page as a high-resolution image (e.g., 300 DPI)
-                        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
-                        img_bytes = pix.tobytes("png")
-
-                        pil_image = Image.open(io.BytesIO(img_bytes))
-
-                        # Perform OCR on the entire page image
-                        # ocr_text_from_page = pytesseract.image_to_string(pil_image)
-                        ocr_text_from_page= self.ocr_page_with_nanonets_s(pil_image, model, processor, max_new_tokens=15000)
-
-                        if ocr_text_from_page.strip():
-                            logging.info(f"  Successfully extracted text via full-page OCR.")
-                            page_text_content.append(ocr_text_from_page.strip())
-                        else:
-                            logging.info(f"  Full-page OCR yielded no text for page {page_num+1}.")
-
-                        # except pytesseract.TesseractNotFoundError:
-                        #     logging.warning("  Tesseract-OCR not found. Skipping full-page OCR for this page.")
-                        except Exception as e:
-                            logging.error(f"  Error during full-page OCR on page {page_num+1}: {e}")
-                else:
-                    logging.info(f"  Page {page_num + 1} has sufficient searchable text; skipping full-page OCR.")
-
-
-                # Concatenate all collected text for the current page
-                final_page_text = "\n".join(filter(None, page_text_content)).strip()  # Use filter(None, ...) to remove empty strings
-                all_pages_text.append(self.post_process_markdown(final_page_text))
-                the_text += final_page_text + self.page_delimiter
-
-                logging.info(f"  Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_text[:200]}...")
+                        self.logger.error(f"  Error during full-page OCR on page {page_num+1}: {e}")
+                else:
+                    self.logger.info(f"  Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")
 
-
-
+                # Join collected markdown blocks for the current page
+                final_page_markdown = "\n".join(filter(None, current_page_markdown_blocks)).strip()
+                all_pages_markdown.append(self.post_process_markdown(final_page_markdown))
+                full_document_markdown.append(self.post_process_markdown(final_page_markdown))
+                full_document_markdown.append(self.page_delimiter)
 
 
-
-
+                self.logger.info(f"  Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_markdown[:200]}...")
+                print(f"\n--- Page {page_num+1} Done ---\n")
+                print(final_page_markdown[:500])  # Print first 500 chars of page markdown
 
-
-
-            return []
-        except Exception as e:
-            logging.critical(f"An unexpected error occurred: {e}")
-            return []
+            doc.close()
+            return "".join(full_document_markdown), all_pages_markdown
 
+        except fitz.FileNotFoundError:
+            self.logger.error(f"PDF file not found: {self.pdf_path}")
+            return "", []
+        except Exception as e:
+            self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
+            self.logger.exception(traceback.format_exc())
+            return "", []
 
     def extract_tables(self):
         """Extract tables from PDF using pdfplumber."""
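
Despite its name, ocr_page_with_nanonets_s() is now the dispatch point for both back ends, and with the default model name every call goes to Gemini. A standalone sketch of that branch, under stated assumptions ("page.png" is a placeholder file; the prompt is shortened; the google-genai calls are the ones the diff itself uses):

import io
import mimetypes
import os

from PIL import Image
from google import genai
from google.genai import types

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", ""))

with open("page.png", "rb") as f:
    img_bytes = io.BytesIO(f.read())
pil_image = Image.open(img_bytes)  # .format is populated for stream-opened images

# Same MIME-type trick the diff uses: guess from a dummy filename.
mime_type, _ = mimetypes.guess_type(f"dummy.{pil_image.format.lower()}")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=img_bytes.getvalue(), mime_type=mime_type),
        "Extract the text from the above document as if you were reading it naturally.",
    ],
)
print(response.text)
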
@@ -449,13 +383,13 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return ""
 
-    def perform_ocr(self, image):
+    def perform_ocr(self, image, image_bytes):
         """Perform OCR on the given image."""
         try:
             # ocr_result = pytesseract.image_to_string(
             #     image
             # )
-            ocr_result= self.ocr_page_with_nanonets_s(image,
+            ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)
 
 
             return ocr_result.strip()
@@ -464,10 +398,10 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return ""
 
-    def caption_image(self, image):
+    def caption_image(self, image, image_bytes):
         """Generate a caption for the given image."""
         try:
-            ocr_text = self.perform_ocr(image)
+            ocr_text = self.perform_ocr(image, image_bytes)
             if ocr_text:
                 return ocr_text
 
@@ -475,19 +409,38 @@ class MarkdownPDFExtractor(PDFExtractor):
             if image.mode != "RGB":
                 image = image.convert("RGB")
 
-
-
+            image_format = image.format
+            dummy_filename = f"dummy.{image_format.lower()}"
+            mime_type, _ = mimetypes.guess_type(dummy_filename)
+
+            if "gemini" in self.MODEL_NAME:
+                response= self.gclient.models.generate_content(
+                    model= self.MODEL_NAME,
+                    contents=[
+                        types.Part.from_bytes(
+                            data=image_bytes.getvalue(),
+                            mime_type= mime_type
+                        ),
+                        "Write a caption for this image"
+                    ]
+                )
+                return response.text
+            else:
+                # Ensure the image is in the correct shape
+                image = np.array(image).transpose(2, 0, 1)  # Convert to (C, H, W) format
 
-
-
-
-
+                inputs = self.feature_extractor(images=image, return_tensors="pt").to(
+                    self.device
+                )
+                pixel_values = inputs.pixel_values
 
-
-
-                generated_ids,
-
-
+                generated_ids = self.model.generate(pixel_values, max_length=30)
+
+                generated_ids = self.model.generate(pixel_values, max_length=30)
+                generated_caption = self.tokenizer.batch_decode(
+                    generated_ids, skip_special_tokens=True
+                )[0]
+                return generated_caption.strip()
         except Exception as e:
             self.logger.error(f"Error captioning image: {e}")
             self.logger.exception(traceback.format_exc())
@@ -789,7 +742,11 @@ class MarkdownPDFExtractor(PDFExtractor):
                 Path(self.output_path) / image_filename
             )  # Convert to Path object
             image.save(image_path, "PNG", optimize=True, quality=95)
-
+
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr)
+            caption = self.caption_image(image, img_byte_arr)
+
             if not caption:
                 caption = (
                     f"{self.pdf_filename}_image_{int(page.number)+1}_{block['number']}"