pembot 0.0.3__py2.py3-none-any.whl → 0.0.5__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pembot might be problematic.

@@ -1,11 +1,10 @@
- import fitz # PyMuPDF
+ import fitz
  import pdfplumber
  import re
  import yaml
  # import pytesseract
  import numpy as np
- from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
- # VisionEncoderDecoderModel, ViTImageProcessor,
+ from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
  from typing import Literal, final
  import torch
  from PIL import Image
@@ -16,28 +15,26 @@ import warnings
  from pathlib import Path
  from abc import ABC, abstractmethod
  import argparse
- from PIL import Image
  import io
- from PIL import Image
-
- model_path = "nanonets/Nanonets-OCR-s"
-
- model = AutoModelForImageTextToText.from_pretrained(
-     model_path,
-     torch_dtype="auto",
-     device_map="auto",
-     attn_implementation="flash_attention_2"
- )
- model.eval()
+ from google import genai
+ from google.genai import types
+ import mimetypes

- tokenizer = AutoTokenizer.from_pretrained(model_path)
- processor = AutoProcessor.from_pretrained(model_path)


  warnings.filterwarnings("ignore")

- with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
-     config = yaml.safe_load(f)
+ config= {}
+ try:
+     with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
+         config = yaml.safe_load(f)
+ except FileNotFoundError:
+     config= {
+         'OUTPUT_DIR': '.',
+         'PAGE_DELIMITER': '____NEXT PAGE____'
+     }
+ except Exception as e:
+     print("unhandled while opening default config in pdf2markdown: ", e)


  class PDFExtractor(ABC):
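
The guarded load above keeps the module importable when config/config.yaml is absent, instead of crashing at import time as 0.0.3 did. A minimal sketch of the same fallback pattern as a reusable helper (the helper name is illustrative; the defaults mirror the diff, and the `or DEFAULTS` additionally covers an empty YAML file, which yaml.safe_load returns as None):

```python
from pathlib import Path

import yaml

DEFAULTS = {"OUTPUT_DIR": ".", "PAGE_DELIMITER": "____NEXT PAGE____"}

def load_config(path: str = "config/config.yaml") -> dict:
    """Read a YAML config, falling back to DEFAULTS when the file is missing or empty."""
    try:
        with open(Path(path).resolve(), "r", encoding="utf-8") as f:
            return yaml.safe_load(f) or DEFAULTS
    except FileNotFoundError:
        return DEFAULTS

config = load_config()
```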
@@ -74,9 +71,31 @@ class MarkdownPDFExtractor(PDFExtractor):

      BULLET_POINTS = "•◦▪▫●○"

-     def __init__(self, pdf_path, output_path= config["OUTPUT_DIR"], page_delimiter= config["PAGE_DELIMITER"]):
+     def __init__(self, pdf_path, output_path= config.get("OUTPUT_DIR", '.'), page_delimiter= config.get("PAGE_DELIMITER", ''), model_name: str | None= None):
          super().__init__(pdf_path)

+         if model_name is None:
+             self.MODEL_NAME= "gemini-2.5-flash"
+         else:
+             self.MODEL_NAME= model_name
+
+         if "gemini" in self.MODEL_NAME:
+             self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
+         else:
+             model_path = "nanonets/Nanonets-OCR-s"
+             self.model = AutoModelForImageTextToText.from_pretrained(
+                 model_path,
+                 torch_dtype="auto",
+                 device_map="auto",
+                 attn_implementation="flash_attention_2"
+             )
+             self.model.eval()
+             self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+             self.processor = AutoProcessor.from_pretrained(model_path)
+             self.setup_image_captioning()
+
+
+
          self.markdown_content= ""
          self.pdf_filename = Path(pdf_path).stem
          self.output_path= output_path
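
The constructor now chooses an OCR backend from model_name: any name containing "gemini" creates a google-genai client keyed by the GEMINI_API_KEY environment variable, while anything else loads nanonets/Nanonets-OCR-s locally through transformers. A hedged usage sketch (the import path is an assumption based on the "pdf2markdown" module name visible in the config error message; file names are placeholders):

```python
import os

from pembot.pdf2markdown import MarkdownPDFExtractor  # assumed import path

os.environ.setdefault("GEMINI_API_KEY", "your-api-key")

# Default backend: self.MODEL_NAME falls back to "gemini-2.5-flash"
gemini_extractor = MarkdownPDFExtractor("input.pdf", output_path="out")

# Local backend: any model name without "gemini" loads the Nanonets model
local_extractor = MarkdownPDFExtractor(
    "input.pdf",
    output_path="out",
    model_name="nanonets/Nanonets-OCR-s",
)
```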
@@ -87,26 +106,26 @@ class MarkdownPDFExtractor(PDFExtractor):
          self.page_delimiter= page_delimiter
          Path(output_path).mkdir(parents=True, exist_ok=True)

-         # self.setup_image_captioning()
-
-     # def setup_image_captioning(self):
-     #     """Set up the image captioning model."""
-     #     try:
-     #         self.model = VisionEncoderDecoderModel.from_pretrained(
-     #             "nlpconnect/vit-gpt2-image-captioning"
-     #         )
-     #         self.feature_extractor = ViTImageProcessor.from_pretrained(
-     #             "nlpconnect/vit-gpt2-image-captioning"
-     #         )
-     #         self.tokenizer = AutoTokenizer.from_pretrained(
-     #             "nlpconnect/vit-gpt2-image-captioning"
-     #         )
-     #         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-     #         self.model.to(self.device)
-     #         self.logger.info("Image captioning model set up successfully.")
-     #     except Exception as e:
-     #         self.logger.error(f"Error setting up image captioning model: {e}")
-     #         self.logger.exception(traceback.format_exc())
+
+
+     def setup_image_captioning(self):
+         """Set up the image captioning model."""
+         try:
+             self.model = VisionEncoderDecoderModel.from_pretrained(
+                 "nlpconnect/vit-gpt2-image-captioning"
+             )
+             self.feature_extractor = ViTImageProcessor.from_pretrained(
+                 "nlpconnect/vit-gpt2-image-captioning"
+             )
+             self.tokenizer = AutoTokenizer.from_pretrained(
+                 "nlpconnect/vit-gpt2-image-captioning"
+             )
+             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+             self.model.to(self.device)
+             self.logger.info("Image captioning model set up successfully.")
+         except Exception as e:
+             self.logger.error(f"Error setting up image captioning model: {e}")
+             self.logger.exception(traceback.format_exc())

      def extract(self):
          try:
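
This method shipped fully commented out in 0.0.3 and is re-enabled verbatim in 0.0.5, wiring up the nlpconnect/vit-gpt2-image-captioning pipeline. For reference, a self-contained sketch of the same captioning flow outside the class (the input file name is illustrative):

```python
import torch
from PIL import Image
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor

model_id = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(model_id)
feature_extractor = ViTImageProcessor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

image = Image.open("figure.png").convert("RGB")  # illustrative input
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)

generated_ids = model.generate(pixel_values, max_length=30)
caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print(caption)
```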
@@ -123,282 +142,197 @@ class MarkdownPDFExtractor(PDFExtractor):
              self.logger.exception(traceback.format_exc())
              return "", []

-     def extract_markdown_by_blocks(self):
-         """Main method to extract markdown from PDF."""
-         try:
-             doc = fitz.open(self.pdf_path)
-             markdown_content = ""
-             markdown_pages = []
-             tables = self.extract_tables()
-             table_index = 0
-             list_counter = 0
-             in_code_block = False
-             code_block_content = ""
-             code_block_lang = None
-             prev_line = ""
-
-             for page_num, page in enumerate(doc):
-                 self.logger.info(f"Processing page {page_num + 1}")
-                 page_content = ""
-                 blocks = page.get_text("dict")["blocks"]
-                 page_height = page.rect.height
-                 links = self.extract_links(page)
-
-                 if len(page.get_images()) > 0 and len(page.get_images()) <= 128:
-                     for block in blocks:
-                         if block["type"] == 0:  # Text
-                             page_content += self.process_text_block(
-                                 block,
-                                 page_height,
-                                 links,
-                                 list_counter,
-                                 in_code_block,
-                                 code_block_content,
-                                 code_block_lang,
-                                 prev_line,
-                             )
-                         elif block["type"] == 1:  # Image
-                             page_content += self.process_image_block(page, block)
-
-                 else:
-                     for block in blocks:
-                         if block["type"] == 0:  # Text
-                             page_content += self.process_text_block(
-                                 block,
-                                 page_height,
-                                 links,
-                                 list_counter,
-                                 in_code_block,
-                                 code_block_content,
-                                 code_block_lang,
-                                 prev_line,
-                             )
-
-                 # Insert tables at their approximate positions
-                 while (
-                     table_index < len(tables)
-                     and tables[table_index]["page"] == page.number
-                 ):
-                     page_content += (
-                         "\n\n"
-                         + self.table_to_markdown(tables[table_index]["content"])
-                         + "\n\n"
-                     )
-                     table_index += 1
-
-                 markdown_pages.append(self.post_process_markdown(page_content))
-                 markdown_content += page_content + config["PAGE_DELIMITER"]

-             markdown_content = self.post_process_markdown(markdown_content)
-             return markdown_content, markdown_pages
-         except Exception as e:
-             self.logger.error(f"Error extracting markdown: {e}")
-             self.logger.exception(traceback.format_exc())
-             return "", []
-
-
-     def ocr_page_with_nanonets_s(self, pil_image, model, processor, max_new_tokens: int | None = None):
+     def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
          prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
          if max_new_tokens is None:
              max_new_tokens= 4096

-         # image = Image.open(image_path)
-         image = pil_image
-         messages = [
-             {"role": "system", "content": "You are a helpful assistant."},
-             {"role": "user", "content": [
-                 {"type": "image", "image": image},
-                 {"type": "text", "text": prompt},
-             ]},
-         ]
-         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-         inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
-         inputs = inputs.to(model.device)
+         if 'gemini' in self.MODEL_NAME:
+
+             image_format = pil_image.format
+             dummy_filename = f"dummy.{image_format.lower()}"
+             mime_type, _ = mimetypes.guess_type(dummy_filename)
+             response= self.gclient.models.generate_content(
+                 model= self.MODEL_NAME,
+                 contents=[
+                     types.Part.from_bytes(
+                         data=img_bytes.getvalue(),
+                         mime_type= mime_type
+                     ),
+                     prompt
+                 ]
+             )
+             # print("response :", response)
+             return response.text
+         else:
+             image = pil_image
+             messages = [
+                 {"role": "system", "content": "You are a helpful assistant."},
+                 {"role": "user", "content": [
+                     {"type": "image", "image": image},
+                     {"type": "text", "text": prompt},
+                 ]},
+             ]
+             text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+             inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
+             inputs = inputs.to(self.model.device)

-         output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-         generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+             output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+             generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

-         output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-         return output_text[0]
+             output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+             return output_text[0]



      def extract_markdown(self):
-         """
-         Extracts all possible text content from a PDF page, concatenating it
-         from direct text blocks, OCR from embedded image blocks, and OCR from
-         full-page raster images (scanned pages).
+         """
+         Extracts all possible content from a PDF, prioritizing searchable text,
+         then OCR for embedded images, and finally full-page OCR for scanned pages.
+         Avoids redundant OCR where possible.
+
+         Returns:
+             tuple: A tuple containing:
+                 - str: The concatenated markdown content of all pages.
+                 - list: A list of strings, where each string is the comprehensive markdown
+                   for a corresponding page.
+         """
+         all_pages_markdown = []
+         full_document_markdown = []  # Changed to list of lines/blocks to handle insertions better
+
+         try:
+             doc = fitz.open(self.pdf_path)
+             self.logger.info(f"Opened PDF: {self.pdf_path}")
+
+             tables = self.extract_tables()
+             table_index = 0
+
+             # State variables for process_text_block that might need to persist across blocks
+             # Re-initialize for each new document, but allow state management within process_text_block for lines
+             list_counter = 0
+             in_code_block = False
+             code_block_content = ""
+             code_block_lang = None
+             prev_line = ""
+
+             for page_num, page in enumerate(doc):
+                 current_page_markdown_blocks = []  # Collect markdown blocks for the current page
+                 page_has_searchable_text = False
+                 page_has_embedded_images = False
+
+                 self.logger.info(f"\nProcessing page {page_num + 1}...")
+
+                 blocks = page.get_text('dict')['blocks']
+                 page_height = page.rect.height
+                 links = self.extract_links(page)
+
+                 # Phase 1: Process text blocks and embedded image blocks
+                 for block_num, block in enumerate(blocks):
+                     if block['type'] == 0:  # Text block
+                         page_has_searchable_text = True
+                         processed_text = self.process_text_block(
+                             block,
+                             page_height,
+                             links,
+                             list_counter,
+                             in_code_block,
+                             code_block_content,
+                             code_block_lang,
+                             prev_line,
+                         )
+                         if processed_text.strip():
+                             current_page_markdown_blocks.append(processed_text)
+
+                     elif block['type'] == 1:  # Image block
+                         page_has_embedded_images = True
+                         self.logger.info(f" Found embedded image block (Page {page_num+1}, Block {block_num+1})")
+                         img_data = block['image']
+
+                         try:
+                             image_bytes= io.BytesIO(img_data)
+                             pil_image = Image.open(image_bytes)
+                             ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
+                                 pil_image, image_bytes, max_new_tokens=15000
+                             )

-         Returns:
-             list: A list of strings, where each string is the comprehensive text
-                   for a corresponding page. Returns an empty list if an error occurs.
-         """
+                             if ocr_text_from_block_image.strip():
+                                 self.logger.info(" OCR found text in embedded image block.")
+                                 current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_block_image.strip()}\n\n")
+                             else:
+                                 self.logger.info(f" No OCR text from embedded image block. Adding generic placeholder.")
+                                 current_page_markdown_blocks.append(f"\n\n![Image Placeholder](image_on_page_{page_num+1}_block_{block_num+1}.png)\n\n")  # Consider saving images
+                         except Exception as e:
+                             self.logger.error(f" Error processing embedded image block for OCR: {e}")
+                             current_page_markdown_blocks.append(f"\n\n![Image Processing Error](error_on_page_{page_num+1}_block_{block_num+1}.png)\n\n")
+
+
+                 # Insert tables at their approximate positions (after blocks are processed for the page)
+                 # You might need more sophisticated logic here if table positions are granular
+                 while (
+                     table_index < len(tables)
+                     and tables[table_index]["page"] == page.number
+                 ):
+                     current_page_markdown_blocks.append(
+                         self.table_to_markdown(tables[table_index]["content"])
+                     )
+                     table_index += 1

-         """taken from self:
-         pdf_path (str): The path to the input PDF file.
-         output_path (str): Directory to save debug output (like rendered images).
-         """
+                 # Phase 2: Full-page OCR if the page seems to be a scanned image or lacks sufficient searchable text
+                 # We prioritize actual searchable text and embedded image OCR.
+                 # Only if very little or no text was found, we resort to full-page OCR.
+                 combined_current_page_text_length = len("".join(current_page_markdown_blocks).strip())

-         all_pages_text = []
-         the_text= ""
+                 # A heuristic: if almost no searchable text and no significant OCR from embedded images
+                 if not page_has_searchable_text and combined_current_page_text_length < 100:  # Threshold for considering "minimal text"
+                     self.logger.info(f" Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
+                     try:
+                         pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
+                         img_bytes = pix.tobytes("png")
+                         image_bytestream= io.BytesIO(img_bytes)
+                         pil_image = Image.open(image_bytestream)

-         try:
-             doc = fitz.open(self.pdf_path)
-             logging.info(f"Opened PDF: {self.pdf_path}")
-
-             tables = self.extract_tables()
-             table_index = 0
-             list_counter = 0
-             in_code_block = False
-             code_block_content = ""
-             code_block_lang = None
-             prev_line = ""
-
-             for page_num, page in enumerate(doc):
-                 page_text_content = []
-                 page_has_searchable_text = False
-
-                 logging.info(f"\nProcessing page {page_num + 1}...")
-
-                 # --- Phase 1: Extract text from direct text blocks and process embedded images ---
-                 blocks = page.get_text('dict')['blocks']
-                 text_blocks_content = []
-                 image_block_text_content = []
-
-                 page_height = page.rect.height
-                 links = self.extract_links(page)
-
-                 for block_num, block in enumerate(blocks):
-                     if block['type'] == 0:  # Text block
-                         page_has_searchable_text = True
-                         text_blocks_content.append(self.process_text_block(
-                             block,
-                             page_height,
-                             links,
-                             list_counter,
-                             in_code_block,
-                             code_block_content,
-                             code_block_lang,
-                             prev_line,
-                         ))
-
-                         # for line in block['lines']:
-                         #     for span in line['spans']:
-                         #         text_blocks_content.append(span['text'])
-                     elif block['type'] == 1:  # Image block
-                         logging.info(f" Found embedded image block (Page {page_num+1}, Block {block_num+1})")
-                         img_data = block['image']
-                         img_ext = block['ext']
+                         ocr_text_from_page = self.ocr_page_with_nanonets_s(
+                             pil_image, image_bytestream, max_new_tokens=15000
+                         )

-                         try:
-                             # Attempt OCR on the embedded image block
-                             pil_image = Image.open(io.BytesIO(img_data))
-                             # ocr_text_from_block_image = pytesseract.image_to_string(pil_image)
-                             ocr_text_from_block_image= self.ocr_page_with_nanonets_s(pil_image, model, processor, max_new_tokens=15000)
-
-                             if ocr_text_from_block_image.strip():
-                                 logging.info(f" OCR found text in embedded image block.")
-                                 image_block_text_content.append(ocr_text_from_block_image.strip())
+                         if ocr_text_from_page.strip():
+                             self.logger.info(f" Successfully extracted text via full-page OCR for page {page_num + 1}.")
+                             # If full-page OCR yields significant content and other methods didn't,
+                             # replace or augment. Here, we'll replace to avoid double-counting if it's primarily scanned.
+                             # You might choose to append if you want to combine (e.g., if there's header text + scanned body)
+                             if combined_current_page_text_length < 50:  # If almost nothing was found before, replace
+                                 current_page_markdown_blocks = [ocr_text_from_page.strip()]
+                             else:  # Otherwise, augment (append)
+                                 current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_page.strip()}\n\n")
                          else:
-                                 # If no OCR text, use the caption
-                                 # caption = self.caption_image(pil_image)
-                                 # if caption:
-                                 #     logging.info(f" No OCR text, using caption for embedded image block.")
-                                 #     image_block_text_content.append(caption)
-                                 # else:
-                                 #     logging.info(f" No OCR text and no caption for embedded image block.")
-
-                                 # a) captioning sucks, b) no need
-                                 image_block_text_content.append("An Image")
-
-                         # except pytesseract.TesseractNotFoundError:
-                         #     logging.warning(" Tesseract-OCR not found. Skipping OCR for embedded image block.")
-                         #     caption = self.process_image_block(page, block)
-                         #     if caption: image_block_text_content.append(caption)
-
-                         #     image_block_text_content.append("An Image")
+                             self.logger.info(f" Full-page OCR yielded no text for page {page_num+1}.")
                      except Exception as e:
-                             logging.error(f" Error processing embedded image block for OCR/caption: {e}")
-                             # caption = self.process_image_block(page, block)
-                             # if caption: image_block_text_content.append(caption)
-                             image_block_text_content.append("An Image")
-
-
-                 # Insert tables at their approximate positions
-                 while (
-                     table_index < len(tables)
-                     and tables[table_index]["page"] == page.number
-                 ):
-                     page_text_content += (
-                         "\n\n"
-                         + self.table_to_markdown(tables[table_index]["content"])
-                         + "\n\n"
-                     )
-                     table_index += 1
-
-                 # Add content from text blocks
-                 if text_blocks_content:
-                     page_text_content.append(" ".join(text_blocks_content))
-
-                 # Add content from image blocks
-                 if image_block_text_content:
-                     page_text_content.append("\n".join(image_block_text_content))
-
-
-                 # --- Phase 2: OCR the entire page IF it seems to be a scanned image ---
-                 # We check if page_has_searchable_text is False or if the amount of text
-                 # is very small, suggesting it might be mostly a scanned page.
-                 # A threshold of 50 characters is arbitrary; adjust as needed.
-                 current_text_len = len(" ".join(page_text_content).strip())
-
-                 if not page_has_searchable_text or current_text_len < 50:
-                     logging.info(f" Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
-                     try:
-                         # Render the page as a high-resolution image (e.g., 300 DPI)
-                         pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
-                         img_bytes = pix.tobytes("png")
-
-                         pil_image = Image.open(io.BytesIO(img_bytes))
-
-                         # Perform OCR on the entire page image
-                         # ocr_text_from_page = pytesseract.image_to_string(pil_image)
-                         ocr_text_from_page= self.ocr_page_with_nanonets_s(pil_image, model, processor, max_new_tokens=15000)
-
-                         if ocr_text_from_page.strip():
-                             logging.info(f" Successfully extracted text via full-page OCR.")
-                             page_text_content.append(ocr_text_from_page.strip())
-                         else:
-                             logging.info(f" Full-page OCR yielded no text for page {page_num+1}.")
-
-                         # except pytesseract.TesseractNotFoundError:
-                         #     logging.warning(" Tesseract-OCR not found. Skipping full-page OCR for this page.")
-                     except Exception as e:
-                         logging.error(f" Error during full-page OCR on page {page_num+1}: {e}")
-                 else:
-                     logging.info(f" Page {page_num + 1} has sufficient searchable text; skipping full-page OCR.")
-
-
-                 # Concatenate all collected text for the current page
-                 final_page_text = "\n".join(filter(None, page_text_content)).strip()  # Use filter(None, ...) to remove empty strings
-                 all_pages_text.append(self.post_process_markdown(final_page_text))
-                 the_text += final_page_text + self.page_delimiter
-
-                 logging.info(f" Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_text[:200]}...")
+                         self.logger.error(f" Error during full-page OCR on page {page_num+1}: {e}")
+                 else:
+                     self.logger.info(f" Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")

-                 print("\npage done\n")
-                 print(final_page_text)
+                 # Join collected markdown blocks for the current page
+                 final_page_markdown = "\n".join(filter(None, current_page_markdown_blocks)).strip()
+                 all_pages_markdown.append(self.post_process_markdown(final_page_markdown))
+                 full_document_markdown.append(self.post_process_markdown(final_page_markdown))
+                 full_document_markdown.append(self.page_delimiter)


-             doc.close()
-             return the_text, all_pages_text
+                 self.logger.info(f" Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_markdown[:200]}...")
+                 print(f"\n--- Page {page_num+1} Done ---\n")
+                 print(final_page_markdown[:500])  # Print first 500 chars of page markdown

-         except fitz.FileNotFoundError:
-             logging.error(f"PDF file not found: {self.pdf_path}")
-             return []
-         except Exception as e:
-             logging.critical(f"An unexpected error occurred: {e}")
-             return []
+             doc.close()
+             return "".join(full_document_markdown), all_pages_markdown

+         except fitz.FileNotFoundError:
+             self.logger.error(f"PDF file not found: {self.pdf_path}")
+             return "", []
+         except Exception as e:
+             self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
+             self.logger.exception(traceback.format_exc())
+             return "", []

      def extract_tables(self):
          """Extract tables from PDF using pdfplumber."""
@@ -449,13 +383,13 @@ class MarkdownPDFExtractor(PDFExtractor):
              self.logger.exception(traceback.format_exc())
              return ""

-     def perform_ocr(self, image):
+     def perform_ocr(self, image, image_bytes):
          """Perform OCR on the given image."""
          try:
              # ocr_result = pytesseract.image_to_string(
              #     image
              # )
-             ocr_result= self.ocr_page_with_nanonets_s(image, model, processor, max_new_tokens=15000)
+             ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)


              return ocr_result.strip()
@@ -464,10 +398,10 @@ class MarkdownPDFExtractor(PDFExtractor):
              self.logger.exception(traceback.format_exc())
              return ""

-     def caption_image(self, image):
+     def caption_image(self, image, image_bytes):
          """Generate a caption for the given image."""
          try:
-             ocr_text = self.perform_ocr(image)
+             ocr_text = self.perform_ocr(image, image_bytes)
              if ocr_text:
                  return ocr_text

@@ -475,19 +409,38 @@ class MarkdownPDFExtractor(PDFExtractor):
              if image.mode != "RGB":
                  image = image.convert("RGB")

-             # Ensure the image is in the correct shape
-             image = np.array(image).transpose(2, 0, 1)  # Convert to (C, H, W) format
+             image_format = image.format
+             dummy_filename = f"dummy.{image_format.lower()}"
+             mime_type, _ = mimetypes.guess_type(dummy_filename)
+
+             if "gemini" in self.MODEL_NAME:
+                 response= self.gclient.models.generate_content(
+                     model= self.MODEL_NAME,
+                     contents=[
+                         types.Part.from_bytes(
+                             data=image_bytes.getvalue(),
+                             mime_type= mime_type
+                         ),
+                         "Write a caption for this image"
+                     ]
+                 )
+                 return response.text
+             else:
+                 # Ensure the image is in the correct shape
+                 image = np.array(image).transpose(2, 0, 1)  # Convert to (C, H, W) format

-             inputs = self.feature_extractor(images=image, return_tensors="pt").to(
-                 self.device
-             )
-             pixel_values = inputs.pixel_values
+                 inputs = self.feature_extractor(images=image, return_tensors="pt").to(
+                     self.device
+                 )
+                 pixel_values = inputs.pixel_values

-             generated_ids = self.model.generate(pixel_values, max_length=30)
-             generated_caption = self.tokenizer.batch_decode(
-                 generated_ids, skip_special_tokens=True
-             )[0]
-             return generated_caption.strip()
+                 generated_ids = self.model.generate(pixel_values, max_length=30)
+                 generated_caption = self.tokenizer.batch_decode(
+                     generated_ids, skip_special_tokens=True
+                 )[0]
+                 return generated_caption.strip()
          except Exception as e:
              self.logger.error(f"Error captioning image: {e}")
              self.logger.exception(traceback.format_exc())
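
One fragility worth noting in the MIME lookup above: PIL sets Image.format to None for images created in memory rather than opened from a file or stream, in which case image_format.lower() raises AttributeError. A defensive sketch of the same lookup with a PNG fallback (the helper is illustrative, not part of the package):

```python
import mimetypes

from PIL import Image

def guess_image_mime_type(pil_image: Image.Image, default: str = "image/png") -> str:
    """Map a PIL image's format attribute (e.g. 'PNG', 'JPEG') to a MIME type."""
    fmt = pil_image.format  # None for images synthesized in memory
    if not fmt:
        return default
    mime_type, _ = mimetypes.guess_type(f"dummy.{fmt.lower()}")
    return mime_type or default
```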
@@ -789,7 +742,11 @@ class MarkdownPDFExtractor(PDFExtractor):
                  Path(self.output_path) / image_filename
              )  # Convert to Path object
              image.save(image_path, "PNG", optimize=True, quality=95)
-             caption = self.caption_image(image)
+
+             img_byte_arr = io.BytesIO()
+             image.save(img_byte_arr, format="PNG")  # PIL requires an explicit format when saving to a BytesIO
+             caption = self.caption_image(image, img_byte_arr)
+
              if not caption:
                  caption = (
                      f"{self.pdf_filename}_image_{int(page.number)+1}_{block['number']}"