pembot 0.0.3-py2.py3-none-any.whl → 0.0.5-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pembot might be problematic.
- pembot/.git/COMMIT_EDITMSG +1 -1
- pembot/.git/index +0 -0
- pembot/.git/logs/HEAD +1 -0
- pembot/.git/logs/refs/heads/main +1 -0
- pembot/.git/logs/refs/remotes/origin/main +1 -0
- pembot/.git/objects/0b/db4169fc0f312b8698f1df17a258fff163aeaa +0 -0
- pembot/.git/objects/1f/83a471c8119f7794d98c049170a5d7d07a4b71 +0 -0
- pembot/.git/objects/41/cbeb6bcb4c6fa9ef9be571082d95ecb4ea0ee3 +0 -0
- pembot/.git/objects/63/1700a51c8fa97b543991f5f61bfcd1e7e1327d +0 -0
- pembot/.git/objects/ab/139d2cd4798dd8e2c565b80440b1a44b376126 +0 -0
- pembot/.git/objects/bf/068a0714e2145de83a5c004f4213b091439d0e +0 -0
- pembot/.git/objects/d0/937f7d832266337289d5ec09459f931a46fcf7 +0 -0
- pembot/.git/objects/fc/988aab7e2d46396dc595ad24345e8e77dda0e4 +0 -0
- pembot/.git/refs/heads/main +1 -1
- pembot/.git/refs/remotes/origin/main +1 -1
- pembot/AnyToText/convertor.py +250 -146
- pembot/__init__.py +1 -1
- pembot/config/config.yaml +1 -1
- pembot/main.py +26 -8
- pembot/pdf2markdown/extract.py +266 -309
- pembot/query.py +15 -9
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/METADATA +1 -1
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/RECORD +25 -17
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/WHEEL +0 -0
- {pembot-0.0.3.dist-info → pembot-0.0.5.dist-info}/licenses/LICENSE +0 -0
pembot/pdf2markdown/extract.py
CHANGED
@@ -1,11 +1,10 @@
-import fitz
+import fitz
 import pdfplumber
 import re
 import yaml
 # import pytesseract
 import numpy as np
-from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText
-# VisionEncoderDecoderModel, ViTImageProcessor,
+from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText, VisionEncoderDecoderModel, ViTImageProcessor
 from typing import Literal, final
 import torch
 from PIL import Image
@@ -16,28 +15,26 @@ import warnings
 from pathlib import Path
 from abc import ABC, abstractmethod
 import argparse
-from PIL import Image
 import io
-from
-
-
-
-model = AutoModelForImageTextToText.from_pretrained(
-    model_path,
-    torch_dtype="auto",
-    device_map="auto",
-    attn_implementation="flash_attention_2"
-)
-model.eval()
+from google import genai
+from google.genai import types
+import mimetypes
 
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-processor = AutoProcessor.from_pretrained(model_path)
 
 
 warnings.filterwarnings("ignore")
 
-
-
+config= {}
+try:
+    with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+except FileNotFoundError:
+    config= {
+        'OUTPUT_DIR': '.',
+        'PAGE_DELIMITER': '____NEXT PAGE____'
+    }
+except Exception as e:
+    print("unhandled while opening default config in pdf2markdown: ", e)
 
 
 class PDFExtractor(ABC):
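
One behavioral note on the new module-level loader: the relative "config/config.yaml" path resolves against the process's current working directory, not the installed package, so most installs will take the FileNotFoundError branch. A minimal standalone sketch of the same fallback pattern (path and default keys as in the diff):

from pathlib import Path
import yaml

config = {}
try:
    # Relative path: resolved against the current working directory,
    # not the package's install location.
    with open(Path("config/config.yaml").resolve(), "r", encoding="utf-8") as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    # Defaults shipped in the diff.
    config = {"OUTPUT_DIR": ".", "PAGE_DELIMITER": "____NEXT PAGE____"}

print(config.get("OUTPUT_DIR", "."), config.get("PAGE_DELIMITER", ""))
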
@@ -74,9 +71,31 @@ class MarkdownPDFExtractor(PDFExtractor):
 
     BULLET_POINTS = "•◦▪▫●○"
 
-    def __init__(self, pdf_path, output_path= config
+    def __init__(self, pdf_path, output_path= config.get("OUTPUT_DIR", '.'), page_delimiter= config.get("PAGE_DELIMITER", ''), model_name: str | None= None):
         super().__init__(pdf_path)
 
+        if model_name is None:
+            self.MODEL_NAME= "gemini-2.5-flash"
+        else:
+            self.MODEL_NAME= model_name
+
+        if "gemini" in self.MODEL_NAME:
+            self.gclient = genai.Client(api_key= os.getenv("GEMINI_API_KEY", ''))
+        else:
+            model_path = "nanonets/Nanonets-OCR-s"
+            self.model = AutoModelForImageTextToText.from_pretrained(
+                model_path,
+                torch_dtype="auto",
+                device_map="auto",
+                attn_implementation="flash_attention_2"
+            )
+            self.model.eval()
+            self.tokenizer = AutoTokenizer.from_pretrained(model_path)
+            self.processor = AutoProcessor.from_pretrained(model_path)
+            self.setup_image_captioning()
+
+
+
         self.markdown_content= ""
         self.pdf_filename = Path(pdf_path).stem
         self.output_path= output_path
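
A hypothetical usage sketch of the new constructor (the PDF path and key value are placeholders; GEMINI_API_KEY is the variable the diff reads):

import os
from pembot.pdf2markdown.extract import MarkdownPDFExtractor

os.environ.setdefault("GEMINI_API_KEY", "<your-key>")  # read by the "gemini" branch

# Default model name is "gemini-2.5-flash", so OCR is routed through the Gemini API.
extractor = MarkdownPDFExtractor("sample.pdf")

# Any name without "gemini" selects the local branch, which downloads
# nanonets/Nanonets-OCR-s and requires flash_attention_2 support.
local = MarkdownPDFExtractor("sample.pdf", model_name="nanonets/Nanonets-OCR-s")
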
@@ -87,26 +106,26 @@ class MarkdownPDFExtractor(PDFExtractor):
         self.page_delimiter= page_delimiter
         Path(output_path).mkdir(parents=True, exist_ok=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+    def setup_image_captioning(self):
+        """Set up the image captioning model."""
+        try:
+            self.model = VisionEncoderDecoderModel.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.feature_extractor = ViTImageProcessor.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                "nlpconnect/vit-gpt2-image-captioning"
+            )
+            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+            self.model.to(self.device)
+            self.logger.info("Image captioning model set up successfully.")
+        except Exception as e:
+            self.logger.error(f"Error setting up image captioning model: {e}")
+            self.logger.exception(traceback.format_exc())
 
     def extract(self):
         try:
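
setup_image_captioning() wires up the nlpconnect/vit-gpt2-image-captioning stack (and, per the diff, reassigns the self.model and self.tokenizer set earlier in __init__'s non-Gemini branch). A minimal standalone sketch of the same captioning flow, with "photo.png" as a placeholder input:

import torch
from PIL import Image
from transformers import AutoTokenizer, ViTImageProcessor, VisionEncoderDecoderModel

# Same checkpoint name as in the diff.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

image = Image.open("photo.png").convert("RGB")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values.to(device)
generated_ids = model.generate(pixel_values, max_length=30)
caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(caption.strip())
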
@@ -123,282 +142,197 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return "", []
 
-    def extract_markdown_by_blocks(self):
-        """Main method to extract markdown from PDF."""
-        try:
-            doc = fitz.open(self.pdf_path)
-            markdown_content = ""
-            markdown_pages = []
-            tables = self.extract_tables()
-            table_index = 0
-            list_counter = 0
-            in_code_block = False
-            code_block_content = ""
-            code_block_lang = None
-            prev_line = ""
-
-            for page_num, page in enumerate(doc):
-                self.logger.info(f"Processing page {page_num + 1}")
-                page_content = ""
-                blocks = page.get_text("dict")["blocks"]
-                page_height = page.rect.height
-                links = self.extract_links(page)
-
-                if len(page.get_images()) > 0 and len(page.get_images()) <= 128:
-                    for block in blocks:
-                        if block["type"] == 0:  # Text
-                            page_content += self.process_text_block(
-                                block,
-                                page_height,
-                                links,
-                                list_counter,
-                                in_code_block,
-                                code_block_content,
-                                code_block_lang,
-                                prev_line,
-                            )
-                        elif block["type"] == 1:  # Image
-                            page_content += self.process_image_block(page, block)
-
-                else:
-                    for block in blocks:
-                        if block["type"] == 0:  # Text
-                            page_content += self.process_text_block(
-                                block,
-                                page_height,
-                                links,
-                                list_counter,
-                                in_code_block,
-                                code_block_content,
-                                code_block_lang,
-                                prev_line,
-                            )
-
-                # Insert tables at their approximate positions
-                while (
-                    table_index < len(tables)
-                    and tables[table_index]["page"] == page.number
-                ):
-                    page_content += (
-                        "\n\n"
-                        + self.table_to_markdown(tables[table_index]["content"])
-                        + "\n\n"
-                    )
-                    table_index += 1
-
-                markdown_pages.append(self.post_process_markdown(page_content))
-                markdown_content += page_content + config["PAGE_DELIMITER"]
 
-
-            return markdown_content, markdown_pages
-        except Exception as e:
-            self.logger.error(f"Error extracting markdown: {e}")
-            self.logger.exception(traceback.format_exc())
-            return "", []
-
-
-    def ocr_page_with_nanonets_s(self, pil_image, model, processor, max_new_tokens: int | None = None):
+    def ocr_page_with_nanonets_s(self, pil_image, img_bytes, max_new_tokens: int | None = None):
         prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
         if max_new_tokens is None:
             max_new_tokens= 4096
 
-
-
-
-
-
-
-
-
-
-
-
-
+        if 'gemini' in self.MODEL_NAME:
+
+            image_format = pil_image.format
+            dummy_filename = f"dummy.{image_format.lower()}"
+            mime_type, _ = mimetypes.guess_type(dummy_filename)
+            response= self.gclient.models.generate_content(
+                model= self.MODEL_NAME,
+                contents=[
+                    types.Part.from_bytes(
+                        data=img_bytes.getvalue(),
+                        mime_type= mime_type
+                    ),
+                    prompt
+                ]
+            )
+            # print("response :", response)
+            return response.text
+        else:
+            image = pil_image
+            messages = [
+                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "user", "content": [
+                    {"type": "image", "image": image},
+                    {"type": "text", "text": prompt},
+                ]},
+            ]
+            text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+            inputs = self.processor(text=[text], images=[image], padding=True, return_tensors="pt")
+            inputs = inputs.to(self.model.device)
 
-
-
+            output_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+            generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
 
-
-
+            output_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+            return output_text[0]
 
 
 
     def extract_markdown(self):
-
-
-
-
+        """
+        Extracts all possible content from a PDF, prioritizing searchable text,
+        then OCR for embedded images, and finally full-page OCR for scanned pages.
+        Avoids redundant OCR where possible.
+
+        Returns:
+            tuple: A tuple containing:
+                - str: The concatenated markdown content of all pages.
+                - list: A list of strings, where each string is the comprehensive markdown
+                        for a corresponding page.
+        """
+        all_pages_markdown = []
+        full_document_markdown = []  # Changed to list of lines/blocks to handle insertions better
+
+        try:
+            doc = fitz.open(self.pdf_path)
+            self.logger.info(f"Opened PDF: {self.pdf_path}")
+
+            tables = self.extract_tables()
+            table_index = 0
+
+            # State variables for process_text_block that might need to persist across blocks
+            # Re-initialize for each new document, but allow state management within process_text_block for lines
+            list_counter = 0
+            in_code_block = False
+            code_block_content = ""
+            code_block_lang = None
+            prev_line = ""
+
+            for page_num, page in enumerate(doc):
+                current_page_markdown_blocks = []  # Collect markdown blocks for the current page
+                page_has_searchable_text = False
+                page_has_embedded_images = False
+
+                self.logger.info(f"\nProcessing page {page_num + 1}...")
+
+                blocks = page.get_text('dict')['blocks']
+                page_height = page.rect.height
+                links = self.extract_links(page)
+
+                # Phase 1: Process text blocks and embedded image blocks
+                for block_num, block in enumerate(blocks):
+                    if block['type'] == 0:  # Text block
+                        page_has_searchable_text = True
+                        processed_text = self.process_text_block(
+                            block,
+                            page_height,
+                            links,
+                            list_counter,
+                            in_code_block,
+                            code_block_content,
+                            code_block_lang,
+                            prev_line,
+                        )
+                        if processed_text.strip():
+                            current_page_markdown_blocks.append(processed_text)
+
+                    elif block['type'] == 1:  # Image block
+                        page_has_embedded_images = True
+                        self.logger.info(f"  Found embedded image block (Page {page_num+1}, Block {block_num+1})")
+                        img_data = block['image']
+
+                        try:
+                            image_bytes= io.BytesIO(img_data)
+                            pil_image = Image.open(image_bytes)
+                            ocr_text_from_block_image = self.ocr_page_with_nanonets_s(
+                                pil_image, image_bytes, max_new_tokens=15000
+                            )
 
-
-
-
-
+                            if ocr_text_from_block_image.strip():
+                                self.logger.info("    OCR found text in embedded image block.")
+                                current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_block_image.strip()}\n\n")
+                            else:
+                                self.logger.info(f"    No OCR text from embedded image block. Adding generic placeholder.")
+                                current_page_markdown_blocks.append("\n\n\n\n")  # Consider saving images
+                        except Exception as e:
+                            self.logger.error(f"    Error processing embedded image block for OCR: {e}")
+                            current_page_markdown_blocks.append("\n\n\n\n")
+
+
+                # Insert tables at their approximate positions (after blocks are processed for the page)
+                # You might need more sophisticated logic here if table positions are granular
+                while (
+                    table_index < len(tables)
+                    and tables[table_index]["page"] == page.number
+                ):
+                    current_page_markdown_blocks.append(
+                        self.table_to_markdown(tables[table_index]["content"])
+                    )
+                    table_index += 1
 
-
-
-
-
+                # Phase 2: Full-page OCR if the page seems to be a scanned image or lacks sufficient searchable text
+                # We prioritize actual searchable text and embedded image OCR.
+                # Only if very little or no text was found, we resort to full-page OCR.
+                combined_current_page_text_length = len("".join(current_page_markdown_blocks).strip())
 
-
-
+                # A heuristic: if almost no searchable text and no significant OCR from embedded images
+                if not page_has_searchable_text and combined_current_page_text_length < 100:  # Threshold for considering "minimal text"
+                    self.logger.info(f"  Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
+                    try:
+                        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
+                        img_bytes = pix.tobytes("png")
+                        image_bytestream= io.BytesIO(img_bytes)
+                        pil_image = Image.open(image_bytestream)
 
-
-
-
-
-            tables = self.extract_tables()
-            table_index = 0
-            list_counter = 0
-            in_code_block = False
-            code_block_content = ""
-            code_block_lang = None
-            prev_line = ""
-
-            for page_num, page in enumerate(doc):
-                page_text_content = []
-                page_has_searchable_text = False
-
-                logging.info(f"\nProcessing page {page_num + 1}...")
-
-                # --- Phase 1: Extract text from direct text blocks and process embedded images ---
-                blocks = page.get_text('dict')['blocks']
-                text_blocks_content = []
-                image_block_text_content = []
-
-                page_height = page.rect.height
-                links = self.extract_links(page)
-
-                for block_num, block in enumerate(blocks):
-                    if block['type'] == 0:  # Text block
-                        page_has_searchable_text = True
-                        text_blocks_content.append(self.process_text_block(
-                            block,
-                            page_height,
-                            links,
-                            list_counter,
-                            in_code_block,
-                            code_block_content,
-                            code_block_lang,
-                            prev_line,
-                        ))
-
-                        # for line in block['lines']:
-                        #     for span in line['spans']:
-                        #         text_blocks_content.append(span['text'])
-                    elif block['type'] == 1:  # Image block
-                        logging.info(f"  Found embedded image block (Page {page_num+1}, Block {block_num+1})")
-                        img_data = block['image']
-                        img_ext = block['ext']
+                        ocr_text_from_page = self.ocr_page_with_nanonets_s(
+                            pil_image, image_bytestream, max_new_tokens=15000
+                        )
 
-
-
-
-
-
-
-
-
-
+                        if ocr_text_from_page.strip():
+                            self.logger.info(f"  Successfully extracted text via full-page OCR for page {page_num + 1}.")
+                            # If full-page OCR yields significant content and other methods didn't,
+                            # replace or augment. Here, we'll replace to avoid double-counting if it's primarily scanned.
+                            # You might choose to append if you want to combine (e.g., if there's header text + scanned body)
+                            if combined_current_page_text_length < 50:  # If almost nothing was found before, replace
+                                current_page_markdown_blocks = [ocr_text_from_page.strip()]
+                            else:  # Otherwise, augment (append)
+                                current_page_markdown_blocks.append(f"\n\n\n{ocr_text_from_page.strip()}\n\n")
                         else:
-
-                            # caption = self.caption_image(pil_image)
-                            # if caption:
-                            #     logging.info(f"  No OCR text, using caption for embedded image block.")
-                            #     image_block_text_content.append(caption)
-                            # else:
-                            #     logging.info(f"  No OCR text and no caption for embedded image block.")
-
-                            # a) captioning sucks, b) no need
-                            image_block_text_content.append("An Image")
-
-                        # except pytesseract.TesseractNotFoundError:
-                        #     logging.warning("  Tesseract-OCR not found. Skipping OCR for embedded image block.")
-                        #     caption = self.process_image_block(page, block)
-                        #     if caption: image_block_text_content.append(caption)
-
-                        #     image_block_text_content.append("An Image")
+                            self.logger.info(f"  Full-page OCR yielded no text for page {page_num+1}.")
                    except Exception as e:
-
-
-
-                            image_block_text_content.append("An Image")
-
-
-                # Insert tables at their approximate positions
-                while (
-                    table_index < len(tables)
-                    and tables[table_index]["page"] == page.number
-                ):
-                    page_text_content += (
-                        "\n\n"
-                        + self.table_to_markdown(tables[table_index]["content"])
-                        + "\n\n"
-                    )
-                    table_index += 1
-
-                # Add content from text blocks
-                if text_blocks_content:
-                    page_text_content.append(" ".join(text_blocks_content))
-
-                # Add content from image blocks
-                if image_block_text_content:
-                    page_text_content.append("\n".join(image_block_text_content))
-
-
-                # --- Phase 2: OCR the entire page IF it seems to be a scanned image ---
-                # We check if page_has_searchable_text is False or if the amount of text
-                # is very small, suggesting it might be mostly a scanned page.
-                # A threshold of 50 characters is arbitrary; adjust as needed.
-                current_text_len = len(" ".join(page_text_content).strip())
-
-                if not page_has_searchable_text or current_text_len < 50:
-                    logging.info(f"  Page {page_num + 1} appears to be a scanned image or has minimal text. Attempting full-page OCR.")
-                    try:
-                        # Render the page as a high-resolution image (e.g., 300 DPI)
-                        pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))
-                        img_bytes = pix.tobytes("png")
-
-                        pil_image = Image.open(io.BytesIO(img_bytes))
-
-                        # Perform OCR on the entire page image
-                        # ocr_text_from_page = pytesseract.image_to_string(pil_image)
-                        ocr_text_from_page= self.ocr_page_with_nanonets_s(pil_image, model, processor, max_new_tokens=15000)
-
-                        if ocr_text_from_page.strip():
-                            logging.info(f"  Successfully extracted text via full-page OCR.")
-                            page_text_content.append(ocr_text_from_page.strip())
-                        else:
-                            logging.info(f"  Full-page OCR yielded no text for page {page_num+1}.")
-
-                        # except pytesseract.TesseractNotFoundError:
-                        #     logging.warning("  Tesseract-OCR not found. Skipping full-page OCR for this page.")
-                        except Exception as e:
-                            logging.error(f"  Error during full-page OCR on page {page_num+1}: {e}")
-                else:
-                    logging.info(f"  Page {page_num + 1} has sufficient searchable text; skipping full-page OCR.")
-
-
-                # Concatenate all collected text for the current page
-                final_page_text = "\n".join(filter(None, page_text_content)).strip()  # Use filter(None, ...) to remove empty strings
-                all_pages_text.append(self.post_process_markdown(final_page_text))
-                the_text += final_page_text + self.page_delimiter
-
-                logging.info(f"  Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_text[:200]}...")
+                        self.logger.error(f"  Error during full-page OCR on page {page_num+1}: {e}")
+                else:
+                    self.logger.info(f"  Page {page_num + 1} has sufficient searchable text or embedded image OCR; skipping full-page OCR.")
 
-
-
+                # Join collected markdown blocks for the current page
+                final_page_markdown = "\n".join(filter(None, current_page_markdown_blocks)).strip()
+                all_pages_markdown.append(self.post_process_markdown(final_page_markdown))
+                full_document_markdown.append(self.post_process_markdown(final_page_markdown))
+                full_document_markdown.append(self.page_delimiter)
 
 
-
-
+                self.logger.info(f"  Comprehensive text for page {page_num + 1} (first 200 chars):\n{final_page_markdown[:200]}...")
+                print(f"\n--- Page {page_num+1} Done ---\n")
+                print(final_page_markdown[:500])  # Print first 500 chars of page markdown
 
-
-
-            return []
-        except Exception as e:
-            logging.critical(f"An unexpected error occurred: {e}")
-            return []
+            doc.close()
+            return "".join(full_document_markdown), all_pages_markdown
 
+        except fitz.FileNotFoundError:
+            self.logger.error(f"PDF file not found: {self.pdf_path}")
+            return "", []
+        except Exception as e:
+            self.logger.critical(f"An unexpected error occurred during markdown extraction: {e}")
+            self.logger.exception(traceback.format_exc())
+            return "", []
 
     def extract_tables(self):
         """Extract tables from PDF using pdfplumber."""
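
Despite its name, ocr_page_with_nanonets_s() is now the dispatch point for both back ends, and with the default model name every call goes to Gemini. A standalone sketch of that branch, under stated assumptions ("page.png" is a placeholder file; the prompt is shortened; the google-genai calls are the ones the diff itself uses):

import io
import mimetypes
import os

from PIL import Image
from google import genai
from google.genai import types

client = genai.Client(api_key=os.getenv("GEMINI_API_KEY", ""))

with open("page.png", "rb") as f:
    img_bytes = io.BytesIO(f.read())
pil_image = Image.open(img_bytes)  # .format is populated for stream-opened images

# Same MIME-type trick the diff uses: guess from a dummy filename.
mime_type, _ = mimetypes.guess_type(f"dummy.{pil_image.format.lower()}")

response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=[
        types.Part.from_bytes(data=img_bytes.getvalue(), mime_type=mime_type),
        "Extract the text from the above document as if you were reading it naturally.",
    ],
)
print(response.text)
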
@@ -449,13 +383,13 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return ""
 
-    def perform_ocr(self, image):
+    def perform_ocr(self, image, image_bytes):
         """Perform OCR on the given image."""
         try:
             # ocr_result = pytesseract.image_to_string(
             #     image
             # )
-            ocr_result= self.ocr_page_with_nanonets_s(image,
+            ocr_result= self.ocr_page_with_nanonets_s(image, image_bytes, max_new_tokens=15000)
 
 
             return ocr_result.strip()
@@ -464,10 +398,10 @@ class MarkdownPDFExtractor(PDFExtractor):
             self.logger.exception(traceback.format_exc())
             return ""
 
-    def caption_image(self, image):
+    def caption_image(self, image, image_bytes):
         """Generate a caption for the given image."""
         try:
-            ocr_text = self.perform_ocr(image)
+            ocr_text = self.perform_ocr(image, image_bytes)
             if ocr_text:
                 return ocr_text
 
@@ -475,19 +409,38 @@ class MarkdownPDFExtractor(PDFExtractor):
             if image.mode != "RGB":
                 image = image.convert("RGB")
 
-
-
+            image_format = image.format
+            dummy_filename = f"dummy.{image_format.lower()}"
+            mime_type, _ = mimetypes.guess_type(dummy_filename)
+
+            if "gemini" in self.MODEL_NAME:
+                response= self.gclient.models.generate_content(
+                    model= self.MODEL_NAME,
+                    contents=[
+                        types.Part.from_bytes(
+                            data=image_bytes.getvalue(),
+                            mime_type= mime_type
+                        ),
+                        "Write a caption for this image"
+                    ]
+                )
+                return response.text
+            else:
+                # Ensure the image is in the correct shape
+                image = np.array(image).transpose(2, 0, 1)  # Convert to (C, H, W) format
 
-
-
-
-
+                inputs = self.feature_extractor(images=image, return_tensors="pt").to(
+                    self.device
+                )
+                pixel_values = inputs.pixel_values
 
-
-
-                generated_ids,
-
-
+                generated_ids = self.model.generate(pixel_values, max_length=30)
+
+                generated_ids = self.model.generate(pixel_values, max_length=30)
+                generated_caption = self.tokenizer.batch_decode(
+                    generated_ids, skip_special_tokens=True
+                )[0]
+                return generated_caption.strip()
         except Exception as e:
             self.logger.error(f"Error captioning image: {e}")
             self.logger.exception(traceback.format_exc())
@@ -789,7 +742,11 @@ class MarkdownPDFExtractor(PDFExtractor):
                 Path(self.output_path) / image_filename
             )  # Convert to Path object
             image.save(image_path, "PNG", optimize=True, quality=95)
-
+
+            img_byte_arr = io.BytesIO()
+            image.save(img_byte_arr)
+            caption = self.caption_image(image, img_byte_arr)
+
             if not caption:
                 caption = (
                     f"{self.pdf_filename}_image_{int(page.number)+1}_{block['number']}"