pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/utils/gotocr_pdf.py
CHANGED
@@ -1,265 +1,265 @@
|
|
1
|
-
import argparse
|
2
|
-
import ast
|
3
|
-
import os
|
4
|
-
import re
|
5
|
-
from datetime import datetime
|
6
|
-
|
7
|
-
import cv2
|
8
|
-
import numpy as np
|
9
|
-
import torch
|
10
|
-
from GOT.demo.process_results import punctuation_dict
|
11
|
-
from GOT.model import *
|
12
|
-
from GOT.model.plug.blip_process import BlipImageEvalProcessor
|
13
|
-
from GOT.utils.conversation import SeparatorStyle, conv_templates
|
14
|
-
from GOT.utils.utils import KeywordsStoppingCriteria, disable_torch_init
|
15
|
-
from paddle.utils import try_import
|
16
|
-
from PIL import Image
|
17
|
-
from transformers import AutoTokenizer
|
18
|
-
|
19
|
-
fitz = try_import("fitz")
|
20
|
-
|
21
|
-
DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - 这是技术常量,不是密码
|
22
|
-
DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - 这是技术常量,不是密码
|
23
|
-
|
24
|
-
DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - 这是技术常量,不是密码
|
25
|
-
DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - 这是技术常量,不是密码
|
26
|
-
|
27
|
-
translation_table = str.maketrans(punctuation_dict)
|
28
|
-
|
29
|
-
parser = argparse.ArgumentParser()
|
30
|
-
|
31
|
-
args = argparse.Namespace()
|
32
|
-
args.model_name = "./GOT_weights/"
|
33
|
-
args.type = "format"
|
34
|
-
args.box = ""
|
35
|
-
args.color = ""
|
36
|
-
|
37
|
-
# TODO vary old codes, NEED del
|
38
|
-
image_processor = BlipImageEvalProcessor(image_size=1024)
|
39
|
-
|
40
|
-
image_processor_high = BlipImageEvalProcessor(image_size=1024)
|
41
|
-
|
42
|
-
use_im_start_end = True
|
43
|
-
|
44
|
-
image_token_len = 256
|
45
|
-
|
46
|
-
|
47
|
-
def covert_pdf_to_image(image_path: str):
|
48
|
-
# step1: Convert PDF to images
|
49
|
-
imgs = []
|
50
|
-
with fitz.open(image_path) as pdf:
|
51
|
-
for pg in range(0, pdf.page_count):
|
52
|
-
page = pdf[pg]
|
53
|
-
mat = fitz.Matrix(4, 4) # 全程放大四倍
|
54
|
-
pm = page.get_pixmap(matrix=mat, alpha=False)
|
55
|
-
# if pm.width > 2000 or pm.height > 2000:
|
56
|
-
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
57
|
-
|
58
|
-
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
|
59
|
-
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
60
|
-
imgs.append(img)
|
61
|
-
|
62
|
-
img_name = datetime.now().strftime("%Y%m%d%H%M%S")
|
63
|
-
# step2: Process images
|
64
|
-
output = "output"
|
65
|
-
img_paths = []
|
66
|
-
for index, pdf_img in enumerate(imgs):
|
67
|
-
# 图片处理
|
68
|
-
|
69
|
-
gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
|
70
|
-
|
71
|
-
# 二值化处理
|
72
|
-
_, binary_img = cv2.threshold(
|
73
|
-
gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
|
74
|
-
)
|
75
|
-
|
76
|
-
# 去噪
|
77
|
-
filtered_img = cv2.medianBlur(binary_img, 3)
|
78
|
-
processed_img = filtered_img
|
79
|
-
|
80
|
-
os.makedirs(os.path.join(output, img_name), exist_ok=True)
|
81
|
-
pdf_img_path = os.path.join(
|
82
|
-
output, img_name, img_name + "_" + str(index) + ".jpg"
|
83
|
-
)
|
84
|
-
cv2.imwrite(pdf_img_path, processed_img)
|
85
|
-
img_paths.append([pdf_img_path, processed_img])
|
86
|
-
|
87
|
-
return img_name
|
88
|
-
|
89
|
-
|
90
|
-
# def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
|
91
|
-
# model_name = os.path.expanduser(model_path)
|
92
|
-
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
93
|
-
# model = GOTQwenForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=f'cuda:{gpu_id}',
|
94
|
-
# use_safetensors=True, pad_token_id=151643).eval()
|
95
|
-
# model.to(device=f'cuda:{gpu_id}', dtype=torch.bfloat16)
|
96
|
-
# return model, tokenizer
|
97
|
-
|
98
|
-
|
99
|
-
def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
|
100
|
-
model_name = os.path.expanduser(model_path)
|
101
|
-
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
102
|
-
|
103
|
-
# 加载模型
|
104
|
-
model = GOTQwenForCausalLM.from_pretrained(
|
105
|
-
model_name,
|
106
|
-
low_cpu_mem_usage=True,
|
107
|
-
device_map=f"cuda:{gpu_id}",
|
108
|
-
use_safetensors=True,
|
109
|
-
pad_token_id=151643,
|
110
|
-
).eval()
|
111
|
-
|
112
|
-
# 确保模型和张量都移动到目标设备
|
113
|
-
device = torch.device(f"cuda:{gpu_id}")
|
114
|
-
model.to(device=device, dtype=torch.bfloat16)
|
115
|
-
|
116
|
-
# 确保分词器的输出也在目标设备上
|
117
|
-
tokenizer.model_max_length = 512 # 设置最大长度,根据需要调整
|
118
|
-
tokenizer.padding_side = "right" # 设置填充方向,根据需要调整
|
119
|
-
|
120
|
-
return model, tokenizer
|
121
|
-
|
122
|
-
|
123
|
-
def eval_model(file: str, model, tokenizer, gpu_id: int = 6):
|
124
|
-
# Model
|
125
|
-
# image = load_image(args.image_file)
|
126
|
-
image = Image.open(file).convert("RGB")
|
127
|
-
|
128
|
-
w, h = image.size
|
129
|
-
# print(image.size)
|
130
|
-
|
131
|
-
disable_torch_init()
|
132
|
-
|
133
|
-
if args.type == "format":
|
134
|
-
qs = "OCR with format: "
|
135
|
-
else:
|
136
|
-
qs = "OCR: "
|
137
|
-
|
138
|
-
if args.box:
|
139
|
-
bbox = ast.literal_eval(args.box)
|
140
|
-
if len(bbox) == 2:
|
141
|
-
bbox[0] = int(bbox[0] / w * 1000)
|
142
|
-
bbox[1] = int(bbox[1] / h * 1000)
|
143
|
-
if len(bbox) == 4:
|
144
|
-
bbox[0] = int(bbox[0] / w * 1000)
|
145
|
-
bbox[1] = int(bbox[1] / h * 1000)
|
146
|
-
bbox[2] = int(bbox[2] / w * 1000)
|
147
|
-
bbox[3] = int(bbox[3] / h * 1000)
|
148
|
-
if args.type == "format":
|
149
|
-
qs = str(bbox) + " " + "OCR with format: "
|
150
|
-
else:
|
151
|
-
qs = str(bbox) + " " + "OCR: "
|
152
|
-
|
153
|
-
if args.color:
|
154
|
-
if args.type == "format":
|
155
|
-
qs = "[" + args.color + "]" + " " + "OCR with format: "
|
156
|
-
else:
|
157
|
-
qs = "[" + args.color + "]" + " " + "OCR: "
|
158
|
-
|
159
|
-
if use_im_start_end:
|
160
|
-
qs = (
|
161
|
-
DEFAULT_IM_START_TOKEN
|
162
|
-
+ DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
|
163
|
-
+ DEFAULT_IM_END_TOKEN
|
164
|
-
+ "\n"
|
165
|
-
+ qs
|
166
|
-
)
|
167
|
-
else:
|
168
|
-
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
|
169
|
-
|
170
|
-
conv_mode = "mpt"
|
171
|
-
args.conv_mode = conv_mode
|
172
|
-
|
173
|
-
conv = conv_templates[args.conv_mode].copy()
|
174
|
-
conv.append_message(conv.roles[0], qs)
|
175
|
-
conv.append_message(conv.roles[1], None)
|
176
|
-
prompt = conv.get_prompt()
|
177
|
-
|
178
|
-
inputs = tokenizer([prompt])
|
179
|
-
|
180
|
-
# vary old codes, no use
|
181
|
-
image_1 = image.copy()
|
182
|
-
image_tensor = image_processor(image)
|
183
|
-
|
184
|
-
image_tensor_1 = image_processor_high(image_1)
|
185
|
-
|
186
|
-
input_ids = torch.as_tensor(inputs.input_ids).to(f"cuda:{gpu_id}")
|
187
|
-
|
188
|
-
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
|
189
|
-
keywords = [stop_str]
|
190
|
-
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
|
191
|
-
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
192
|
-
|
193
|
-
with torch.autocast(f"cuda:{gpu_id}", dtype=torch.bfloat16):
|
194
|
-
output_ids = model.generate(
|
195
|
-
input_ids,
|
196
|
-
images=[
|
197
|
-
(
|
198
|
-
image_tensor.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
|
199
|
-
image_tensor_1.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
|
200
|
-
)
|
201
|
-
],
|
202
|
-
do_sample=False,
|
203
|
-
num_beams=1,
|
204
|
-
no_repeat_ngram_size=20,
|
205
|
-
# streamer=streamer,
|
206
|
-
max_new_tokens=4096,
|
207
|
-
stopping_criteria=[stopping_criteria],
|
208
|
-
)
|
209
|
-
|
210
|
-
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
|
211
|
-
|
212
|
-
if outputs.endswith(stop_str):
|
213
|
-
outputs = outputs[: -len(stop_str)]
|
214
|
-
outputs = outputs.strip()
|
215
|
-
return outputs + "\n"
|
216
|
-
|
217
|
-
|
218
|
-
def sorted_list_by_index(img_name):
|
219
|
-
files_with_index = []
|
220
|
-
for root, dirs, files in os.walk(f"./output/{img_name}"):
|
221
|
-
for file in files:
|
222
|
-
file_path = os.path.join(root, file)
|
223
|
-
match = re.search(r"_(\d+)(?:\.\w+)?$", file)
|
224
|
-
if match:
|
225
|
-
index = int(match.group(1))
|
226
|
-
files_with_index.append((file_path, index))
|
227
|
-
files_with_index.sort(key=lambda x: x[1])
|
228
|
-
sorted_files = [file[0] for file in files_with_index]
|
229
|
-
return sorted_files
|
230
|
-
|
231
|
-
|
232
|
-
def convert_to_markdown(md_content, pdf_path):
|
233
|
-
"""write into markdown"""
|
234
|
-
file_extension = os.path.splitext(pdf_path)[1].lower()
|
235
|
-
output_file = (
|
236
|
-
f'./got_output/{os.path.basename(pdf_path).replace(file_extension, ".mmd")}'
|
237
|
-
)
|
238
|
-
os.makedirs("got_output", exist_ok=True)
|
239
|
-
with open(output_file, "w", encoding="utf-8") as f:
|
240
|
-
f.write(md_content)
|
241
|
-
|
242
|
-
|
243
|
-
def main(image_list: str, pdf_path: str, model, tokenizer, gpu_id: int = 6):
|
244
|
-
res_list = sorted_list_by_index(image_list)
|
245
|
-
|
246
|
-
outputs = ""
|
247
|
-
for file_path in res_list:
|
248
|
-
outputs += eval_model(
|
249
|
-
file=file_path, model=model, tokenizer=tokenizer, gpu_id=gpu_id
|
250
|
-
)
|
251
|
-
|
252
|
-
convert_to_markdown(outputs, pdf_path)
|
253
|
-
return outputs
|
254
|
-
|
255
|
-
|
256
|
-
def generate_mathpix_markdown(pdf_path: str, model, tokenizer, gpu_id: int = 6):
|
257
|
-
image_list = covert_pdf_to_image(pdf_path)
|
258
|
-
outputs = main(
|
259
|
-
image_list=image_list,
|
260
|
-
pdf_path=pdf_path,
|
261
|
-
gpu_id=gpu_id,
|
262
|
-
model=model,
|
263
|
-
tokenizer=tokenizer,
|
264
|
-
)
|
265
|
-
return outputs
|
1
|
+
import argparse
|
2
|
+
import ast
|
3
|
+
import os
|
4
|
+
import re
|
5
|
+
from datetime import datetime
|
6
|
+
|
7
|
+
import cv2
|
8
|
+
import numpy as np
|
9
|
+
import torch
|
10
|
+
from GOT.demo.process_results import punctuation_dict
|
11
|
+
from GOT.model import *
|
12
|
+
from GOT.model.plug.blip_process import BlipImageEvalProcessor
|
13
|
+
from GOT.utils.conversation import SeparatorStyle, conv_templates
|
14
|
+
from GOT.utils.utils import KeywordsStoppingCriteria, disable_torch_init
|
15
|
+
from paddle.utils import try_import
|
16
|
+
from PIL import Image
|
17
|
+
from transformers import AutoTokenizer
|
18
|
+
|
19
|
+
fitz = try_import("fitz")
|
20
|
+
|
21
|
+
DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - 这是技术常量,不是密码
|
22
|
+
DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - 这是技术常量,不是密码
|
23
|
+
|
24
|
+
DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - 这是技术常量,不是密码
|
25
|
+
DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - 这是技术常量,不是密码
|
26
|
+
|
27
|
+
translation_table = str.maketrans(punctuation_dict)
|
28
|
+
|
29
|
+
parser = argparse.ArgumentParser()
|
30
|
+
|
31
|
+
args = argparse.Namespace()
|
32
|
+
args.model_name = "./GOT_weights/"
|
33
|
+
args.type = "format"
|
34
|
+
args.box = ""
|
35
|
+
args.color = ""
|
36
|
+
|
37
|
+
# TODO vary old codes, NEED del
|
38
|
+
image_processor = BlipImageEvalProcessor(image_size=1024)
|
39
|
+
|
40
|
+
image_processor_high = BlipImageEvalProcessor(image_size=1024)
|
41
|
+
|
42
|
+
use_im_start_end = True
|
43
|
+
|
44
|
+
image_token_len = 256
|
45
|
+
|
46
|
+
|
47
|
+
def covert_pdf_to_image(image_path: str):
|
48
|
+
# step1: Convert PDF to images
|
49
|
+
imgs = []
|
50
|
+
with fitz.open(image_path) as pdf:
|
51
|
+
for pg in range(0, pdf.page_count):
|
52
|
+
page = pdf[pg]
|
53
|
+
mat = fitz.Matrix(4, 4) # 全程放大四倍
|
54
|
+
pm = page.get_pixmap(matrix=mat, alpha=False)
|
55
|
+
# if pm.width > 2000 or pm.height > 2000:
|
56
|
+
# pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
|
57
|
+
|
58
|
+
img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
|
59
|
+
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
|
60
|
+
imgs.append(img)
|
61
|
+
|
62
|
+
img_name = datetime.now().strftime("%Y%m%d%H%M%S")
|
63
|
+
# step2: Process images
|
64
|
+
output = "output"
|
65
|
+
img_paths = []
|
66
|
+
for index, pdf_img in enumerate(imgs):
|
67
|
+
# 图片处理
|
68
|
+
|
69
|
+
gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
|
70
|
+
|
71
|
+
# 二值化处理
|
72
|
+
_, binary_img = cv2.threshold(
|
73
|
+
gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
|
74
|
+
)
|
75
|
+
|
76
|
+
# 去噪
|
77
|
+
filtered_img = cv2.medianBlur(binary_img, 3)
|
78
|
+
processed_img = filtered_img
|
79
|
+
|
80
|
+
os.makedirs(os.path.join(output, img_name), exist_ok=True)
|
81
|
+
pdf_img_path = os.path.join(
|
82
|
+
output, img_name, img_name + "_" + str(index) + ".jpg"
|
83
|
+
)
|
84
|
+
cv2.imwrite(pdf_img_path, processed_img)
|
85
|
+
img_paths.append([pdf_img_path, processed_img])
|
86
|
+
|
87
|
+
return img_name
|
88
|
+
|
89
|
+
|
90
|
+
# def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
|
91
|
+
# model_name = os.path.expanduser(model_path)
|
92
|
+
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
93
|
+
# model = GOTQwenForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=f'cuda:{gpu_id}',
|
94
|
+
# use_safetensors=True, pad_token_id=151643).eval()
|
95
|
+
# model.to(device=f'cuda:{gpu_id}', dtype=torch.bfloat16)
|
96
|
+
# return model, tokenizer
|
97
|
+
|
98
|
+
|
99
|
+
def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
|
100
|
+
model_name = os.path.expanduser(model_path)
|
101
|
+
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
|
102
|
+
|
103
|
+
# 加载模型
|
104
|
+
model = GOTQwenForCausalLM.from_pretrained(
|
105
|
+
model_name,
|
106
|
+
low_cpu_mem_usage=True,
|
107
|
+
device_map=f"cuda:{gpu_id}",
|
108
|
+
use_safetensors=True,
|
109
|
+
pad_token_id=151643,
|
110
|
+
).eval()
|
111
|
+
|
112
|
+
# 确保模型和张量都移动到目标设备
|
113
|
+
device = torch.device(f"cuda:{gpu_id}")
|
114
|
+
model.to(device=device, dtype=torch.bfloat16)
|
115
|
+
|
116
|
+
# 确保分词器的输出也在目标设备上
|
117
|
+
tokenizer.model_max_length = 512 # 设置最大长度,根据需要调整
|
118
|
+
tokenizer.padding_side = "right" # 设置填充方向,根据需要调整
|
119
|
+
|
120
|
+
return model, tokenizer
|
121
|
+
|
122
|
+
|
123
|
+
def eval_model(file: str, model, tokenizer, gpu_id: int = 6):
|
124
|
+
# Model
|
125
|
+
# image = load_image(args.image_file)
|
126
|
+
image = Image.open(file).convert("RGB")
|
127
|
+
|
128
|
+
w, h = image.size
|
129
|
+
# print(image.size)
|
130
|
+
|
131
|
+
disable_torch_init()
|
132
|
+
|
133
|
+
if args.type == "format":
|
134
|
+
qs = "OCR with format: "
|
135
|
+
else:
|
136
|
+
qs = "OCR: "
|
137
|
+
|
138
|
+
if args.box:
|
139
|
+
bbox = ast.literal_eval(args.box)
|
140
|
+
if len(bbox) == 2:
|
141
|
+
bbox[0] = int(bbox[0] / w * 1000)
|
142
|
+
bbox[1] = int(bbox[1] / h * 1000)
|
143
|
+
if len(bbox) == 4:
|
144
|
+
bbox[0] = int(bbox[0] / w * 1000)
|
145
|
+
bbox[1] = int(bbox[1] / h * 1000)
|
146
|
+
bbox[2] = int(bbox[2] / w * 1000)
|
147
|
+
bbox[3] = int(bbox[3] / h * 1000)
|
148
|
+
if args.type == "format":
|
149
|
+
qs = str(bbox) + " " + "OCR with format: "
|
150
|
+
else:
|
151
|
+
qs = str(bbox) + " " + "OCR: "
|
152
|
+
|
153
|
+
if args.color:
|
154
|
+
if args.type == "format":
|
155
|
+
qs = "[" + args.color + "]" + " " + "OCR with format: "
|
156
|
+
else:
|
157
|
+
qs = "[" + args.color + "]" + " " + "OCR: "
|
158
|
+
|
159
|
+
if use_im_start_end:
|
160
|
+
qs = (
|
161
|
+
DEFAULT_IM_START_TOKEN
|
162
|
+
+ DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
|
163
|
+
+ DEFAULT_IM_END_TOKEN
|
164
|
+
+ "\n"
|
165
|
+
+ qs
|
166
|
+
)
|
167
|
+
else:
|
168
|
+
qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
|
169
|
+
|
170
|
+
conv_mode = "mpt"
|
171
|
+
args.conv_mode = conv_mode
|
172
|
+
|
173
|
+
conv = conv_templates[args.conv_mode].copy()
|
174
|
+
conv.append_message(conv.roles[0], qs)
|
175
|
+
conv.append_message(conv.roles[1], None)
|
176
|
+
prompt = conv.get_prompt()
|
177
|
+
|
178
|
+
inputs = tokenizer([prompt])
|
179
|
+
|
180
|
+
# vary old codes, no use
|
181
|
+
image_1 = image.copy()
|
182
|
+
image_tensor = image_processor(image)
|
183
|
+
|
184
|
+
image_tensor_1 = image_processor_high(image_1)
|
185
|
+
|
186
|
+
input_ids = torch.as_tensor(inputs.input_ids).to(f"cuda:{gpu_id}")
|
187
|
+
|
188
|
+
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
|
189
|
+
keywords = [stop_str]
|
190
|
+
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
|
191
|
+
# streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
|
192
|
+
|
193
|
+
with torch.autocast(f"cuda:{gpu_id}", dtype=torch.bfloat16):
|
194
|
+
output_ids = model.generate(
|
195
|
+
input_ids,
|
196
|
+
images=[
|
197
|
+
(
|
198
|
+
image_tensor.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
|
199
|
+
image_tensor_1.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
|
200
|
+
)
|
201
|
+
],
|
202
|
+
do_sample=False,
|
203
|
+
num_beams=1,
|
204
|
+
no_repeat_ngram_size=20,
|
205
|
+
# streamer=streamer,
|
206
|
+
max_new_tokens=4096,
|
207
|
+
stopping_criteria=[stopping_criteria],
|
208
|
+
)
|
209
|
+
|
210
|
+
outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
|
211
|
+
|
212
|
+
if outputs.endswith(stop_str):
|
213
|
+
outputs = outputs[: -len(stop_str)]
|
214
|
+
outputs = outputs.strip()
|
215
|
+
return outputs + "\n"
|
216
|
+
|
217
|
+
|
218
|
+
def sorted_list_by_index(img_name):
|
219
|
+
files_with_index = []
|
220
|
+
for root, dirs, files in os.walk(f"./output/{img_name}"):
|
221
|
+
for file in files:
|
222
|
+
file_path = os.path.join(root, file)
|
223
|
+
match = re.search(r"_(\d+)(?:\.\w+)?$", file)
|
224
|
+
if match:
|
225
|
+
index = int(match.group(1))
|
226
|
+
files_with_index.append((file_path, index))
|
227
|
+
files_with_index.sort(key=lambda x: x[1])
|
228
|
+
sorted_files = [file[0] for file in files_with_index]
|
229
|
+
return sorted_files
|
230
|
+
|
231
|
+
|
232
|
+
def convert_to_markdown(md_content, pdf_path):
|
233
|
+
"""write into markdown"""
|
234
|
+
file_extension = os.path.splitext(pdf_path)[1].lower()
|
235
|
+
output_file = (
|
236
|
+
f'./got_output/{os.path.basename(pdf_path).replace(file_extension, ".mmd")}'
|
237
|
+
)
|
238
|
+
os.makedirs("got_output", exist_ok=True)
|
239
|
+
with open(output_file, "w", encoding="utf-8") as f:
|
240
|
+
f.write(md_content)
|
241
|
+
|
242
|
+
|
243
|
+
def main(image_list: str, pdf_path: str, model, tokenizer, gpu_id: int = 6):
|
244
|
+
res_list = sorted_list_by_index(image_list)
|
245
|
+
|
246
|
+
outputs = ""
|
247
|
+
for file_path in res_list:
|
248
|
+
outputs += eval_model(
|
249
|
+
file=file_path, model=model, tokenizer=tokenizer, gpu_id=gpu_id
|
250
|
+
)
|
251
|
+
|
252
|
+
convert_to_markdown(outputs, pdf_path)
|
253
|
+
return outputs
|
254
|
+
|
255
|
+
|
256
|
+
def generate_mathpix_markdown(pdf_path: str, model, tokenizer, gpu_id: int = 6):
|
257
|
+
image_list = covert_pdf_to_image(pdf_path)
|
258
|
+
outputs = main(
|
259
|
+
image_list=image_list,
|
260
|
+
pdf_path=pdf_path,
|
261
|
+
gpu_id=gpu_id,
|
262
|
+
model=model,
|
263
|
+
tokenizer=tokenizer,
|
264
|
+
)
|
265
|
+
return outputs
|
datamax/utils/mineru_operator.py
CHANGED
@@ -1,62 +1,62 @@
|
|
1
|
-
import os
|
2
|
-
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
3
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
4
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
5
|
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
6
|
-
|
7
|
-
|
8
|
-
class PdfProcessor:
|
9
|
-
_instance = None
|
10
|
-
|
11
|
-
def __new__(cls):
|
12
|
-
if cls._instance is None:
|
13
|
-
cls._instance = super(PdfProcessor, cls).__new__(cls)
|
14
|
-
return cls._instance
|
15
|
-
|
16
|
-
def process_pdf(self, pdf_file_name):
|
17
|
-
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
|
18
|
-
print("Processing PDF: " + name_without_suff)
|
19
|
-
|
20
|
-
local_image_dir = "uploaded_files/images"
|
21
|
-
local_md_dir = "uploaded_files/markdown"
|
22
|
-
|
23
|
-
os.makedirs(local_image_dir, exist_ok=True)
|
24
|
-
os.makedirs(local_md_dir, exist_ok=True)
|
25
|
-
|
26
|
-
image_writer = FileBasedDataWriter(local_image_dir)
|
27
|
-
md_writer = FileBasedDataWriter(local_md_dir)
|
28
|
-
|
29
|
-
reader = FileBasedDataReader("")
|
30
|
-
pdf_bytes = reader.read(pdf_file_name)
|
31
|
-
|
32
|
-
# 处理流程
|
33
|
-
ds = PymuDocDataset(pdf_bytes)
|
34
|
-
markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md") # 完整路径
|
35
|
-
image_dir = os.path.basename(local_image_dir) # 保持相对路径为 "images"
|
36
|
-
|
37
|
-
if ds.classify() == SupportedPdfParseMethod.OCR:
|
38
|
-
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
|
39
|
-
md_writer,
|
40
|
-
os.path.basename(markdown_path), # 文件名部分
|
41
|
-
image_dir
|
42
|
-
)
|
43
|
-
else:
|
44
|
-
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
|
45
|
-
md_writer,
|
46
|
-
os.path.basename(markdown_path), # 文件名部分
|
47
|
-
image_dir
|
48
|
-
)
|
49
|
-
|
50
|
-
with open(markdown_path, "r", encoding='utf-8') as f:
|
51
|
-
markdown_content = f.read()
|
52
|
-
|
53
|
-
return markdown_content
|
54
|
-
|
55
|
-
pdf_processor = PdfProcessor()
|
56
|
-
|
57
|
-
# 使用示例
|
58
|
-
if __name__ == "__main__":
|
59
|
-
# pdf_processor = PdfProcessor()
|
60
|
-
print(pdf_processor.process_pdf(
|
61
|
-
"/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
|
62
|
-
))
|
1
|
+
import os
|
2
|
+
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
|
3
|
+
from magic_pdf.data.dataset import PymuDocDataset
|
4
|
+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
5
|
+
from magic_pdf.config.enums import SupportedPdfParseMethod
|
6
|
+
|
7
|
+
|
8
|
+
class PdfProcessor:
|
9
|
+
_instance = None
|
10
|
+
|
11
|
+
def __new__(cls):
|
12
|
+
if cls._instance is None:
|
13
|
+
cls._instance = super(PdfProcessor, cls).__new__(cls)
|
14
|
+
return cls._instance
|
15
|
+
|
16
|
+
def process_pdf(self, pdf_file_name):
|
17
|
+
name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
|
18
|
+
print("Processing PDF: " + name_without_suff)
|
19
|
+
|
20
|
+
local_image_dir = "uploaded_files/images"
|
21
|
+
local_md_dir = "uploaded_files/markdown"
|
22
|
+
|
23
|
+
os.makedirs(local_image_dir, exist_ok=True)
|
24
|
+
os.makedirs(local_md_dir, exist_ok=True)
|
25
|
+
|
26
|
+
image_writer = FileBasedDataWriter(local_image_dir)
|
27
|
+
md_writer = FileBasedDataWriter(local_md_dir)
|
28
|
+
|
29
|
+
reader = FileBasedDataReader("")
|
30
|
+
pdf_bytes = reader.read(pdf_file_name)
|
31
|
+
|
32
|
+
# 处理流程
|
33
|
+
ds = PymuDocDataset(pdf_bytes)
|
34
|
+
markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md") # 完整路径
|
35
|
+
image_dir = os.path.basename(local_image_dir) # 保持相对路径为 "images"
|
36
|
+
|
37
|
+
if ds.classify() == SupportedPdfParseMethod.OCR:
|
38
|
+
ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
|
39
|
+
md_writer,
|
40
|
+
os.path.basename(markdown_path), # 文件名部分
|
41
|
+
image_dir
|
42
|
+
)
|
43
|
+
else:
|
44
|
+
ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
|
45
|
+
md_writer,
|
46
|
+
os.path.basename(markdown_path), # 文件名部分
|
47
|
+
image_dir
|
48
|
+
)
|
49
|
+
|
50
|
+
with open(markdown_path, "r", encoding='utf-8') as f:
|
51
|
+
markdown_content = f.read()
|
52
|
+
|
53
|
+
return markdown_content
|
54
|
+
|
55
|
+
pdf_processor = PdfProcessor()
|
56
|
+
|
57
|
+
# 使用示例
|
58
|
+
if __name__ == "__main__":
|
59
|
+
# pdf_processor = PdfProcessor()
|
60
|
+
print(pdf_processor.process_pdf(
|
61
|
+
"/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
|
62
|
+
))
|