pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/OssHandler.py +85 -51
- datamax/parser/__init__.py +1 -1
- datamax/parser/base.py +2 -2
- datamax/parser/core.py +205 -31
- datamax/parser/doc_parser.py +2 -5
- datamax/parser/docx_parser.py +3 -6
- datamax/parser/epub_parser.py +2 -5
- datamax/parser/html_parser.py +2 -5
- datamax/parser/image_parser.py +18 -14
- datamax/parser/md_parser.py +67 -4
- datamax/parser/pdf_parser.py +59 -20
- datamax/parser/ppt_parser.py +3 -5
- datamax/parser/pptx_parser.py +10 -13
- datamax/parser/txt_parser.py +2 -5
- datamax/parser/xls_parser.py +26 -0
- datamax/parser/xlsx_parser.py +65 -4
- datamax/utils/__init__.py +1 -0
- datamax/utils/constants.py +58 -0
- datamax/utils/data_cleaner.py +45 -28
- datamax/utils/env_setup.py +80 -0
- datamax/utils/gotocr_pdf.py +265 -0
- datamax/utils/mineru_operator.py +62 -0
- datamax/utils/paddleocr_pdf_operator.py +2 -1
- datamax/utils/qa_generator.py +376 -0
- datamax/utils/tokenizer.py +1 -1
- pydatamax-0.1.12.dist-info/METADATA +281 -0
- pydatamax-0.1.12.dist-info/RECORD +39 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info}/WHEEL +1 -1
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info/licenses}/LICENSE +0 -0
- {pydatamax-0.1.5.dist-info → pydatamax-0.1.12.dist-info}/top_level.txt +1 -0
- tests/__init__.py +0 -0
- tests/test_basic.py +20 -0
- pydatamax-0.1.5.dist-info/METADATA +0 -282
- pydatamax-0.1.5.dist-info/RECORD +0 -31
@@ -0,0 +1,80 @@
+import subprocess
+import sys
+import os
+import importlib.metadata
+
+class EnvironmentSetup:
+    """Responsible for setting up the correct environment,
+    including checking GPU support and installing the necessary packages
+    """
+
+    def __init__(self, use_gpu: bool = False):
+        self._gpu_available = None
+        self._setup_completed = False
+        self.use_gpu = use_gpu  # Use GPU if True, otherwise use CPU
+
+    def is_gpu_available(self):
+        """Check whether the system supports GPUs"""
+        if self._gpu_available is None:
+            try:
+                # Check whether CUDA is available
+                subprocess.check_output(['nvcc', '--version'], stderr=subprocess.STDOUT)
+                self._gpu_available = True
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                self._gpu_available = False
+        return self._gpu_available
+
+    def is_conda(self):
+        """Check whether the current environment is a Conda environment"""
+        return os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+
+    def install_package(self, package_name):
+        """Install the specified package with pip or conda, depending on the environment"""
+        installer = 'conda' if self.is_conda() else 'pip'
+        if installer == 'conda':
+            print(f"Detected Conda environment. Installing {package_name} with conda.")
+            try:
+                subprocess.check_call(['pip', 'install', package_name])
+                print(f"Successfully installed {package_name} with conda.")
+            except subprocess.CalledProcessError as e:
+                print(f"Failed to install {package_name} with conda: {e}")
+        elif installer == 'pip':
+            print(f"Using pip to install {package_name}.")
+            try:
+                # Invoke pip through the current Python interpreter
+                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                print(f"Successfully installed {package_name} with pip.")
+            except subprocess.CalledProcessError as e:
+                print(f"Failed to install {package_name} with pip: {e}")
+        else:
+            print("Unable to determine the package manager. Please install the package manually.")
+
+    def check_and_install(self):
+        """Check and install the appropriate package based on the user's choice and GPU availability"""
+        if self._setup_completed:
+            return
+
+        # Override GPU detection with the use_gpu parameter
+        if self.use_gpu:
+            pkg_name = 'paddlepaddle-gpu' if self.is_gpu_available() else 'paddlepaddle'
+        else:
+            pkg_name = 'paddlepaddle'
+
+        try:
+            _ = importlib.metadata.version(pkg_name.split()[0])  # Check if paddlepaddle is installed
+            # print(f"{pkg_name} version {1} is already installed.")
+        except importlib.metadata.PackageNotFoundError:
+            print(f"{pkg_name} is not installed. Installing now...")
+            self.install_package(pkg_name)
+
+        self._setup_completed = True
+
+
+# Create an instance of EnvironmentSetup with the desired GPU option and call check_and_install when the program initializes
+env_setup = EnvironmentSetup()  # Set this flag as needed
+
+
+def setup_environment(use_gpu: bool = False):
+    """Set up the environment when the program starts"""
+    env_setup.use_gpu = use_gpu
+    env_setup.check_and_install()
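For orientation, a minimal usage sketch (not part of the diff) of the new `datamax/utils/env_setup.py` module, assuming it is importable from an installed package:

```python
from datamax.utils.env_setup import setup_environment

setup_environment(use_gpu=True)   # installs paddlepaddle-gpu only when nvcc is on PATH
setup_environment(use_gpu=False)  # no-op: check_and_install returns early once _setup_completed is True
```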
@@ -0,0 +1,265 @@
+import argparse
+import ast
+import os
+import re
+from datetime import datetime
+
+import cv2
+import numpy as np
+import torch
+from GOT.demo.process_results import punctuation_dict
+from GOT.model import *
+from GOT.model.plug.blip_process import BlipImageEvalProcessor
+from GOT.utils.conversation import SeparatorStyle, conv_templates
+from GOT.utils.utils import KeywordsStoppingCriteria, disable_torch_init
+from paddle.utils import try_import
+from PIL import Image
+from transformers import AutoTokenizer
+
+fitz = try_import("fitz")
+
+DEFAULT_IMAGE_TOKEN = "<image>"  # nosec B105 - technical constant, not a password
+DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"  # nosec B105 - technical constant, not a password
+
+DEFAULT_IM_START_TOKEN = "<img>"  # nosec B105 - technical constant, not a password
+DEFAULT_IM_END_TOKEN = "</img>"  # nosec B105 - technical constant, not a password
+
+translation_table = str.maketrans(punctuation_dict)
+
+parser = argparse.ArgumentParser()
+
+args = argparse.Namespace()
+args.model_name = "./GOT_weights/"
+args.type = "format"
+args.box = ""
+args.color = ""
+
+# TODO vary old codes, NEED del
+image_processor = BlipImageEvalProcessor(image_size=1024)
+
+image_processor_high = BlipImageEvalProcessor(image_size=1024)
+
+use_im_start_end = True
+
+image_token_len = 256
+
+
+def covert_pdf_to_image(image_path: str):
+    # step1: Convert PDF to images
+    imgs = []
+    with fitz.open(image_path) as pdf:
+        for pg in range(0, pdf.page_count):
+            page = pdf[pg]
+            mat = fitz.Matrix(4, 4)  # upscale all pages 4x
+            pm = page.get_pixmap(matrix=mat, alpha=False)
+            # if pm.width > 2000 or pm.height > 2000:
+            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+            imgs.append(img)
+
+    img_name = datetime.now().strftime("%Y%m%d%H%M%S")
+    # step2: Process images
+    output = "output"
+    img_paths = []
+    for index, pdf_img in enumerate(imgs):
+        # image processing
+
+        gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
+
+        # binarization
+        _, binary_img = cv2.threshold(
+            gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
+        )
+
+        # denoising
+        filtered_img = cv2.medianBlur(binary_img, 3)
+        processed_img = filtered_img
+
+        os.makedirs(os.path.join(output, img_name), exist_ok=True)
+        pdf_img_path = os.path.join(
+            output, img_name, img_name + "_" + str(index) + ".jpg"
+        )
+        cv2.imwrite(pdf_img_path, processed_img)
+        img_paths.append([pdf_img_path, processed_img])
+
+    return img_name
+
+
+# def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
+#     model_name = os.path.expanduser(model_path)
+#     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+#     model = GOTQwenForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=f'cuda:{gpu_id}',
+#                                                use_safetensors=True, pad_token_id=151643).eval()
+#     model.to(device=f'cuda:{gpu_id}', dtype=torch.bfloat16)
+#     return model, tokenizer
+
+
+def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
+    model_name = os.path.expanduser(model_path)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    # load the model
+    model = GOTQwenForCausalLM.from_pretrained(
+        model_name,
+        low_cpu_mem_usage=True,
+        device_map=f"cuda:{gpu_id}",
+        use_safetensors=True,
+        pad_token_id=151643,
+    ).eval()
+
+    # make sure the model and its tensors are moved to the target device
+    device = torch.device(f"cuda:{gpu_id}")
+    model.to(device=device, dtype=torch.bfloat16)
+
+    # make sure the tokenizer's output also lands on the target device
+    tokenizer.model_max_length = 512  # maximum length; adjust as needed
+    tokenizer.padding_side = "right"  # padding side; adjust as needed
+
+    return model, tokenizer
+
+
+def eval_model(file: str, model, tokenizer, gpu_id: int = 6):
+    # Model
+    # image = load_image(args.image_file)
+    image = Image.open(file).convert("RGB")
+
+    w, h = image.size
+    # print(image.size)
+
+    disable_torch_init()
+
+    if args.type == "format":
+        qs = "OCR with format: "
+    else:
+        qs = "OCR: "
+
+    if args.box:
+        bbox = ast.literal_eval(args.box)
+        if len(bbox) == 2:
+            bbox[0] = int(bbox[0] / w * 1000)
+            bbox[1] = int(bbox[1] / h * 1000)
+        if len(bbox) == 4:
+            bbox[0] = int(bbox[0] / w * 1000)
+            bbox[1] = int(bbox[1] / h * 1000)
+            bbox[2] = int(bbox[2] / w * 1000)
+            bbox[3] = int(bbox[3] / h * 1000)
+        if args.type == "format":
+            qs = str(bbox) + " " + "OCR with format: "
+        else:
+            qs = str(bbox) + " " + "OCR: "
+
+    if args.color:
+        if args.type == "format":
+            qs = "[" + args.color + "]" + " " + "OCR with format: "
+        else:
+            qs = "[" + args.color + "]" + " " + "OCR: "
+
+    if use_im_start_end:
+        qs = (
+            DEFAULT_IM_START_TOKEN
+            + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
+            + DEFAULT_IM_END_TOKEN
+            + "\n"
+            + qs
+        )
+    else:
+        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+
+    conv_mode = "mpt"
+    args.conv_mode = conv_mode
+
+    conv = conv_templates[args.conv_mode].copy()
+    conv.append_message(conv.roles[0], qs)
+    conv.append_message(conv.roles[1], None)
+    prompt = conv.get_prompt()
+
+    inputs = tokenizer([prompt])
+
+    # vary old codes, no use
+    image_1 = image.copy()
+    image_tensor = image_processor(image)
+
+    image_tensor_1 = image_processor_high(image_1)
+
+    input_ids = torch.as_tensor(inputs.input_ids).to(f"cuda:{gpu_id}")
+
+    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+    keywords = [stop_str]
+    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+    with torch.autocast(f"cuda:{gpu_id}", dtype=torch.bfloat16):
+        output_ids = model.generate(
+            input_ids,
+            images=[
+                (
+                    image_tensor.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
+                    image_tensor_1.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
+                )
+            ],
+            do_sample=False,
+            num_beams=1,
+            no_repeat_ngram_size=20,
+            # streamer=streamer,
+            max_new_tokens=4096,
+            stopping_criteria=[stopping_criteria],
+        )
+
+    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
+
+    if outputs.endswith(stop_str):
+        outputs = outputs[: -len(stop_str)]
+    outputs = outputs.strip()
+    return outputs + "\n"
+
+
+def sorted_list_by_index(img_name):
+    files_with_index = []
+    for root, dirs, files in os.walk(f"./output/{img_name}"):
+        for file in files:
+            file_path = os.path.join(root, file)
+            match = re.search(r"_(\d+)(?:\.\w+)?$", file)
+            if match:
+                index = int(match.group(1))
+                files_with_index.append((file_path, index))
+    files_with_index.sort(key=lambda x: x[1])
+    sorted_files = [file[0] for file in files_with_index]
+    return sorted_files
+
+
+def convert_to_markdown(md_content, pdf_path):
+    """write into markdown"""
+    file_extension = os.path.splitext(pdf_path)[1].lower()
+    output_file = (
+        f'./got_output/{os.path.basename(pdf_path).replace(file_extension, ".mmd")}'
+    )
+    os.makedirs("got_output", exist_ok=True)
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(md_content)
+
+
+def main(image_list: str, pdf_path: str, model, tokenizer, gpu_id: int = 6):
+    res_list = sorted_list_by_index(image_list)
+
+    outputs = ""
+    for file_path in res_list:
+        outputs += eval_model(
+            file=file_path, model=model, tokenizer=tokenizer, gpu_id=gpu_id
+        )
+
+    convert_to_markdown(outputs, pdf_path)
+    return outputs
+
+
+def generate_mathpix_markdown(pdf_path: str, model, tokenizer, gpu_id: int = 6):
+    image_list = covert_pdf_to_image(pdf_path)
+    outputs = main(
+        image_list=image_list,
+        pdf_path=pdf_path,
+        gpu_id=gpu_id,
+        model=model,
+        tokenizer=tokenizer,
+    )
+    return outputs
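A hedged end-to-end sketch (not part of the diff) of how the new `datamax/utils/gotocr_pdf.py` module fits together. It assumes GOT weights have been downloaded locally and a CUDA device exists at the given index; `sample.pdf` is a placeholder:

```python
from datamax.utils.gotocr_pdf import initialize_model, generate_mathpix_markdown

model, tokenizer = initialize_model(model_path="./GOT_weights/", gpu_id=0)
# Renders each page to ./output/<timestamp>/, OCRs the pages in index order,
# and writes the combined markdown to ./got_output/sample.mmd.
markdown = generate_mathpix_markdown("sample.pdf", model=model, tokenizer=tokenizer, gpu_id=0)
```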
@@ -0,0 +1,62 @@
+import os
+from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+from magic_pdf.data.dataset import PymuDocDataset
+from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+from magic_pdf.config.enums import SupportedPdfParseMethod
+
+
+class PdfProcessor:
+    _instance = None
+
+    def __new__(cls):
+        if cls._instance is None:
+            cls._instance = super(PdfProcessor, cls).__new__(cls)
+        return cls._instance
+
+    def process_pdf(self, pdf_file_name):
+        name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
+        print("Processing PDF: " + name_without_suff)
+
+        local_image_dir = "uploaded_files/images"
+        local_md_dir = "uploaded_files/markdown"
+
+        os.makedirs(local_image_dir, exist_ok=True)
+        os.makedirs(local_md_dir, exist_ok=True)
+
+        image_writer = FileBasedDataWriter(local_image_dir)
+        md_writer = FileBasedDataWriter(local_md_dir)
+
+        reader = FileBasedDataReader("")
+        pdf_bytes = reader.read(pdf_file_name)
+
+        # processing pipeline
+        ds = PymuDocDataset(pdf_bytes)
+        markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md")  # full path
+        image_dir = os.path.basename(local_image_dir)  # keep the relative path as "images"
+
+        if ds.classify() == SupportedPdfParseMethod.OCR:
+            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+                md_writer,
+                os.path.basename(markdown_path),  # file-name part
+                image_dir
+            )
+        else:
+            ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
+                md_writer,
+                os.path.basename(markdown_path),  # file-name part
+                image_dir
+            )
+
+        with open(markdown_path, "r", encoding='utf-8') as f:
+            markdown_content = f.read()
+
+        return markdown_content
+
+pdf_processor = PdfProcessor()
+
+# usage example
+if __name__ == "__main__":
+    # pdf_processor = PdfProcessor()
+    print(pdf_processor.process_pdf(
+        "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
+    ))
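A minimal sketch (not part of the diff) of the singleton exposed by the new `datamax/utils/mineru_operator.py`; `sample.pdf` is a placeholder path:

```python
from datamax.utils.mineru_operator import PdfProcessor, pdf_processor

assert PdfProcessor() is pdf_processor  # __new__ always hands back the same _instance
md = pdf_processor.process_pdf("sample.pdf")  # markdown lands in uploaded_files/markdown/sample.md
print(md[:200])
```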
@@ -8,6 +8,7 @@ from PIL import Image
 from copy import deepcopy
 from datetime import datetime

+os.environ['KMP_DUPLICATE_LIB_OK']='True'
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
 sys.path.append(str(ROOT_DIR))

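The added `KMP_DUPLICATE_LIB_OK` line is the usual workaround for the `OMP: Error #15: Initializing libiomp5` abort that can occur when PaddlePaddle and PyTorch each load their own Intel OpenMP runtime. A sketch (not part of the diff) of why placement matters: the variable must be set before the conflicting imports run.

```python
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'  # must precede the imports that load libiomp5

import paddle  # noqa: E402 - deliberate late import; the env var above must already be set
import torch   # noqa: E402
```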
@@ -82,7 +83,7 @@ def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gp
         if not os.path.exists(output_files):
             os.makedirs(output_files)
         try:
-            recovery(input_files, output_files, use_gpu, gpu_id)
+            recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
         except Exception as e:
             raise e
     except Exception as e:
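The second hunk switches the `recovery(...)` call to keyword arguments. A sketch of why that matters; the parameter names match the call in the diff, but the stand-in function and its defaults below are illustrative assumptions, not PaddleOCR's actual signature:

```python
# Hypothetical stand-in for the recovery() helper, to illustrate the binding:
def recovery(img_path, output, use_gpu=False, gpu_id=0):
    ...

recovery("in.pdf", "out/", True, 0)                                 # positional: misbinds silently if the order changes
recovery(img_path="in.pdf", output="out/", use_gpu=True, gpu_id=0)  # keyword: each value is bound explicitly
```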