pydatamax 0.1.5__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,80 @@
+ import subprocess
+ import sys
+ import os
+ import importlib.metadata
+
+ class EnvironmentSetup:
+     """Responsible for setting up the correct environment,
+     including checking GPU support and installing the necessary packages.
+     """
+
+     def __init__(self, use_gpu: bool = False):
+         self._gpu_available = None
+         self._setup_completed = False
+         self.use_gpu = use_gpu  # Use GPU if True, otherwise use CPU
+
+     def is_gpu_available(self):
+         """Check whether the system supports GPUs."""
+         if self._gpu_available is None:
+             try:
+                 # Check whether CUDA is available
+                 subprocess.check_output(['nvcc', '--version'], stderr=subprocess.STDOUT)
+                 self._gpu_available = True
+             except (subprocess.CalledProcessError, FileNotFoundError):
+                 self._gpu_available = False
+         return self._gpu_available
+
+     def is_conda(self):
+         """Check whether the current environment is a Conda environment."""
+         return os.path.exists(os.path.join(sys.prefix, 'conda-meta'))
+
+     def install_package(self, package_name):
+         """Install the given package with conda or pip, depending on the environment."""
+         installer = 'conda' if self.is_conda() else 'pip'
+         if installer == 'conda':
+             print(f"Detected Conda environment. Installing {package_name} with conda.")
+             try:
+                 subprocess.check_call(['conda', 'install', '-y', package_name])
+                 print(f"Successfully installed {package_name} with conda.")
+             except subprocess.CalledProcessError as e:
+                 print(f"Failed to install {package_name} with conda: {e}")
+         elif installer == 'pip':
+             print(f"Using pip to install {package_name}.")
+             try:
+                 # Invoke pip through the current Python interpreter
+                 subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                 print(f"Successfully installed {package_name} with pip.")
+             except subprocess.CalledProcessError as e:
+                 print(f"Failed to install {package_name} with pip: {e}")
+         else:
+             print("Unable to determine the package manager. Please install the package manually.")
+
+     def check_and_install(self):
+         """Check and install the appropriate package based on the user's choice and GPU availability."""
+         if self._setup_completed:
+             return
+
+         # Override GPU detection with the use_gpu parameter
+         if self.use_gpu:
+             pkg_name = 'paddlepaddle-gpu' if self.is_gpu_available() else 'paddlepaddle'
+         else:
+             pkg_name = 'paddlepaddle'
+
+         try:
+             _ = importlib.metadata.version(pkg_name.split()[0])  # Check whether the package is already installed
+             # print(f"{pkg_name} is already installed.")
+         except importlib.metadata.PackageNotFoundError:
+             print(f"{pkg_name} is not installed. Installing now...")
+             self.install_package(pkg_name)
+
+         self._setup_completed = True
+
+
+ # Create an instance of EnvironmentSetup and call check_and_install when the program initializes
+ env_setup = EnvironmentSetup()  # The GPU flag can be overridden via setup_environment()
+
+
+ def setup_environment(use_gpu: bool = False):
+     """Set up the environment when the program starts."""
+     env_setup.use_gpu = use_gpu
+     env_setup.check_and_install()
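
A minimal usage sketch for the module above (the import path is an assumption; the diff does not show where this file lives inside the package):

    # Hypothetical import path for the environment-setup module shown above
    from datamax.utils.env_setup import setup_environment

    # Installs paddlepaddle-gpu when use_gpu=True and nvcc is detected,
    # otherwise falls back to the CPU build of paddlepaddle.
    setup_environment(use_gpu=True)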
@@ -0,0 +1,265 @@
+ import argparse
+ import ast
+ import os
+ import re
+ from datetime import datetime
+
+ import cv2
+ import numpy as np
+ import torch
+ from GOT.demo.process_results import punctuation_dict
+ from GOT.model import *
+ from GOT.model.plug.blip_process import BlipImageEvalProcessor
+ from GOT.utils.conversation import SeparatorStyle, conv_templates
+ from GOT.utils.utils import KeywordsStoppingCriteria, disable_torch_init
+ from paddle.utils import try_import
+ from PIL import Image
+ from transformers import AutoTokenizer
+
+ fitz = try_import("fitz")
+
+ DEFAULT_IMAGE_TOKEN = "<image>"  # nosec B105 - technical constant, not a password
+ DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"  # nosec B105 - technical constant, not a password
+
+ DEFAULT_IM_START_TOKEN = "<img>"  # nosec B105 - technical constant, not a password
+ DEFAULT_IM_END_TOKEN = "</img>"  # nosec B105 - technical constant, not a password
+
+ translation_table = str.maketrans(punctuation_dict)
+
+ parser = argparse.ArgumentParser()
+
+ args = argparse.Namespace()
+ args.model_name = "./GOT_weights/"
+ args.type = "format"
+ args.box = ""
+ args.color = ""
+
+ # TODO: legacy Vary code, needs to be removed
+ image_processor = BlipImageEvalProcessor(image_size=1024)
+
+ image_processor_high = BlipImageEvalProcessor(image_size=1024)
+
+ use_im_start_end = True
+
+ image_token_len = 256
+
+
+ def covert_pdf_to_image(image_path: str):
+     # step1: Convert PDF to images
+     imgs = []
+     with fitz.open(image_path) as pdf:
+         for pg in range(0, pdf.page_count):
+             page = pdf[pg]
+             mat = fitz.Matrix(4, 4)  # upscale 4x throughout
+             pm = page.get_pixmap(matrix=mat, alpha=False)
+             # if pm.width > 2000 or pm.height > 2000:
+             #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+
+             img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
+             img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+             imgs.append(img)
+
+     img_name = datetime.now().strftime("%Y%m%d%H%M%S")
+     # step2: Process images
+     output = "output"
+     img_paths = []
+     for index, pdf_img in enumerate(imgs):
+         # image preprocessing
+
+         gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
+
+         # binarization
+         _, binary_img = cv2.threshold(
+             gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
+         )
+
+         # denoising
+         filtered_img = cv2.medianBlur(binary_img, 3)
+         processed_img = filtered_img
+
+         os.makedirs(os.path.join(output, img_name), exist_ok=True)
+         pdf_img_path = os.path.join(
+             output, img_name, img_name + "_" + str(index) + ".jpg"
+         )
+         cv2.imwrite(pdf_img_path, processed_img)
+         img_paths.append([pdf_img_path, processed_img])
+
+     return img_name
+
+
+ # def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
+ #     model_name = os.path.expanduser(model_path)
+ #     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ #     model = GOTQwenForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=f'cuda:{gpu_id}',
+ #                                                use_safetensors=True, pad_token_id=151643).eval()
+ #     model.to(device=f'cuda:{gpu_id}', dtype=torch.bfloat16)
+ #     return model, tokenizer
+
+
+ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
+     model_name = os.path.expanduser(model_path)
+     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+     # Load the model
+     model = GOTQwenForCausalLM.from_pretrained(
+         model_name,
+         low_cpu_mem_usage=True,
+         device_map=f"cuda:{gpu_id}",
+         use_safetensors=True,
+         pad_token_id=151643,
+     ).eval()
+
+     # Make sure the model and its tensors are moved to the target device
+     device = torch.device(f"cuda:{gpu_id}")
+     model.to(device=device, dtype=torch.bfloat16)
+
+     # Configure the tokenizer (maximum length and padding side; adjust as needed)
+     tokenizer.model_max_length = 512
+     tokenizer.padding_side = "right"
+
+     return model, tokenizer
+
+
+ def eval_model(file: str, model, tokenizer, gpu_id: int = 6):
+     # Model
+     # image = load_image(args.image_file)
+     image = Image.open(file).convert("RGB")
+
+     w, h = image.size
+     # print(image.size)
+
+     disable_torch_init()
+
+     if args.type == "format":
+         qs = "OCR with format: "
+     else:
+         qs = "OCR: "
+
+     if args.box:
+         bbox = ast.literal_eval(args.box)
+         if len(bbox) == 2:
+             bbox[0] = int(bbox[0] / w * 1000)
+             bbox[1] = int(bbox[1] / h * 1000)
+         if len(bbox) == 4:
+             bbox[0] = int(bbox[0] / w * 1000)
+             bbox[1] = int(bbox[1] / h * 1000)
+             bbox[2] = int(bbox[2] / w * 1000)
+             bbox[3] = int(bbox[3] / h * 1000)
+         if args.type == "format":
+             qs = str(bbox) + " " + "OCR with format: "
+         else:
+             qs = str(bbox) + " " + "OCR: "
+
+     if args.color:
+         if args.type == "format":
+             qs = "[" + args.color + "]" + " " + "OCR with format: "
+         else:
+             qs = "[" + args.color + "]" + " " + "OCR: "
+
+     if use_im_start_end:
+         qs = (
+             DEFAULT_IM_START_TOKEN
+             + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
+             + DEFAULT_IM_END_TOKEN
+             + "\n"
+             + qs
+         )
+     else:
+         qs = DEFAULT_IMAGE_TOKEN + "\n" + qs
+
+     conv_mode = "mpt"
+     args.conv_mode = conv_mode
+
+     conv = conv_templates[args.conv_mode].copy()
+     conv.append_message(conv.roles[0], qs)
+     conv.append_message(conv.roles[1], None)
+     prompt = conv.get_prompt()
+
+     inputs = tokenizer([prompt])
+
+     # legacy Vary code, unused
+     image_1 = image.copy()
+     image_tensor = image_processor(image)
+
+     image_tensor_1 = image_processor_high(image_1)
+
+     input_ids = torch.as_tensor(inputs.input_ids).to(f"cuda:{gpu_id}")
+
+     stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
+     keywords = [stop_str]
+     stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
+     # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+
+     with torch.autocast("cuda", dtype=torch.bfloat16):  # autocast takes the bare device type, not "cuda:<id>"
+         output_ids = model.generate(
+             input_ids,
+             images=[
+                 (
+                     image_tensor.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
+                     image_tensor_1.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
+                 )
+             ],
+             do_sample=False,
+             num_beams=1,
+             no_repeat_ngram_size=20,
+             # streamer=streamer,
+             max_new_tokens=4096,
+             stopping_criteria=[stopping_criteria],
+         )
+
+     outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()
+
+     if outputs.endswith(stop_str):
+         outputs = outputs[: -len(stop_str)]
+     outputs = outputs.strip()
+     return outputs + "\n"
+
+
+ def sorted_list_by_index(img_name):
+     files_with_index = []
+     for root, dirs, files in os.walk(f"./output/{img_name}"):
+         for file in files:
+             file_path = os.path.join(root, file)
+             match = re.search(r"_(\d+)(?:\.\w+)?$", file)
+             if match:
+                 index = int(match.group(1))
+                 files_with_index.append((file_path, index))
+     files_with_index.sort(key=lambda x: x[1])
+     sorted_files = [file[0] for file in files_with_index]
+     return sorted_files
+
+
+ def convert_to_markdown(md_content, pdf_path):
+     """Write the OCR output into a Markdown (.mmd) file."""
+     file_extension = os.path.splitext(pdf_path)[1].lower()
+     output_file = (
+         f'./got_output/{os.path.basename(pdf_path).replace(file_extension, ".mmd")}'
+     )
+     os.makedirs("got_output", exist_ok=True)
+     with open(output_file, "w", encoding="utf-8") as f:
+         f.write(md_content)
+
+
+ def main(image_list: str, pdf_path: str, model, tokenizer, gpu_id: int = 6):
+     res_list = sorted_list_by_index(image_list)
+
+     outputs = ""
+     for file_path in res_list:
+         outputs += eval_model(
+             file=file_path, model=model, tokenizer=tokenizer, gpu_id=gpu_id
+         )
+
+     convert_to_markdown(outputs, pdf_path)
+     return outputs
+
+
+ def generate_mathpix_markdown(pdf_path: str, model, tokenizer, gpu_id: int = 6):
+     image_list = covert_pdf_to_image(pdf_path)
+     outputs = main(
+         image_list=image_list,
+         pdf_path=pdf_path,
+         gpu_id=gpu_id,
+         model=model,
+         tokenizer=tokenizer,
+     )
+     return outputs
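
A sketch of how the GOT-OCR pipeline above would be driven end to end; the weight path, PDF name, and GPU index are placeholders rather than values taken from the package:

    # Hypothetical driver, assuming GOT weights have been downloaded to ./GOT_weights/
    model, tokenizer = initialize_model(model_path="./GOT_weights/", gpu_id=0)
    markdown = generate_mathpix_markdown("sample.pdf", model=model, tokenizer=tokenizer, gpu_id=0)
    print(markdown)  # the same text is also written to ./got_output/sample.mmd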
@@ -0,0 +1,62 @@
+ import os
+ from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+ from magic_pdf.data.dataset import PymuDocDataset
+ from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
+ from magic_pdf.config.enums import SupportedPdfParseMethod
+
+
+ class PdfProcessor:
+     _instance = None
+
+     def __new__(cls):
+         if cls._instance is None:
+             cls._instance = super(PdfProcessor, cls).__new__(cls)
+         return cls._instance
+
+     def process_pdf(self, pdf_file_name):
+         name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
+         print("Processing PDF: " + name_without_suff)
+
+         local_image_dir = "uploaded_files/images"
+         local_md_dir = "uploaded_files/markdown"
+
+         os.makedirs(local_image_dir, exist_ok=True)
+         os.makedirs(local_md_dir, exist_ok=True)
+
+         image_writer = FileBasedDataWriter(local_image_dir)
+         md_writer = FileBasedDataWriter(local_md_dir)
+
+         reader = FileBasedDataReader("")
+         pdf_bytes = reader.read(pdf_file_name)
+
+         # Processing pipeline
+         ds = PymuDocDataset(pdf_bytes)
+         markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md")  # full path
+         image_dir = os.path.basename(local_image_dir)  # keep the relative path as "images"
+
+         if ds.classify() == SupportedPdfParseMethod.OCR:
+             ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
+                 md_writer,
+                 os.path.basename(markdown_path),  # file name only
+                 image_dir
+             )
+         else:
+             ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
+                 md_writer,
+                 os.path.basename(markdown_path),  # file name only
+                 image_dir
+             )
+
+         with open(markdown_path, "r", encoding='utf-8') as f:
+             markdown_content = f.read()
+
+         return markdown_content
+
+ pdf_processor = PdfProcessor()
+
+ # Usage example
+ if __name__ == "__main__":
+     # pdf_processor = PdfProcessor()
+     print(pdf_processor.process_pdf(
+         "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
+     ))
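
The __new__ override above makes PdfProcessor a process-wide singleton: repeated constructor calls all return the same cached instance. A small illustration (not part of the package):

    a = PdfProcessor()
    b = PdfProcessor()
    assert a is b  # both names refer to the single cached instance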
@@ -8,6 +8,7 @@ from PIL import Image
  from copy import deepcopy
  from datetime import datetime
 
+ os.environ['KMP_DUPLICATE_LIB_OK']='True'
  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
  sys.path.append(str(ROOT_DIR))
 
@@ -82,7 +83,7 @@ def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gp
          if not os.path.exists(output_files):
              os.makedirs(output_files)
          try:
-             recovery(input_files, output_files, use_gpu, gpu_id)
+             recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
          except Exception as e:
              raise e
      except Exception as e: