pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/utils/gotocr_pdf.py
@@ -1,265 +1,265 @@
import argparse
import ast
import os
import re
from datetime import datetime

import cv2
import numpy as np
import torch
from GOT.demo.process_results import punctuation_dict
from GOT.model import *
from GOT.model.plug.blip_process import BlipImageEvalProcessor
from GOT.utils.conversation import SeparatorStyle, conv_templates
from GOT.utils.utils import KeywordsStoppingCriteria, disable_torch_init
from paddle.utils import try_import
from PIL import Image
from transformers import AutoTokenizer

fitz = try_import("fitz")

DEFAULT_IMAGE_TOKEN = "<image>"  # nosec B105 - technical constant, not a password
DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"  # nosec B105 - technical constant, not a password

DEFAULT_IM_START_TOKEN = "<img>"  # nosec B105 - technical constant, not a password
DEFAULT_IM_END_TOKEN = "</img>"  # nosec B105 - technical constant, not a password

translation_table = str.maketrans(punctuation_dict)

parser = argparse.ArgumentParser()

args = argparse.Namespace()
args.model_name = "./GOT_weights/"
args.type = "format"
args.box = ""
args.color = ""

# TODO vary old codes, NEED del
image_processor = BlipImageEvalProcessor(image_size=1024)

image_processor_high = BlipImageEvalProcessor(image_size=1024)

use_im_start_end = True

image_token_len = 256


def covert_pdf_to_image(image_path: str):
    # step1: Convert PDF to images
    imgs = []
    with fitz.open(image_path) as pdf:
        for pg in range(0, pdf.page_count):
            page = pdf[pg]
            mat = fitz.Matrix(4, 4)  # upscale 4x throughout
            pm = page.get_pixmap(matrix=mat, alpha=False)
            # if pm.width > 2000 or pm.height > 2000:
            #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append(img)

    img_name = datetime.now().strftime("%Y%m%d%H%M%S")
    # step2: Process images
    output = "output"
    img_paths = []
    for index, pdf_img in enumerate(imgs):
        # image preprocessing

        gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)

        # binarization
        _, binary_img = cv2.threshold(
            gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
        )

        # denoising
        filtered_img = cv2.medianBlur(binary_img, 3)
        processed_img = filtered_img

        os.makedirs(os.path.join(output, img_name), exist_ok=True)
        pdf_img_path = os.path.join(
            output, img_name, img_name + "_" + str(index) + ".jpg"
        )
        cv2.imwrite(pdf_img_path, processed_img)
        img_paths.append([pdf_img_path, processed_img])

    return img_name


# def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
#     model_name = os.path.expanduser(model_path)
#     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#     model = GOTQwenForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=f'cuda:{gpu_id}',
#                                                use_safetensors=True, pad_token_id=151643).eval()
#     model.to(device=f'cuda:{gpu_id}', dtype=torch.bfloat16)
#     return model, tokenizer


def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
    model_name = os.path.expanduser(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # load the model
    model = GOTQwenForCausalLM.from_pretrained(
        model_name,
        low_cpu_mem_usage=True,
        device_map=f"cuda:{gpu_id}",
        use_safetensors=True,
        pad_token_id=151643,
    ).eval()

    # make sure the model and its tensors are moved to the target device
    device = torch.device(f"cuda:{gpu_id}")
    model.to(device=device, dtype=torch.bfloat16)

    # make sure the tokenizer output also lands on the target device
    tokenizer.model_max_length = 512  # maximum length, adjust as needed
    tokenizer.padding_side = "right"  # padding side, adjust as needed

    return model, tokenizer


def eval_model(file: str, model, tokenizer, gpu_id: int = 6):
    # Model
    # image = load_image(args.image_file)
    image = Image.open(file).convert("RGB")

    w, h = image.size
    # print(image.size)

    disable_torch_init()

    if args.type == "format":
        qs = "OCR with format: "
    else:
        qs = "OCR: "

    if args.box:
        bbox = ast.literal_eval(args.box)
        if len(bbox) == 2:
            bbox[0] = int(bbox[0] / w * 1000)
            bbox[1] = int(bbox[1] / h * 1000)
        if len(bbox) == 4:
            bbox[0] = int(bbox[0] / w * 1000)
            bbox[1] = int(bbox[1] / h * 1000)
            bbox[2] = int(bbox[2] / w * 1000)
            bbox[3] = int(bbox[3] / h * 1000)
        if args.type == "format":
            qs = str(bbox) + " " + "OCR with format: "
        else:
            qs = str(bbox) + " " + "OCR: "

    if args.color:
        if args.type == "format":
            qs = "[" + args.color + "]" + " " + "OCR with format: "
        else:
            qs = "[" + args.color + "]" + " " + "OCR: "

    if use_im_start_end:
        qs = (
            DEFAULT_IM_START_TOKEN
            + DEFAULT_IMAGE_PATCH_TOKEN * image_token_len
            + DEFAULT_IM_END_TOKEN
            + "\n"
            + qs
        )
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    conv_mode = "mpt"
    args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    inputs = tokenizer([prompt])

    # vary old codes, no use
    image_1 = image.copy()
    image_tensor = image_processor(image)

    image_tensor_1 = image_processor_high(image_1)

    input_ids = torch.as_tensor(inputs.input_ids).to(f"cuda:{gpu_id}")

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
    # streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    with torch.autocast(f"cuda:{gpu_id}", dtype=torch.bfloat16):
        output_ids = model.generate(
            input_ids,
            images=[
                (
                    image_tensor.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
                    image_tensor_1.unsqueeze(0).half().to(f"cuda:{gpu_id}"),
                )
            ],
            do_sample=False,
            num_beams=1,
            no_repeat_ngram_size=20,
            # streamer=streamer,
            max_new_tokens=4096,
            stopping_criteria=[stopping_criteria],
        )

    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1] :]).strip()

    if outputs.endswith(stop_str):
        outputs = outputs[: -len(stop_str)]
    outputs = outputs.strip()
    return outputs + "\n"


def sorted_list_by_index(img_name):
    files_with_index = []
    for root, dirs, files in os.walk(f"./output/{img_name}"):
        for file in files:
            file_path = os.path.join(root, file)
            match = re.search(r"_(\d+)(?:\.\w+)?$", file)
            if match:
                index = int(match.group(1))
                files_with_index.append((file_path, index))
    files_with_index.sort(key=lambda x: x[1])
    sorted_files = [file[0] for file in files_with_index]
    return sorted_files


def convert_to_markdown(md_content, pdf_path):
    """write into markdown"""
    file_extension = os.path.splitext(pdf_path)[1].lower()
    output_file = (
        f'./got_output/{os.path.basename(pdf_path).replace(file_extension, ".mmd")}'
    )
    os.makedirs("got_output", exist_ok=True)
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(md_content)


def main(image_list: str, pdf_path: str, model, tokenizer, gpu_id: int = 6):
    res_list = sorted_list_by_index(image_list)

    outputs = ""
    for file_path in res_list:
        outputs += eval_model(
            file=file_path, model=model, tokenizer=tokenizer, gpu_id=gpu_id
        )

    convert_to_markdown(outputs, pdf_path)
    return outputs


def generate_mathpix_markdown(pdf_path: str, model, tokenizer, gpu_id: int = 6):
    image_list = covert_pdf_to_image(pdf_path)
    outputs = main(
        image_list=image_list,
        pdf_path=pdf_path,
        gpu_id=gpu_id,
        model=model,
        tokenizer=tokenizer,
    )
    return outputs
datamax/utils/mineru_operator.py
@@ -1,62 +1,62 @@
import os
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod


class PdfProcessor:
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super(PdfProcessor, cls).__new__(cls)
        return cls._instance

    def process_pdf(self, pdf_file_name):
        name_without_suff = os.path.basename(pdf_file_name).split(".")[0]
        print("Processing PDF: " + name_without_suff)

        local_image_dir = "uploaded_files/images"
        local_md_dir = "uploaded_files/markdown"

        os.makedirs(local_image_dir, exist_ok=True)
        os.makedirs(local_md_dir, exist_ok=True)

        image_writer = FileBasedDataWriter(local_image_dir)
        md_writer = FileBasedDataWriter(local_md_dir)

        reader = FileBasedDataReader("")
        pdf_bytes = reader.read(pdf_file_name)

        # processing pipeline
        ds = PymuDocDataset(pdf_bytes)
        markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md")  # full path
        image_dir = os.path.basename(local_image_dir)  # keep the relative path as "images"

        if ds.classify() == SupportedPdfParseMethod.OCR:
            ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
                md_writer,
                os.path.basename(markdown_path),  # file name part
                image_dir
            )
        else:
            ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
                md_writer,
                os.path.basename(markdown_path),  # file name part
                image_dir
            )

        with open(markdown_path, "r", encoding='utf-8') as f:
            markdown_content = f.read()

        return markdown_content

pdf_processor = PdfProcessor()

# usage example
if __name__ == "__main__":
    # pdf_processor = PdfProcessor()
    print(pdf_processor.process_pdf(
        "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
    ))