pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +84 -72
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/gotocr_pdf.py
CHANGED
@@ -18,11 +18,11 @@ from transformers import AutoTokenizer
 
 fitz = try_import("fitz")
 
-DEFAULT_IMAGE_TOKEN = "<image>"  # nosec B105 -
-DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"  # nosec B105 -
+DEFAULT_IMAGE_TOKEN = "<image>"  # nosec B105 - technical const,not a passward
+DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>"  # nosec B105 - technical const,not a passward
 
-DEFAULT_IM_START_TOKEN = "<img>"  # nosec B105 -
-DEFAULT_IM_END_TOKEN = "</img>"  # nosec B105 -
+DEFAULT_IM_START_TOKEN = "<img>"  # nosec B105 - technical const,not a passward
+DEFAULT_IM_END_TOKEN = "</img>"  # nosec B105 - technical const,not a passward
 
 translation_table = str.maketrans(punctuation_dict)
 
@@ -50,7 +50,7 @@ def covert_pdf_to_image(image_path: str):
     with fitz.open(image_path) as pdf:
         for pg in range(0, pdf.page_count):
             page = pdf[pg]
-            mat = fitz.Matrix(4, 4)  #
+            mat = fitz.Matrix(4, 4)  # Magnify by four times throughout the process
             pm = page.get_pixmap(matrix=mat, alpha=False)
             # if pm.width > 2000 or pm.height > 2000:
             #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
@@ -64,16 +64,16 @@ def covert_pdf_to_image(image_path: str):
     output = "output"
     img_paths = []
     for index, pdf_img in enumerate(imgs):
-        #
+        # img processing
 
         gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
 
-        #
+        # Binarization processing
        _, binary_img = cv2.threshold(
            gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
        )
 
-        #
+        # denoise
        filtered_img = cv2.medianBlur(binary_img, 3)
        processed_img = filtered_img
 
@@ -100,7 +100,7 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
     model_name = os.path.expanduser(model_path)
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-    #
+    # load model
     model = GOTQwenForCausalLM.from_pretrained(
         model_name,
         low_cpu_mem_usage=True,
@@ -109,13 +109,13 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
         pad_token_id=151643,
     ).eval()
 
-    #
+    # Ensure that both the model and the tensor are moved to the target device.
     device = torch.device(f"cuda:{gpu_id}")
     model.to(device=device, dtype=torch.bfloat16)
 
-    #
-    tokenizer.model_max_length = 512  #
-    tokenizer.padding_side = "right"  #
+    # Ensure that the output of the tokenizer is also on the target device.
+    tokenizer.model_max_length = 512  # maxlength,adjust to need
+    tokenizer.padding_side = "right"  # padding side,adjust to need
 
     return model, tokenizer
 
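The comments added above describe the preprocessing that covert_pdf_to_image applies before OCR: each page is rendered at 4x zoom, converted to grayscale, binarized with Otsu's threshold, and denoised with a median filter. A minimal standalone sketch of that pipeline, assuming PyMuPDF and OpenCV are installed (the helper name and paths are illustrative, not part of the package):

```python
import cv2
import fitz  # PyMuPDF
import numpy as np


def preprocess_pdf_page(pdf_path: str, page_no: int = 0) -> np.ndarray:
    """Rasterize one PDF page at 4x zoom and clean it up for OCR (illustrative sketch)."""
    with fitz.open(pdf_path) as pdf:
        page = pdf[page_no]
        mat = fitz.Matrix(4, 4)  # magnify 4x in both directions, as in the diff above
        pm = page.get_pixmap(matrix=mat, alpha=False)
        img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, pm.n)

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)  # pixmap samples are RGB
    _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    return cv2.medianBlur(binary, 3)  # remove salt-and-pepper noise
```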
datamax/utils/lifecycle_types.py
ADDED
@@ -0,0 +1,18 @@
+from enum import Enum
+
+
+class LifeType(Enum):
+    # Data processing stage
+    DATA_PROCESSING = "DATA_PROCESSING"  # processing in progress
+    DATA_PROCESSED = "DATA_PROCESSED"  # processing finished
+    DATA_PROCESS_FAILED = "DATA_PROCESS_FAILED"  # processing failed
+
+    # Data cleaning stage
+    DATA_CLEANING = "DATA_CLEANING"  # cleaning in progress
+    DATA_CLEANED = "DATA_CLEANED"  # cleaning finished
+    DATA_CLEAN_FAILED = "DATA_CLEAN_FAILED"  # cleaning failed
+
+    # Data labelling stage
+    DATA_LABELLING = "DATA_LABELLING"  # labelling in progress
+    DATA_LABELLED = "DATA_LABELLED"  # labelling finished
+    DATA_LABEL_FAILED = "DATA_LABEL_FAILED"  # labelling failed
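The new LifeType enum gives the toolkit a shared vocabulary for lifecycle states across processing, cleaning, and labelling. A minimal usage sketch (the record_event helper is hypothetical, not part of the package):

```python
from datamax.utils.lifecycle_types import LifeType


def record_event(stage: LifeType) -> dict:
    # Hypothetical helper: wrap a lifecycle stage into an event payload.
    return {"life_type": stage.value}


print(record_event(LifeType.DATA_PROCESSING))      # {'life_type': 'DATA_PROCESSING'}
print(record_event(LifeType.DATA_PROCESS_FAILED))  # {'life_type': 'DATA_PROCESS_FAILED'}
```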
datamax/utils/mineru_operator.py
CHANGED
@@ -1,8 +1,9 @@
 import os
-
+
+from magic_pdf.config.enums import SupportedPdfParseMethod
+from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
 from magic_pdf.data.dataset import PymuDocDataset
 from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
-from magic_pdf.config.enums import SupportedPdfParseMethod
 
 
 class PdfProcessor:
@@ -31,32 +32,33 @@ class PdfProcessor:
 
         # processing pipeline
         ds = PymuDocDataset(pdf_bytes)
-        markdown_path = os.path.join(
-
+        markdown_path = os.path.join(
+            local_md_dir, f"{name_without_suff}.md"
+        )  # absolute path
+        image_dir = os.path.basename(local_image_dir)  # keep relative path as "images"
 
         if ds.classify() == SupportedPdfParseMethod.OCR:
             ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
-                md_writer,
-                os.path.basename(markdown_path),  # filename part
-                image_dir
+                md_writer, os.path.basename(markdown_path), image_dir  # filename
             )
         else:
             ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
-                md_writer,
-                os.path.basename(markdown_path),  # filename part
-                image_dir
+                md_writer, os.path.basename(markdown_path), image_dir  # filename
             )
 
-        with open(markdown_path, "r", encoding=
+        with open(markdown_path, "r", encoding="utf-8") as f:
             markdown_content = f.read()
 
         return markdown_content
 
+
 pdf_processor = PdfProcessor()
 
-#
+# usage example
 if __name__ == "__main__":
     # pdf_processor = PdfProcessor()
-    print(
-
-
+    print(
+        pdf_processor.process_pdf(
+            "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
+        )
+    )
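The __main__ block above hard-codes an environment-specific path; per that usage, process_pdf takes a PDF path and returns the Markdown produced by the MinerU pipeline. A minimal call sketch (the input filename is a placeholder):

```python
from datamax.utils.mineru_operator import pdf_processor

# "sample.pdf" is a placeholder path; process_pdf returns the extracted Markdown string.
markdown = pdf_processor.process_pdf("sample.pdf")
print(markdown[:200])
```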
datamax/utils/paddleocr_pdf_operator.py
CHANGED
@@ -1,22 +1,24 @@
 """layout_recovery"""
+
 import os
 import pathlib
 import sys
+from copy import deepcopy
+from datetime import datetime
+
 import cv2
 import numpy as np
 from PIL import Image
-from copy import deepcopy
-from datetime import datetime
 
-os.environ[
+os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
 sys.path.append(str(ROOT_DIR))
 
 from paddle.utils import try_import
 from paddleocr import PPStructure, save_structure_res
-
-
-from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
+
+sys.path.append("/usr/local/lib/python3.10/dist-packages/paddleocr")
+from ppstructure.recovery.recovery_to_doc import convert_info_docx, sorted_layout_boxes
 
 
 def recovery(img_path, output, use_gpu, gpu_id):
@@ -48,20 +50,24 @@ def recovery(img_path, output, use_gpu, gpu_id):
     img_paths = []
     for index, pdf_img in enumerate(imgs):
         os.makedirs(os.path.join(output, img_name), exist_ok=True)
-        pdf_img_path = os.path.join(
+        pdf_img_path = os.path.join(
+            output, img_name, img_name + "_" + str(index) + ".jpg"
+        )
         cv2.imwrite(pdf_img_path, pdf_img)
         img_paths.append([pdf_img_path, pdf_img])
 
     # step3: Convert images to DOCX
     all_res = []
-    engine = PPStructure(
-
-
-
-
-
-
-
+    engine = PPStructure(
+        recovery=True,
+        use_gpu=use_gpu,
+        gpu_id=gpu_id,
+        det_model_dir=f"{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer",
+        rec_model_dir=f"{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer",
+        table_model_dir=f"{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer",
+        layout_model_dir=f"{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer",
+        formula_model_dir=f"{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer",
+    )
     for index, (new_img_path, imgs) in enumerate(img_paths):
         print("processing {}/{} page:".format(index + 1, len(img_paths)))
         result = engine(imgs, img_idx=index)
@@ -72,18 +78,27 @@ def recovery(img_path, output, use_gpu, gpu_id):
         all_res += result_sorted
     try:
         convert_info_docx(imgs, all_res, output, img_name)
-        os.rename(
-
+        os.rename(
+            f"./output/{img_name}_ocr.docx",
+            f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx',
+        )
    except Exception as e:
        raise e
 
 
-def use_paddleocr(
+def use_paddleocr(
+    input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6
+):
     try:
         if not os.path.exists(output_files):
             os.makedirs(output_files)
         try:
-            recovery(
+            recovery(
+                img_path=input_files,
+                output=output_files,
+                use_gpu=use_gpu,
+                gpu_id=gpu_id,
+            )
         except Exception as e:
             raise e
     except Exception as e:
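The reformatted use_paddleocr entry point keeps the same behaviour: it renders the PDF to per-page JPEGs, runs PP-Structure layout recovery, and renames the result to a _ocr.docx under ./output/. A minimal call sketch based on the signature shown above (paths are illustrative):

```python
from datamax.utils.paddleocr_pdf_operator import use_paddleocr

# Placeholder paths; per the os.rename in the diff, the recovered document
# ends up as ./output/sample_ocr.docx, with intermediate page images under output/.
use_paddleocr(input_files="sample.pdf", output_files="output", use_gpu=False)
```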
datamax/utils/ppt_extract.py
CHANGED
@@ -1,17 +1,18 @@
 import os
-from loguru import logger
 from functools import lru_cache
-from typing import List, Dict, Union
 from pathlib import Path
+from typing import Dict, List, Union
+
+from loguru import logger
 from PIL.Image import Image
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE
 from pptx.shapes.base import BaseShape as Shape
-from pptx.shapes.picture import Picture
 from pptx.shapes.graphfrm import GraphicFrame
 from pptx.shapes.group import GroupShape
+from pptx.shapes.picture import Picture
 from pptx.slide import Slide
-from pptx.table import Table,
+from pptx.table import Table, _Cell, _Row
 from werkzeug.datastructures import FileStorage
 
 
@@ -25,7 +26,15 @@ class PPtExtractor:
             raise ValueError("img_name must be a string")
         return f"media/{id}/{img_name}"
 
-    def handle_shape(
+    def handle_shape(
+        self,
+        shape: Shape,
+        content_list: List[Dict[str, str]],
+        media_dir: Path,
+        img_map: Dict[Path, str],
+        id: str,
+        skip_image: bool,
+    ):
         if not isinstance(shape, Shape):
             raise ValueError("Invalid shape object")
         if not isinstance(content_list, list):
@@ -53,7 +62,9 @@ class PPtExtractor:
                 shape: Picture
                 image: Image = shape.image
                 image_bytes = image.blob
-                img_path = media_dir.resolve().joinpath(
+                img_path = media_dir.resolve().joinpath(
+                    f"pic-{len(img_map)}.{image.ext}"
+                )
                 if not media_dir.exists():
                     media_dir.mkdir(parents=True, exist_ok=True)
                 if not os.access(media_dir, os.W_OK):
@@ -76,14 +87,18 @@ class PPtExtractor:
                     md += "\n|"
                     for col in row.cells:
                         cell: _Cell = col
-                        md +=
+                        md += (
+                            " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
+                        )
                     md += "\n"
                 md += "\n"
                 content_list.append({"type": "md", "data": md})
             elif shape_type == MSO_SHAPE_TYPE.GROUP:
                 shape: GroupShape
                 for sub_shape in shape.shapes:
-                    self.handle_shape(
+                    self.handle_shape(
+                        sub_shape, content_list, media_dir, img_map, id, skip_image
+                    )
             else:
                 logger.info(f"Unknown shape type: {shape_type}, {type(shape)}")
         except PermissionError as pe:
@@ -93,8 +108,14 @@ class PPtExtractor:
         except Exception as e:
             logger.error(f"Error handling shape: {e}")
 
-    def extract(
-
+    def extract(
+        self,
+        presentation_source: Union[FileStorage, Path],
+        id: str,
+        dir: Path,
+        media_dir: Path,
+        skip_image: bool,
+    ):
         if not isinstance(presentation_source, (FileStorage, Path)):
             raise ValueError("presentation_source must be a FileStorage or Path object")
         if not isinstance(id, str):
@@ -115,7 +136,9 @@ class PPtExtractor:
                 slide: Slide
                 page = {"page_no": page_no, "content_list": []}
                 for shape in slide.shapes:
-                    self.handle_shape(
+                    self.handle_shape(
+                        shape, page["content_list"], media_dir, img_map, id, skip_image
+                    )
                 pages.append(page)
         except FileNotFoundError as fnfe:
             logger.error(f"File not found: {fnfe}")
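The reformatted signatures above spell out the full parameter lists for handle_shape and extract. A minimal call sketch for the public extract entry point, based on those signatures (paths and the id are placeholders, and the return value is assumed to be the per-slide page list built in the loop above):

```python
from pathlib import Path

from datamax.utils.ppt_extract import PPtExtractor

# Placeholder inputs; each page built by extract() is a dict with "page_no" and "content_list".
pages = PPtExtractor().extract(
    presentation_source=Path("deck.pptx"),
    id="demo",
    dir=Path("out"),
    media_dir=Path("out/media"),
    skip_image=False,
)
for page in pages:  # assumes extract() returns the collected pages
    print(page["page_no"], len(page["content_list"]))
```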