magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
magic_pdf/pipe/UNIPipe.py
CHANGED
@@ -2,22 +2,21 @@ import json
|
|
2
2
|
|
3
3
|
from loguru import logger
|
4
4
|
|
5
|
-
from magic_pdf.
|
6
|
-
from magic_pdf.
|
7
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
8
|
-
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
|
5
|
+
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
|
+
from magic_pdf.data.data_reader_writer import DataWriter
|
9
7
|
from magic_pdf.libs.commons import join_path
|
8
|
+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
10
9
|
from magic_pdf.pipe.AbsPipe import AbsPipe
|
11
|
-
from magic_pdf.user_api import
|
10
|
+
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
|
12
11
|
|
13
12
|
|
14
13
|
class UNIPipe(AbsPipe):
|
15
14
|
|
16
|
-
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer:
|
15
|
+
def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
|
17
16
|
start_page_id=0, end_page_id=None, lang=None,
|
18
17
|
layout_model=None, formula_enable=None, table_enable=None):
|
19
|
-
self.pdf_type = jso_useful_key[
|
20
|
-
super().__init__(pdf_bytes, jso_useful_key[
|
18
|
+
self.pdf_type = jso_useful_key['_pdf_type']
|
19
|
+
super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
|
21
20
|
lang, layout_model, formula_enable, table_enable)
|
22
21
|
if len(self.model_list) == 0:
|
23
22
|
self.input_model_is_empty = True
|
@@ -54,27 +53,28 @@ class UNIPipe(AbsPipe):
|
|
54
53
|
|
55
54
|
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
|
56
55
|
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
57
|
-
logger.info(
|
56
|
+
logger.info('uni_pipe mk content list finished')
|
58
57
|
return result
|
59
58
|
|
60
59
|
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
61
60
|
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
62
|
-
logger.info(f
|
61
|
+
logger.info(f'uni_pipe mk {md_make_mode} finished')
|
63
62
|
return result
|
64
63
|
|
65
64
|
|
66
65
|
if __name__ == '__main__':
|
67
66
|
# 测试
|
68
|
-
|
67
|
+
from magic_pdf.data.data_reader_writer import DataReader
|
68
|
+
drw = DataReader(r'D:/project/20231108code-clean')
|
69
69
|
|
70
|
-
pdf_file_path = r
|
71
|
-
model_file_path = r
|
72
|
-
pdf_bytes = drw.read(pdf_file_path
|
73
|
-
model_json_txt = drw.read(model_file_path
|
70
|
+
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
71
|
+
model_file_path = r'linshixuqiu\19983-00.json'
|
72
|
+
pdf_bytes = drw.read(pdf_file_path)
|
73
|
+
model_json_txt = drw.read(model_file_path).decode()
|
74
74
|
model_list = json.loads(model_json_txt)
|
75
|
-
write_path = r
|
76
|
-
img_bucket_path =
|
77
|
-
img_writer =
|
75
|
+
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
76
|
+
img_bucket_path = 'imgs'
|
77
|
+
img_writer = DataWriter(join_path(write_path, img_bucket_path))
|
78
78
|
|
79
79
|
# pdf_type = UNIPipe.classify(pdf_bytes)
|
80
80
|
# jso_useful_key = {
|
@@ -83,8 +83,8 @@ if __name__ == '__main__':
|
|
83
83
|
# }
|
84
84
|
|
85
85
|
jso_useful_key = {
|
86
|
-
|
87
|
-
|
86
|
+
'_pdf_type': '',
|
87
|
+
'model_list': model_list
|
88
88
|
}
|
89
89
|
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
|
90
90
|
pipe.pipe_classify()
|
@@ -92,8 +92,7 @@ if __name__ == '__main__':
|
|
92
92
|
md_content = pipe.pipe_mk_markdown(img_bucket_path)
|
93
93
|
content_list = pipe.pipe_mk_uni_format(img_bucket_path)
|
94
94
|
|
95
|
-
md_writer =
|
96
|
-
md_writer.
|
97
|
-
md_writer.
|
98
|
-
|
99
|
-
md_writer.write(str(content_list), "19983-00.txt", AbsReaderWriter.MODE_TXT)
|
95
|
+
md_writer = DataWriter(write_path)
|
96
|
+
md_writer.write_string('19983-00.md', md_content)
|
97
|
+
md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
|
98
|
+
md_writer.write_string('19983-00.txt', str(content_list))
|
@@ -1,19 +1,17 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
|
+
from magic_pdf.config.drop_reason import DropReason
|
3
4
|
from magic_pdf.layout.layout_sort import get_columns_cnt_of_layout
|
4
|
-
from magic_pdf.libs.drop_reason import DropReason
|
5
5
|
|
6
6
|
|
7
7
|
def __is_pseudo_single_column(page_info) -> bool:
|
8
|
-
"""
|
9
|
-
判断一个页面是否伪单列。
|
8
|
+
"""判断一个页面是否伪单列。
|
10
9
|
|
11
10
|
Args:
|
12
11
|
page_info (dict): 页面信息字典,包括'_layout_tree'和'preproc_blocks'。
|
13
12
|
|
14
13
|
Returns:
|
15
14
|
Tuple[bool, Optional[str]]: 如果页面伪单列返回(True, extra_info),否则返回(False, None)。
|
16
|
-
|
17
15
|
"""
|
18
16
|
layout_tree = page_info['_layout_tree']
|
19
17
|
layout_column_width = get_columns_cnt_of_layout(layout_tree)
|
@@ -41,27 +39,22 @@ def __is_pseudo_single_column(page_info) -> bool:
|
|
41
39
|
if num_lines > 20:
|
42
40
|
radio = num_satisfying_lines / num_lines
|
43
41
|
if radio >= 0.5:
|
44
|
-
extra_info = f
|
42
|
+
extra_info = f'{{num_lines: {num_lines}, num_satisfying_lines: {num_satisfying_lines}}}'
|
45
43
|
block_text = []
|
46
44
|
for line in lines:
|
47
45
|
if line['spans']:
|
48
46
|
for span in line['spans']:
|
49
47
|
block_text.append(span['text'])
|
50
|
-
logger.warning(f
|
48
|
+
logger.warning(f'pseudo_single_column block_text: {block_text}')
|
51
49
|
return True, extra_info
|
52
50
|
|
53
51
|
return False, None
|
54
52
|
|
55
53
|
|
56
54
|
def pdf_post_filter(page_info) -> tuple:
|
57
|
-
"""
|
58
|
-
return:(True|False, err_msg)
|
59
|
-
True, 如果pdf符合要求
|
60
|
-
False, 如果pdf不符合要求
|
61
|
-
|
62
|
-
"""
|
55
|
+
"""return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
|
63
56
|
bool_is_pseudo_single_column, extra_info = __is_pseudo_single_column(page_info)
|
64
57
|
if bool_is_pseudo_single_column:
|
65
|
-
return False, {
|
58
|
+
return False, {'_need_drop': True, '_drop_reason': DropReason.PSEUDO_SINGLE_COLUMN, 'extra_info': extra_info}
|
66
59
|
|
67
|
-
return True, None
|
60
|
+
return True, None
|
magic_pdf/pre_proc/cut_image.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from loguru import logger
|
2
2
|
|
3
|
+
from magic_pdf.config.ocr_content_type import ContentType
|
3
4
|
from magic_pdf.libs.commons import join_path
|
4
|
-
from magic_pdf.libs.ocr_content_type import ContentType
|
5
5
|
from magic_pdf.libs.pdf_image_tools import cut_image
|
6
6
|
|
7
7
|
|
@@ -29,9 +29,7 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
|
|
29
29
|
image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
|
30
30
|
equation_inline_bboxes: list,
|
31
31
|
equation_interline_bboxes: list, imageWriter) -> dict:
|
32
|
-
"""
|
33
|
-
返回一个dict, key为bbox, 值是图片地址
|
34
|
-
"""
|
32
|
+
"""返回一个dict, key为bbox, 值是图片地址."""
|
35
33
|
image_info = []
|
36
34
|
image_backup_info = []
|
37
35
|
table_info = []
|
@@ -46,26 +44,26 @@ def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
|
|
46
44
|
for bbox in image_bboxes:
|
47
45
|
if not check_img_bbox(bbox):
|
48
46
|
continue
|
49
|
-
image_path = cut_image(bbox, page_num, page, return_path(
|
50
|
-
image_info.append({
|
47
|
+
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
48
|
+
image_info.append({'bbox': bbox, 'image_path': image_path})
|
51
49
|
|
52
50
|
for bbox in images_overlap_backup:
|
53
51
|
if not check_img_bbox(bbox):
|
54
52
|
continue
|
55
|
-
image_path = cut_image(bbox, page_num, page, return_path(
|
56
|
-
image_backup_info.append({
|
53
|
+
image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
|
54
|
+
image_backup_info.append({'bbox': bbox, 'image_path': image_path})
|
57
55
|
|
58
56
|
for bbox in table_bboxes:
|
59
57
|
if not check_img_bbox(bbox):
|
60
58
|
continue
|
61
|
-
image_path = cut_image(bbox, page_num, page, return_path(
|
62
|
-
table_info.append({
|
59
|
+
image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
|
60
|
+
table_info.append({'bbox': bbox, 'image_path': image_path})
|
63
61
|
|
64
62
|
return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
|
65
63
|
|
66
64
|
|
67
65
|
def check_img_bbox(bbox) -> bool:
|
68
66
|
if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
|
69
|
-
logger.warning(f
|
67
|
+
logger.warning(f'image_bboxes: 错误的box, {bbox}')
|
70
68
|
return False
|
71
69
|
return True
|