magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +12 -12
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +6 -6
- magic_pdf/libs/draw_bbox.py +13 -6
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +142 -351
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
- magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
- magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
- magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
- magic_pdf/model/sub_modules/model_init.py +149 -0
- magic_pdf/model/sub_modules/model_utils.py +51 -0
- magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
- magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
- magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
- magic_pdf/model/sub_modules/table/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
- magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
- magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
- magic_pdf/model/sub_modules/table/table_utils.py +11 -0
- magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
- magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +121 -66
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +253 -50
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/resources/model_config/model_configs.yaml +2 -1
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +70 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
- magic_pdf-0.10.0.dist-info/RECORD +198 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -53
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/model/pek_sub_modules/post_process.py +0 -36
- magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf-0.9.2.dist-info/RECORD +0 -178
- /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
- /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
- /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
"""span维度自定义字段."""
|
2
|
+
# span是否是跨页合并的
|
3
|
+
CROSS_PAGE = 'cross_page'
|
4
|
+
|
5
|
+
"""
|
6
|
+
block维度自定义字段
|
7
|
+
"""
|
8
|
+
# block中lines是否被删除
|
9
|
+
LINES_DELETED = 'lines_deleted'
|
10
|
+
|
11
|
+
# table recognition max time default value
|
12
|
+
TABLE_MAX_TIME_VALUE = 400
|
13
|
+
|
14
|
+
# pp_table_result_max_length
|
15
|
+
TABLE_MAX_LEN = 480
|
16
|
+
|
17
|
+
# table master structure dict
|
18
|
+
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
|
19
|
+
|
20
|
+
# table master dir
|
21
|
+
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
|
22
|
+
|
23
|
+
# pp detect model dir
|
24
|
+
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
|
25
|
+
|
26
|
+
# pp rec model dir
|
27
|
+
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
|
28
|
+
|
29
|
+
# pp rec char dict path
|
30
|
+
REC_CHAR_DICT = 'ppocr_keys_v1.txt'
|
31
|
+
|
32
|
+
# pp rec copy rec directory
|
33
|
+
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
|
34
|
+
|
35
|
+
# pp rec copy det directory
|
36
|
+
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
|
37
|
+
|
38
|
+
|
39
|
+
class MODEL_NAME:
|
40
|
+
# pp table structure algorithm
|
41
|
+
TABLE_MASTER = 'tablemaster'
|
42
|
+
# struct eqtable
|
43
|
+
STRUCT_EQTABLE = 'struct_eqtable'
|
44
|
+
|
45
|
+
DocLayout_YOLO = 'doclayout_yolo'
|
46
|
+
|
47
|
+
LAYOUTLMv3 = 'layoutlmv3'
|
48
|
+
|
49
|
+
YOLO_V8_MFD = 'yolo_v8_mfd'
|
50
|
+
|
51
|
+
UniMerNet_v2_Small = 'unimernet_small'
|
52
|
+
|
53
|
+
RAPID_TABLE = 'rapid_table'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class DropReason:
|
2
|
+
TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap' # 文字块有水平互相覆盖,导致无法准确定位文字顺序
|
3
|
+
USEFUL_BLOCK_HOR_OVERLAP = (
|
4
|
+
'useful_block_horizontal_overlap' # 需保留的block水平覆盖
|
5
|
+
)
|
6
|
+
COMPLICATED_LAYOUT = 'complicated_layout' # 复杂的布局,暂时不支持
|
7
|
+
TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns' # 目前不支持分栏超过2列的
|
8
|
+
COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box' # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
|
9
|
+
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
|
10
|
+
'high_computational_load_by_imgs' # 含特殊图片,计算量太大,从而丢弃
|
11
|
+
)
|
12
|
+
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
|
13
|
+
'high_computational_load_by_svgs' # 特殊的SVG图,计算量太大,从而丢弃
|
14
|
+
)
|
15
|
+
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages' # 计算量超过负荷,当前方法下计算量消耗过大
|
16
|
+
MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result' # 版面分析失败
|
17
|
+
Exception = '_exception' # 解析中发生异常
|
18
|
+
ENCRYPTED = 'encrypted' # PDF是加密的
|
19
|
+
EMPTY_PDF = 'total_page=0' # PDF页面总数为0
|
20
|
+
NOT_IS_TEXT_PDF = 'not_is_text_pdf' # 不是文字版PDF,无法直接解析
|
21
|
+
DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block' # 无法清晰的分段
|
22
|
+
TITLE_DETECTION_FAILED = 'title_detection_failed' # 探测标题失败
|
23
|
+
TITLE_LEVEL_FAILED = (
|
24
|
+
'title_level_failed' # 分析标题级别失败(例如一级、二级、三级标题)
|
25
|
+
)
|
26
|
+
PARA_SPLIT_FAILED = 'para_split_failed' # 识别段落失败
|
27
|
+
PARA_MERGE_FAILED = 'para_merge_failed' # 段落合并失败
|
28
|
+
NOT_ALLOW_LANGUAGE = 'not_allow_language' # 不支持的语种
|
29
|
+
SPECIAL_PDF = 'special_pdf'
|
30
|
+
PSEUDO_SINGLE_COLUMN = 'pseudo_single_column' # 无法精确判断文字分栏
|
31
|
+
CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout' # 无法分析页面的版面
|
32
|
+
NEGATIVE_BBOX_AREA = 'negative_bbox_area' # 缩放导致 bbox 面积为负
|
33
|
+
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
|
34
|
+
'overlap_blocks_can_t_separation' # 无法分离重叠的block
|
35
|
+
)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
|
3
|
+
PAGE_NO = 'page-no' # 页码
|
4
|
+
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
|
5
|
+
VERTICAL_TEXT = 'vertical-text' # 垂直文本
|
6
|
+
ROTATE_TEXT = 'rotate-text' # 旋转文本
|
7
|
+
EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
|
8
|
+
ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
|
9
|
+
ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
|
10
|
+
|
11
|
+
|
12
|
+
class DropTag:
|
13
|
+
PAGE_NUMBER = 'page_no'
|
14
|
+
HEADER = 'header'
|
15
|
+
FOOTER = 'footer'
|
16
|
+
FOOTNOTE = 'footnote'
|
17
|
+
NOT_IN_LAYOUT = 'not_in_layout'
|
18
|
+
SPAN_OVERLAP = 'span_overlap'
|
19
|
+
BLOCK_OVERLAP = 'block_overlap'
|
magic_pdf/data/read_api.py
CHANGED
@@ -35,7 +35,7 @@ def read_jsonl(
|
|
35
35
|
jsonl_d = [
|
36
36
|
json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
|
37
37
|
]
|
38
|
-
for d in jsonl_d
|
38
|
+
for d in jsonl_d:
|
39
39
|
pdf_path = d.get('file_location', '') or d.get('path', '')
|
40
40
|
if len(pdf_path) == 0:
|
41
41
|
raise EmptyData('pdf file location is empty')
|