magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
"""span维度自定义字段."""
|
2
|
+
# span是否是跨页合并的
|
3
|
+
CROSS_PAGE = 'cross_page'
|
4
|
+
|
5
|
+
"""
|
6
|
+
block维度自定义字段
|
7
|
+
"""
|
8
|
+
# block中lines是否被删除
|
9
|
+
LINES_DELETED = 'lines_deleted'
|
10
|
+
|
11
|
+
# table recognition max time default value
|
12
|
+
TABLE_MAX_TIME_VALUE = 400
|
13
|
+
|
14
|
+
# pp_table_result_max_length
|
15
|
+
TABLE_MAX_LEN = 480
|
16
|
+
|
17
|
+
# table master structure dict
|
18
|
+
TABLE_MASTER_DICT = 'table_master_structure_dict.txt'
|
19
|
+
|
20
|
+
# table master dir
|
21
|
+
TABLE_MASTER_DIR = 'table_structure_tablemaster_infer/'
|
22
|
+
|
23
|
+
# pp detect model dir
|
24
|
+
DETECT_MODEL_DIR = 'ch_PP-OCRv4_det_infer'
|
25
|
+
|
26
|
+
# pp rec model dir
|
27
|
+
REC_MODEL_DIR = 'ch_PP-OCRv4_rec_infer'
|
28
|
+
|
29
|
+
# pp rec char dict path
|
30
|
+
REC_CHAR_DICT = 'ppocr_keys_v1.txt'
|
31
|
+
|
32
|
+
# pp rec copy rec directory
|
33
|
+
PP_REC_DIRECTORY = '.paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer'
|
34
|
+
|
35
|
+
# pp rec copy det directory
|
36
|
+
PP_DET_DIRECTORY = '.paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer'
|
37
|
+
|
38
|
+
|
39
|
+
class MODEL_NAME:
|
40
|
+
# pp table structure algorithm
|
41
|
+
TABLE_MASTER = 'tablemaster'
|
42
|
+
# struct eqtable
|
43
|
+
STRUCT_EQTABLE = 'struct_eqtable'
|
44
|
+
|
45
|
+
DocLayout_YOLO = 'doclayout_yolo'
|
46
|
+
|
47
|
+
LAYOUTLMv3 = 'layoutlmv3'
|
48
|
+
|
49
|
+
YOLO_V8_MFD = 'yolo_v8_mfd'
|
50
|
+
|
51
|
+
UniMerNet_v2_Small = 'unimernet_small'
|
52
|
+
|
53
|
+
RAPID_TABLE = 'rapid_table'
|
@@ -0,0 +1,35 @@
|
|
1
|
+
class DropReason:
|
2
|
+
TEXT_BLCOK_HOR_OVERLAP = 'text_block_horizontal_overlap' # 文字块有水平互相覆盖,导致无法准确定位文字顺序
|
3
|
+
USEFUL_BLOCK_HOR_OVERLAP = (
|
4
|
+
'useful_block_horizontal_overlap' # 需保留的block水平覆盖
|
5
|
+
)
|
6
|
+
COMPLICATED_LAYOUT = 'complicated_layout' # 复杂的布局,暂时不支持
|
7
|
+
TOO_MANY_LAYOUT_COLUMNS = 'too_many_layout_columns' # 目前不支持分栏超过2列的
|
8
|
+
COLOR_BACKGROUND_TEXT_BOX = 'color_background_text_box' # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
|
9
|
+
HIGH_COMPUTATIONAL_lOAD_BY_IMGS = (
|
10
|
+
'high_computational_load_by_imgs' # 含特殊图片,计算量太大,从而丢弃
|
11
|
+
)
|
12
|
+
HIGH_COMPUTATIONAL_lOAD_BY_SVGS = (
|
13
|
+
'high_computational_load_by_svgs' # 特殊的SVG图,计算量太大,从而丢弃
|
14
|
+
)
|
15
|
+
HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES = 'high_computational_load_by_total_pages' # 计算量超过负荷,当前方法下计算量消耗过大
|
16
|
+
MISS_DOC_LAYOUT_RESULT = 'missing doc_layout_result' # 版面分析失败
|
17
|
+
Exception = '_exception' # 解析中发生异常
|
18
|
+
ENCRYPTED = 'encrypted' # PDF是加密的
|
19
|
+
EMPTY_PDF = 'total_page=0' # PDF页面总数为0
|
20
|
+
NOT_IS_TEXT_PDF = 'not_is_text_pdf' # 不是文字版PDF,无法直接解析
|
21
|
+
DENSE_SINGLE_LINE_BLOCK = 'dense_single_line_block' # 无法清晰的分段
|
22
|
+
TITLE_DETECTION_FAILED = 'title_detection_failed' # 探测标题失败
|
23
|
+
TITLE_LEVEL_FAILED = (
|
24
|
+
'title_level_failed' # 分析标题级别失败(例如一级、二级、三级标题)
|
25
|
+
)
|
26
|
+
PARA_SPLIT_FAILED = 'para_split_failed' # 识别段落失败
|
27
|
+
PARA_MERGE_FAILED = 'para_merge_failed' # 段落合并失败
|
28
|
+
NOT_ALLOW_LANGUAGE = 'not_allow_language' # 不支持的语种
|
29
|
+
SPECIAL_PDF = 'special_pdf'
|
30
|
+
PSEUDO_SINGLE_COLUMN = 'pseudo_single_column' # 无法精确判断文字分栏
|
31
|
+
CAN_NOT_DETECT_PAGE_LAYOUT = 'can_not_detect_page_layout' # 无法分析页面的版面
|
32
|
+
NEGATIVE_BBOX_AREA = 'negative_bbox_area' # 缩放导致 bbox 面积为负
|
33
|
+
OVERLAP_BLOCKS_CAN_NOT_SEPARATION = (
|
34
|
+
'overlap_blocks_can_t_separation' # 无法分离重叠的block
|
35
|
+
)
|
@@ -0,0 +1,19 @@
|
|
1
|
+
|
2
|
+
COLOR_BG_HEADER_TXT_BLOCK = 'color_background_header_txt_block'
|
3
|
+
PAGE_NO = 'page-no' # 页码
|
4
|
+
CONTENT_IN_FOOT_OR_HEADER = 'in-foot-header-area' # 页眉页脚内的文本
|
5
|
+
VERTICAL_TEXT = 'vertical-text' # 垂直文本
|
6
|
+
ROTATE_TEXT = 'rotate-text' # 旋转文本
|
7
|
+
EMPTY_SIDE_BLOCK = 'empty-side-block' # 边缘上的空白没有任何内容的block
|
8
|
+
ON_IMAGE_TEXT = 'on-image-text' # 文本在图片上
|
9
|
+
ON_TABLE_TEXT = 'on-table-text' # 文本在表格上
|
10
|
+
|
11
|
+
|
12
|
+
class DropTag:
|
13
|
+
PAGE_NUMBER = 'page_no'
|
14
|
+
HEADER = 'header'
|
15
|
+
FOOTER = 'footer'
|
16
|
+
FOOTNOTE = 'footnote'
|
17
|
+
NOT_IN_LAYOUT = 'not_in_layout'
|
18
|
+
SPAN_OVERLAP = 'span_overlap'
|
19
|
+
BLOCK_OVERLAP = 'block_overlap'
|
magic_pdf/data/read_api.py
CHANGED
@@ -35,7 +35,7 @@ def read_jsonl(
|
|
35
35
|
jsonl_d = [
|
36
36
|
json.loads(line) for line in jsonl_bits.decode().split('\n') if line.strip()
|
37
37
|
]
|
38
|
-
for d in jsonl_d
|
38
|
+
for d in jsonl_d:
|
39
39
|
pdf_path = d.get('file_location', '') or d.get('path', '')
|
40
40
|
if len(pdf_path) == 0:
|
41
41
|
raise EmptyData('pdf file location is empty')
|