magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -17
- magic_pdf/libs/commons.py +0 -161
- magic_pdf/libs/draw_bbox.py +2 -3
- magic_pdf/libs/markdown_utils.py +0 -21
- magic_pdf/libs/pdf_image_tools.py +2 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
- magic_pdf/model/magic_model.py +0 -30
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
- magic_pdf/para/para_split_v3.py +7 -2
- magic_pdf/pdf_parse_union_core_v2.py +97 -124
- magic_pdf/pre_proc/construct_page_dict.py +0 -55
- magic_pdf/pre_proc/cut_image.py +0 -37
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
- magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
- magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
- magic_pdf/rw/S3ReaderWriter.py +1 -1
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
- magic_pdf/dict2md/mkcontent.py +0 -438
- magic_pdf/layout/__init__.py +0 -0
- magic_pdf/layout/bbox_sort.py +0 -681
- magic_pdf/layout/layout_det_utils.py +0 -182
- magic_pdf/layout/layout_sort.py +0 -921
- magic_pdf/layout/layout_spiler_recog.py +0 -101
- magic_pdf/layout/mcol_sort.py +0 -336
- magic_pdf/libs/calc_span_stats.py +0 -239
- magic_pdf/libs/detect_language_from_model.py +0 -21
- magic_pdf/libs/nlp_utils.py +0 -203
- magic_pdf/libs/textbase.py +0 -33
- magic_pdf/libs/vis_utils.py +0 -308
- magic_pdf/para/block_continuation_processor.py +0 -562
- magic_pdf/para/block_termination_processor.py +0 -480
- magic_pdf/para/commons.py +0 -222
- magic_pdf/para/denoise.py +0 -246
- magic_pdf/para/draw.py +0 -121
- magic_pdf/para/exceptions.py +0 -198
- magic_pdf/para/layout_match_processor.py +0 -40
- magic_pdf/para/para_split.py +0 -807
- magic_pdf/para/para_split_v2.py +0 -959
- magic_pdf/para/raw_processor.py +0 -207
- magic_pdf/para/stats.py +0 -268
- magic_pdf/para/title_processor.py +0 -1014
- magic_pdf/pdf_parse_union_core.py +0 -345
- magic_pdf/post_proc/__init__.py +0 -0
- magic_pdf/post_proc/detect_para.py +0 -3472
- magic_pdf/post_proc/pdf_post_filter.py +0 -60
- magic_pdf/post_proc/remove_footnote.py +0 -153
- magic_pdf/pre_proc/citationmarker_remove.py +0 -161
- magic_pdf/pre_proc/detect_equation.py +0 -134
- magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
- magic_pdf/pre_proc/detect_footnote.py +0 -170
- magic_pdf/pre_proc/detect_header.py +0 -64
- magic_pdf/pre_proc/detect_images.py +0 -647
- magic_pdf/pre_proc/detect_page_number.py +0 -64
- magic_pdf/pre_proc/detect_tables.py +0 -62
- magic_pdf/pre_proc/equations_replace.py +0 -550
- magic_pdf/pre_proc/fix_image.py +0 -244
- magic_pdf/pre_proc/fix_table.py +0 -270
- magic_pdf/pre_proc/main_text_font.py +0 -23
- magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
- magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
- magic_pdf/pre_proc/post_layout_split.py +0 -0
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
- magic_pdf/pre_proc/remove_footer_header.py +0 -114
- magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
- magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
- magic_pdf/pre_proc/solve_line_alien.py +0 -29
- magic_pdf/pre_proc/statistics.py +0 -12
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
2
|
magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
|
3
3
|
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
-
magic_pdf/
|
5
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
|
4
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=-4yJwcSMcGwQKJhmK_MbBMa-fexzkqeD1CQHWpzGC3I,29920
|
6
5
|
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
7
6
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
7
|
magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
|
@@ -20,7 +19,7 @@ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
|
20
19
|
magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
|
21
20
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
22
21
|
magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
|
23
|
-
magic_pdf/data/data_reader_writer/filebase.py,sha256=
|
22
|
+
magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
|
24
23
|
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
|
25
24
|
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
26
25
|
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
@@ -28,48 +27,36 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
|
|
28
27
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
29
28
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
30
29
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
31
|
-
magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
|
32
30
|
magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
|
33
31
|
magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
34
32
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
35
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
33
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=3I-t3PSrQUZ3PZAPl_NGoEhxLmIUE9Fpc0jueEXP7Xw,17381
|
36
34
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
37
35
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
38
36
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
39
37
|
magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
|
40
38
|
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
41
|
-
magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
|
43
|
-
magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
|
44
|
-
magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
|
45
|
-
magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
|
46
|
-
magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
|
47
39
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
48
40
|
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
49
|
-
magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
|
50
41
|
magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
|
51
|
-
magic_pdf/libs/commons.py,sha256=
|
42
|
+
magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
|
52
43
|
magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
|
53
44
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
54
45
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
55
|
-
magic_pdf/libs/
|
56
|
-
magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
|
46
|
+
magic_pdf/libs/draw_bbox.py,sha256=2IXr4TUxm0-pXYIPkNaELWo9pOysZC6etpqzTE5eg-w,17588
|
57
47
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
58
48
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
59
49
|
magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
|
60
50
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
61
|
-
magic_pdf/libs/markdown_utils.py,sha256=
|
62
|
-
magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
|
51
|
+
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
63
52
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
64
53
|
magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
|
65
|
-
magic_pdf/libs/pdf_image_tools.py,sha256=
|
54
|
+
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
66
55
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
67
|
-
magic_pdf/libs/
|
68
|
-
magic_pdf/libs/version.py,sha256=v4zmKjsKOPZbp6BrWoz7iK4ST0sdZdUh9bQSJmluZ5o,23
|
69
|
-
magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
|
56
|
+
magic_pdf/libs/version.py,sha256=A_AARqtxTOj_AQTpjpgOxNx-UOBio5wYFfZ2mrdMKfs,23
|
70
57
|
magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
|
71
|
-
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=
|
72
|
-
magic_pdf/model/magic_model.py,sha256=
|
58
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
|
59
|
+
magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
|
73
60
|
magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
|
74
61
|
magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
|
75
62
|
magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
|
@@ -107,8 +94,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
|
|
107
94
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
95
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
96
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
111
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
97
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=bya-KGr5OPCmE8KC8K5Pp6OlGigCmUmB9xpm59nExaM,9056
|
98
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=Deoth86bltlLz1Y-1jpyhLCwCaRfq-KKI0tiFyKKqA8,7268
|
112
99
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
113
100
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
101
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -123,64 +110,26 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
|
|
123
110
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
124
111
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
|
125
112
|
magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
126
|
-
magic_pdf/para/
|
127
|
-
magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
|
128
|
-
magic_pdf/para/commons.py,sha256=VdJ8SY9qJTtcRyx8HH-PFeZSJwL4Tsf50197RD_-dwc,5414
|
129
|
-
magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,10443
|
130
|
-
magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
|
131
|
-
magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
|
132
|
-
magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
|
133
|
-
magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
|
134
|
-
magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
|
135
|
-
magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
|
136
|
-
magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
|
137
|
-
magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
|
138
|
-
magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
|
113
|
+
magic_pdf/para/para_split_v3.py,sha256=x6nfjyt38W-wdxXjo6Chd18eiqLzmhbTNyGHhBQcEHs,16459
|
139
114
|
magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
|
140
115
|
magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
|
141
116
|
magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
|
142
117
|
magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
|
143
118
|
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
144
|
-
magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
145
|
-
magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
|
146
|
-
magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
|
147
|
-
magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
|
148
119
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
149
|
-
magic_pdf/pre_proc/
|
150
|
-
magic_pdf/pre_proc/
|
151
|
-
magic_pdf/pre_proc/
|
152
|
-
magic_pdf/pre_proc/
|
153
|
-
magic_pdf/pre_proc/
|
154
|
-
magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
|
155
|
-
magic_pdf/pre_proc/detect_footnote.py,sha256=UxFuTCRwXdAv3wKCgRQJJVt12hM9O9oPTwzPAChQXoM,8309
|
156
|
-
magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1itbY7g,2848
|
157
|
-
magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
|
158
|
-
magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
|
159
|
-
magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
|
160
|
-
magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
|
161
|
-
magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
|
162
|
-
magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
|
163
|
-
magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
|
164
|
-
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
|
165
|
-
magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
|
166
|
-
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
|
167
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
|
168
|
-
magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
|
169
|
-
magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
+
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
|
+
magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
|
122
|
+
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=xQ1L6pwQjN4xBSKEXslheip1aMFaiB0grqlX3BF-kh0,9282
|
123
|
+
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
|
170
125
|
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
|
171
|
-
magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
|
172
|
-
magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
|
173
|
-
magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
|
174
|
-
magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
|
175
|
-
magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
|
176
|
-
magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
|
177
126
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
178
127
|
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
179
128
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
180
129
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
181
130
|
magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
|
182
131
|
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
183
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=
|
132
|
+
magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
|
184
133
|
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
185
134
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
186
135
|
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
@@ -190,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
|
|
190
139
|
magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
|
191
140
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
192
141
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
193
|
-
magic_pdf-0.10.
|
194
|
-
magic_pdf-0.10.
|
195
|
-
magic_pdf-0.10.
|
196
|
-
magic_pdf-0.10.
|
197
|
-
magic_pdf-0.10.
|
198
|
-
magic_pdf-0.10.
|
142
|
+
magic_pdf-0.10.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
143
|
+
magic_pdf-0.10.2.dist-info/METADATA,sha256=5pYglDeTXZaIsMRAHSfNl57Yq3gPXdcexNxt1zdvmu4,37030
|
144
|
+
magic_pdf-0.10.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
145
|
+
magic_pdf-0.10.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
146
|
+
magic_pdf-0.10.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
147
|
+
magic_pdf-0.10.2.dist-info/RECORD,,
|
magic_pdf/dict2md/mkcontent.py
DELETED
@@ -1,438 +0,0 @@
|
|
1
|
-
import math
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
from magic_pdf.config.ocr_content_type import ContentType
|
6
|
-
from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
|
7
|
-
find_top_nearest_text_bbox)
|
8
|
-
from magic_pdf.libs.commons import join_path
|
9
|
-
|
10
|
-
TYPE_INLINE_EQUATION = ContentType.InlineEquation
|
11
|
-
TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
|
12
|
-
UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
|
13
|
-
|
14
|
-
|
15
|
-
@DeprecationWarning
|
16
|
-
def mk_nlp_markdown_1(para_dict: dict):
|
17
|
-
"""对排序后的bboxes拼接内容."""
|
18
|
-
content_lst = []
|
19
|
-
for _, page_info in para_dict.items():
|
20
|
-
para_blocks = page_info.get('para_blocks')
|
21
|
-
if not para_blocks:
|
22
|
-
continue
|
23
|
-
|
24
|
-
for block in para_blocks:
|
25
|
-
item = block['paras']
|
26
|
-
for _, p in item.items():
|
27
|
-
para_text = p['para_text']
|
28
|
-
is_title = p['is_para_title']
|
29
|
-
title_level = p['para_title_level']
|
30
|
-
md_title_prefix = '#' * title_level
|
31
|
-
if is_title:
|
32
|
-
content_lst.append(f'{md_title_prefix} {para_text}')
|
33
|
-
else:
|
34
|
-
content_lst.append(para_text)
|
35
|
-
|
36
|
-
content_text = '\n\n'.join(content_lst)
|
37
|
-
|
38
|
-
return content_text
|
39
|
-
|
40
|
-
|
41
|
-
# 找到目标字符串在段落中的索引
|
42
|
-
def __find_index(paragraph, target):
|
43
|
-
index = paragraph.find(target)
|
44
|
-
if index != -1:
|
45
|
-
return index
|
46
|
-
else:
|
47
|
-
return None
|
48
|
-
|
49
|
-
|
50
|
-
def __insert_string(paragraph, target, position):
|
51
|
-
new_paragraph = paragraph[:position] + target + paragraph[position:]
|
52
|
-
return new_paragraph
|
53
|
-
|
54
|
-
|
55
|
-
def __insert_after(content, image_content, target):
|
56
|
-
"""在content中找到target,将image_content插入到target后面."""
|
57
|
-
index = content.find(target)
|
58
|
-
if index != -1:
|
59
|
-
content = (
|
60
|
-
content[: index + len(target)]
|
61
|
-
+ '\n\n'
|
62
|
-
+ image_content
|
63
|
-
+ '\n\n'
|
64
|
-
+ content[index + len(target) :]
|
65
|
-
)
|
66
|
-
else:
|
67
|
-
logger.error(
|
68
|
-
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
|
69
|
-
)
|
70
|
-
return content
|
71
|
-
|
72
|
-
|
73
|
-
def __insert_before(content, image_content, target):
|
74
|
-
"""在content中找到target,将image_content插入到target前面."""
|
75
|
-
index = content.find(target)
|
76
|
-
if index != -1:
|
77
|
-
content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
|
78
|
-
else:
|
79
|
-
logger.error(
|
80
|
-
f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
|
81
|
-
)
|
82
|
-
return content
|
83
|
-
|
84
|
-
|
85
|
-
@DeprecationWarning
|
86
|
-
def mk_mm_markdown_1(para_dict: dict):
|
87
|
-
"""拼装多模态markdown."""
|
88
|
-
content_lst = []
|
89
|
-
for _, page_info in para_dict.items():
|
90
|
-
page_lst = [] # 一个page内的段落列表
|
91
|
-
para_blocks = page_info.get('para_blocks')
|
92
|
-
pymu_raw_blocks = page_info.get('preproc_blocks')
|
93
|
-
|
94
|
-
all_page_images = []
|
95
|
-
all_page_images.extend(page_info.get('images', []))
|
96
|
-
all_page_images.extend(page_info.get('image_backup', []))
|
97
|
-
all_page_images.extend(page_info.get('tables', []))
|
98
|
-
all_page_images.extend(page_info.get('table_backup', []))
|
99
|
-
|
100
|
-
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
|
101
|
-
for img in all_page_images:
|
102
|
-
page_lst.append(f"") # TODO 图片顺序
|
103
|
-
page_md = '\n\n'.join(page_lst)
|
104
|
-
|
105
|
-
else:
|
106
|
-
for block in para_blocks:
|
107
|
-
item = block['paras']
|
108
|
-
for _, p in item.items():
|
109
|
-
para_text = p['para_text']
|
110
|
-
is_title = p['is_para_title']
|
111
|
-
title_level = p['para_title_level']
|
112
|
-
md_title_prefix = '#' * title_level
|
113
|
-
if is_title:
|
114
|
-
page_lst.append(f'{md_title_prefix} {para_text}')
|
115
|
-
else:
|
116
|
-
page_lst.append(para_text)
|
117
|
-
|
118
|
-
"""拼装成一个页面的文本"""
|
119
|
-
page_md = '\n\n'.join(page_lst)
|
120
|
-
"""插入图片"""
|
121
|
-
for img in all_page_images:
|
122
|
-
imgbox = img['bbox']
|
123
|
-
img_content = f""
|
124
|
-
# 先看在哪个block内
|
125
|
-
for block in pymu_raw_blocks:
|
126
|
-
bbox = block['bbox']
|
127
|
-
if (
|
128
|
-
bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
|
129
|
-
and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
|
130
|
-
): # 确定在block内
|
131
|
-
for l in block['lines']: # noqa: E741
|
132
|
-
line_box = l['bbox']
|
133
|
-
if (
|
134
|
-
line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
|
135
|
-
and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
|
136
|
-
): # 在line内的,插入line前面
|
137
|
-
line_txt = ''.join([s['text'] for s in l['spans']])
|
138
|
-
page_md = __insert_before(
|
139
|
-
page_md, img_content, line_txt
|
140
|
-
)
|
141
|
-
break
|
142
|
-
break
|
143
|
-
else: # 在行与行之间
|
144
|
-
# 找到图片x0,y0与line的x0,y0最近的line
|
145
|
-
min_distance = 100000
|
146
|
-
min_line = None
|
147
|
-
for l in block['lines']: # noqa: E741
|
148
|
-
line_box = l['bbox']
|
149
|
-
distance = math.sqrt(
|
150
|
-
(line_box[0] - imgbox[0]) ** 2
|
151
|
-
+ (line_box[1] - imgbox[1]) ** 2
|
152
|
-
)
|
153
|
-
if distance < min_distance:
|
154
|
-
min_distance = distance
|
155
|
-
min_line = l
|
156
|
-
if min_line:
|
157
|
-
line_txt = ''.join(
|
158
|
-
[s['text'] for s in min_line['spans']]
|
159
|
-
)
|
160
|
-
img_h = imgbox[3] - imgbox[1]
|
161
|
-
if min_distance < img_h: # 文字在图片前面
|
162
|
-
page_md = __insert_after(
|
163
|
-
page_md, img_content, line_txt
|
164
|
-
)
|
165
|
-
else:
|
166
|
-
page_md = __insert_before(
|
167
|
-
page_md, img_content, line_txt
|
168
|
-
)
|
169
|
-
else:
|
170
|
-
logger.error(
|
171
|
-
f"Can't find the location of image {img['image_path']} in the markdown file #1"
|
172
|
-
)
|
173
|
-
else: # 应当在两个block之间
|
174
|
-
# 找到上方最近的block,如果上方没有就找大下方最近的block
|
175
|
-
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
|
176
|
-
if top_txt_block:
|
177
|
-
line_txt = ''.join(
|
178
|
-
[s['text'] for s in top_txt_block['lines'][-1]['spans']]
|
179
|
-
)
|
180
|
-
page_md = __insert_after(page_md, img_content, line_txt)
|
181
|
-
else:
|
182
|
-
bottom_txt_block = find_bottom_nearest_text_bbox(
|
183
|
-
pymu_raw_blocks, imgbox
|
184
|
-
)
|
185
|
-
if bottom_txt_block:
|
186
|
-
line_txt = ''.join(
|
187
|
-
[
|
188
|
-
s['text']
|
189
|
-
for s in bottom_txt_block['lines'][0]['spans']
|
190
|
-
]
|
191
|
-
)
|
192
|
-
page_md = __insert_before(page_md, img_content, line_txt)
|
193
|
-
else:
|
194
|
-
logger.error(
|
195
|
-
f"Can't find the location of image {img['image_path']} in the markdown file #2"
|
196
|
-
)
|
197
|
-
|
198
|
-
content_lst.append(page_md)
|
199
|
-
|
200
|
-
"""拼装成全部页面的文本"""
|
201
|
-
content_text = '\n\n'.join(content_lst)
|
202
|
-
|
203
|
-
return content_text
|
204
|
-
|
205
|
-
|
206
|
-
def __insert_after_para(text, type, element, content_list):
|
207
|
-
"""在content_list中找到text,将image_path作为一个新的node插入到text后面."""
|
208
|
-
for i, c in enumerate(content_list):
|
209
|
-
content_type = c.get('type')
|
210
|
-
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
|
211
|
-
if type == 'image':
|
212
|
-
content_node = {
|
213
|
-
'type': 'image',
|
214
|
-
'img_path': element.get('image_path'),
|
215
|
-
'img_alt': '',
|
216
|
-
'img_title': '',
|
217
|
-
'img_caption': '',
|
218
|
-
}
|
219
|
-
elif type == 'table':
|
220
|
-
content_node = {
|
221
|
-
'type': 'table',
|
222
|
-
'img_path': element.get('image_path'),
|
223
|
-
'table_latex': element.get('text'),
|
224
|
-
'table_title': '',
|
225
|
-
'table_caption': '',
|
226
|
-
'table_quality': element.get('quality'),
|
227
|
-
}
|
228
|
-
content_list.insert(i + 1, content_node)
|
229
|
-
break
|
230
|
-
else:
|
231
|
-
logger.error(
|
232
|
-
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
|
233
|
-
)
|
234
|
-
|
235
|
-
|
236
|
-
def __insert_before_para(text, type, element, content_list):
|
237
|
-
"""在content_list中找到text,将image_path作为一个新的node插入到text前面."""
|
238
|
-
for i, c in enumerate(content_list):
|
239
|
-
content_type = c.get('type')
|
240
|
-
if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
|
241
|
-
if type == 'image':
|
242
|
-
content_node = {
|
243
|
-
'type': 'image',
|
244
|
-
'img_path': element.get('image_path'),
|
245
|
-
'img_alt': '',
|
246
|
-
'img_title': '',
|
247
|
-
'img_caption': '',
|
248
|
-
}
|
249
|
-
elif type == 'table':
|
250
|
-
content_node = {
|
251
|
-
'type': 'table',
|
252
|
-
'img_path': element.get('image_path'),
|
253
|
-
'table_latex': element.get('text'),
|
254
|
-
'table_title': '',
|
255
|
-
'table_caption': '',
|
256
|
-
'table_quality': element.get('quality'),
|
257
|
-
}
|
258
|
-
content_list.insert(i, content_node)
|
259
|
-
break
|
260
|
-
else:
|
261
|
-
logger.error(
|
262
|
-
f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
|
263
|
-
)
|
264
|
-
|
265
|
-
|
266
|
-
def mk_universal_format(pdf_info_list: list, img_buket_path):
|
267
|
-
"""构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
|
268
|
-
content_lst = []
|
269
|
-
for page_info in pdf_info_list:
|
270
|
-
page_lst = [] # 一个page内的段落列表
|
271
|
-
para_blocks = page_info.get('para_blocks')
|
272
|
-
pymu_raw_blocks = page_info.get('preproc_blocks')
|
273
|
-
|
274
|
-
all_page_images = []
|
275
|
-
all_page_images.extend(page_info.get('images', []))
|
276
|
-
all_page_images.extend(page_info.get('image_backup', []))
|
277
|
-
# all_page_images.extend(page_info.get("tables",[]))
|
278
|
-
# all_page_images.extend(page_info.get("table_backup",[]) )
|
279
|
-
all_page_tables = []
|
280
|
-
all_page_tables.extend(page_info.get('tables', []))
|
281
|
-
|
282
|
-
if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
|
283
|
-
for img in all_page_images:
|
284
|
-
content_node = {
|
285
|
-
'type': 'image',
|
286
|
-
'img_path': join_path(img_buket_path, img['image_path']),
|
287
|
-
'img_alt': '',
|
288
|
-
'img_title': '',
|
289
|
-
'img_caption': '',
|
290
|
-
}
|
291
|
-
page_lst.append(content_node) # TODO 图片顺序
|
292
|
-
for table in all_page_tables:
|
293
|
-
content_node = {
|
294
|
-
'type': 'table',
|
295
|
-
'img_path': join_path(img_buket_path, table['image_path']),
|
296
|
-
'table_latex': table.get('text'),
|
297
|
-
'table_title': '',
|
298
|
-
'table_caption': '',
|
299
|
-
'table_quality': table.get('quality'),
|
300
|
-
}
|
301
|
-
page_lst.append(content_node) # TODO 图片顺序
|
302
|
-
else:
|
303
|
-
for block in para_blocks:
|
304
|
-
item = block['paras']
|
305
|
-
for _, p in item.items():
|
306
|
-
font_type = p[
|
307
|
-
'para_font_type'
|
308
|
-
] # 对于文本来说,要么是普通文本,要么是个行间公式
|
309
|
-
if font_type == TYPE_INTERLINE_EQUATION:
|
310
|
-
content_node = {'type': 'equation', 'latex': p['para_text']}
|
311
|
-
page_lst.append(content_node)
|
312
|
-
else:
|
313
|
-
para_text = p['para_text']
|
314
|
-
is_title = p['is_para_title']
|
315
|
-
title_level = p['para_title_level']
|
316
|
-
|
317
|
-
if is_title:
|
318
|
-
content_node = {
|
319
|
-
'type': f'h{title_level}',
|
320
|
-
'text': para_text,
|
321
|
-
}
|
322
|
-
page_lst.append(content_node)
|
323
|
-
else:
|
324
|
-
content_node = {'type': 'text', 'text': para_text}
|
325
|
-
page_lst.append(content_node)
|
326
|
-
|
327
|
-
content_lst.extend(page_lst)
|
328
|
-
|
329
|
-
"""插入图片"""
|
330
|
-
for img in all_page_images:
|
331
|
-
insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
|
332
|
-
|
333
|
-
"""插入表格"""
|
334
|
-
for table in all_page_tables:
|
335
|
-
insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
|
336
|
-
# end for
|
337
|
-
return content_lst
|
338
|
-
|
339
|
-
|
340
|
-
def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
|
341
|
-
element_bbox = element['bbox']
|
342
|
-
# 先看在哪个block内
|
343
|
-
for block in pymu_raw_blocks:
|
344
|
-
bbox = block['bbox']
|
345
|
-
if (
|
346
|
-
bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
|
347
|
-
and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
|
348
|
-
): # 确定在这个大的block内,然后进入逐行比较距离
|
349
|
-
for l in block['lines']: # noqa: E741
|
350
|
-
line_box = l['bbox']
|
351
|
-
if (
|
352
|
-
line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
|
353
|
-
and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
|
354
|
-
): # 在line内的,插入line前面
|
355
|
-
line_txt = ''.join([s['text'] for s in l['spans']])
|
356
|
-
__insert_before_para(line_txt, type, element, content_lst)
|
357
|
-
break
|
358
|
-
break
|
359
|
-
else: # 在行与行之间
|
360
|
-
# 找到图片x0,y0与line的x0,y0最近的line
|
361
|
-
min_distance = 100000
|
362
|
-
min_line = None
|
363
|
-
for l in block['lines']: # noqa: E741
|
364
|
-
line_box = l['bbox']
|
365
|
-
distance = math.sqrt(
|
366
|
-
(line_box[0] - element_bbox[0]) ** 2
|
367
|
-
+ (line_box[1] - element_bbox[1]) ** 2
|
368
|
-
)
|
369
|
-
if distance < min_distance:
|
370
|
-
min_distance = distance
|
371
|
-
min_line = l
|
372
|
-
if min_line:
|
373
|
-
line_txt = ''.join([s['text'] for s in min_line['spans']])
|
374
|
-
img_h = element_bbox[3] - element_bbox[1]
|
375
|
-
if min_distance < img_h: # 文字在图片前面
|
376
|
-
__insert_after_para(line_txt, type, element, content_lst)
|
377
|
-
else:
|
378
|
-
__insert_before_para(line_txt, type, element, content_lst)
|
379
|
-
break
|
380
|
-
else:
|
381
|
-
logger.error(
|
382
|
-
f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
|
383
|
-
)
|
384
|
-
else: # 应当在两个block之间
|
385
|
-
# 找到上方最近的block,如果上方没有就找大下方最近的block
|
386
|
-
top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
|
387
|
-
if top_txt_block:
|
388
|
-
line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
|
389
|
-
__insert_after_para(line_txt, type, element, content_lst)
|
390
|
-
else:
|
391
|
-
bottom_txt_block = find_bottom_nearest_text_bbox(
|
392
|
-
pymu_raw_blocks, element_bbox
|
393
|
-
)
|
394
|
-
if bottom_txt_block:
|
395
|
-
line_txt = ''.join(
|
396
|
-
[s['text'] for s in bottom_txt_block['lines'][0]['spans']]
|
397
|
-
)
|
398
|
-
__insert_before_para(line_txt, type, element, content_lst)
|
399
|
-
else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
|
400
|
-
logger.error(
|
401
|
-
f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
|
402
|
-
)
|
403
|
-
|
404
|
-
|
405
|
-
def mk_mm_markdown(content_list):
|
406
|
-
"""基于同一格式的内容列表,构造markdown,含图片."""
|
407
|
-
content_md = []
|
408
|
-
for c in content_list:
|
409
|
-
content_type = c.get('type')
|
410
|
-
if content_type == 'text':
|
411
|
-
content_md.append(c.get('text'))
|
412
|
-
elif content_type == 'equation':
|
413
|
-
content = c.get('latex')
|
414
|
-
if content.startswith('$$') and content.endswith('$$'):
|
415
|
-
content_md.append(content)
|
416
|
-
else:
|
417
|
-
content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
|
418
|
-
elif content_type in UNI_FORMAT_TEXT_TYPE:
|
419
|
-
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
|
420
|
-
elif content_type == 'image':
|
421
|
-
content_md.append(f"})")
|
422
|
-
return '\n\n'.join(content_md)
|
423
|
-
|
424
|
-
|
425
|
-
def mk_nlp_markdown(content_list):
|
426
|
-
"""基于同一格式的内容列表,构造markdown,不含图片."""
|
427
|
-
content_md = []
|
428
|
-
for c in content_list:
|
429
|
-
content_type = c.get('type')
|
430
|
-
if content_type == 'text':
|
431
|
-
content_md.append(c.get('text'))
|
432
|
-
elif content_type == 'equation':
|
433
|
-
content_md.append(f"$$\n{c.get('latex')}\n$$")
|
434
|
-
elif content_type == 'table':
|
435
|
-
content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
|
436
|
-
elif content_type in UNI_FORMAT_TEXT_TYPE:
|
437
|
-
content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
|
438
|
-
return '\n\n'.join(content_md)
|
magic_pdf/layout/__init__.py
DELETED
File without changes
|