magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  2. magic_pdf/filter/pdf_meta_scan.py +3 -17
  3. magic_pdf/libs/commons.py +0 -161
  4. magic_pdf/libs/draw_bbox.py +2 -3
  5. magic_pdf/libs/markdown_utils.py +0 -21
  6. magic_pdf/libs/pdf_image_tools.py +2 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
  9. magic_pdf/model/magic_model.py +0 -30
  10. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
  11. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
  12. magic_pdf/para/para_split_v3.py +7 -2
  13. magic_pdf/pdf_parse_union_core_v2.py +97 -124
  14. magic_pdf/pre_proc/construct_page_dict.py +0 -55
  15. magic_pdf/pre_proc/cut_image.py +0 -37
  16. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
  17. magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
  18. magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
  19. magic_pdf/rw/S3ReaderWriter.py +1 -1
  20. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
  21. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
  22. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
  23. magic_pdf/dict2md/mkcontent.py +0 -438
  24. magic_pdf/layout/__init__.py +0 -0
  25. magic_pdf/layout/bbox_sort.py +0 -681
  26. magic_pdf/layout/layout_det_utils.py +0 -182
  27. magic_pdf/layout/layout_sort.py +0 -921
  28. magic_pdf/layout/layout_spiler_recog.py +0 -101
  29. magic_pdf/layout/mcol_sort.py +0 -336
  30. magic_pdf/libs/calc_span_stats.py +0 -239
  31. magic_pdf/libs/detect_language_from_model.py +0 -21
  32. magic_pdf/libs/nlp_utils.py +0 -203
  33. magic_pdf/libs/textbase.py +0 -33
  34. magic_pdf/libs/vis_utils.py +0 -308
  35. magic_pdf/para/block_continuation_processor.py +0 -562
  36. magic_pdf/para/block_termination_processor.py +0 -480
  37. magic_pdf/para/commons.py +0 -222
  38. magic_pdf/para/denoise.py +0 -246
  39. magic_pdf/para/draw.py +0 -121
  40. magic_pdf/para/exceptions.py +0 -198
  41. magic_pdf/para/layout_match_processor.py +0 -40
  42. magic_pdf/para/para_split.py +0 -807
  43. magic_pdf/para/para_split_v2.py +0 -959
  44. magic_pdf/para/raw_processor.py +0 -207
  45. magic_pdf/para/stats.py +0 -268
  46. magic_pdf/para/title_processor.py +0 -1014
  47. magic_pdf/pdf_parse_union_core.py +0 -345
  48. magic_pdf/post_proc/__init__.py +0 -0
  49. magic_pdf/post_proc/detect_para.py +0 -3472
  50. magic_pdf/post_proc/pdf_post_filter.py +0 -60
  51. magic_pdf/post_proc/remove_footnote.py +0 -153
  52. magic_pdf/pre_proc/citationmarker_remove.py +0 -161
  53. magic_pdf/pre_proc/detect_equation.py +0 -134
  54. magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
  55. magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
  56. magic_pdf/pre_proc/detect_footnote.py +0 -170
  57. magic_pdf/pre_proc/detect_header.py +0 -64
  58. magic_pdf/pre_proc/detect_images.py +0 -647
  59. magic_pdf/pre_proc/detect_page_number.py +0 -64
  60. magic_pdf/pre_proc/detect_tables.py +0 -62
  61. magic_pdf/pre_proc/equations_replace.py +0 -550
  62. magic_pdf/pre_proc/fix_image.py +0 -244
  63. magic_pdf/pre_proc/fix_table.py +0 -270
  64. magic_pdf/pre_proc/main_text_font.py +0 -23
  65. magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
  66. magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
  67. magic_pdf/pre_proc/post_layout_split.py +0 -0
  68. magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
  69. magic_pdf/pre_proc/remove_footer_header.py +0 -114
  70. magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
  71. magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
  72. magic_pdf/pre_proc/solve_line_alien.py +0 -29
  73. magic_pdf/pre_proc/statistics.py +0 -12
  74. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
  75. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
  76. {magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,7 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
3
  magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
- magic_pdf/pdf_parse_union_core.py,sha256=w90lFIMOYUMAq4iv8bpsbBtLXFphPV4HyYeqbOTYQUI,12420
5
- magic_pdf/pdf_parse_union_core_v2.py,sha256=EqEi9AahBBh2JbXoY8uOCmClvi9W_H_26U4jK8RwPwU,31308
4
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=-4yJwcSMcGwQKJhmK_MbBMa-fexzkqeD1CQHWpzGC3I,29920
6
5
  magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
7
6
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
7
  magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
@@ -20,7 +19,7 @@ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
20
19
  magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
21
20
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
22
21
  magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
23
- magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
22
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
24
23
  magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
25
24
  magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
26
25
  magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
@@ -28,48 +27,36 @@ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,111
28
27
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
29
28
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
30
29
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
- magic_pdf/dict2md/mkcontent.py,sha256=bMQK7uiay76YaWA92VIK57YajINV20SnOs65wOEXyKE,18667
32
30
  magic_pdf/dict2md/ocr_mkcontent.py,sha256=ohjhEFS9YFrzTCC9c9yrvi4QuZe9iZm1qlkQWB6xxIw,13038
33
31
  magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
32
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
35
- magic_pdf/filter/pdf_meta_scan.py,sha256=h4D4O0OeAlEy2A8mJ6E0aQ8wIizIfsIxEagbjaomnAo,17823
33
+ magic_pdf/filter/pdf_meta_scan.py,sha256=3I-t3PSrQUZ3PZAPl_NGoEhxLmIUE9Fpc0jueEXP7Xw,17381
36
34
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
37
35
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
36
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
39
37
  magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
40
38
  magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
41
- magic_pdf/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- magic_pdf/layout/bbox_sort.py,sha256=PzzaBf6MC_AZ-ZWGU0Kg-KIsw874l_gML73mM3hE4Ps,30807
43
- magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl6ku2cls,9137
44
- magic_pdf/layout/layout_sort.py,sha256=jtacQVcxnuYAksvEqtS0DH-v6U8qyjX-jmyZgDJ-egA,37005
45
- magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
46
- magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
47
39
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
40
  magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
49
- magic_pdf/libs/calc_span_stats.py,sha256=5vnU27DcbkFDRSAoLqAmX0KQ3I9ehWkEgh_t9hxg_zI,10147
50
41
  magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
51
- magic_pdf/libs/commons.py,sha256=6Zu9-OyamyCNDY7qj0SxR-rux-ggj9im3CVPtC4ubB8,7108
42
+ magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
52
43
  magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
53
44
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
54
45
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
55
- magic_pdf/libs/detect_language_from_model.py,sha256=Uln8F9qs8EJOw4EgI7KRlaU3lD_mK8KMTlADLFtz8fk,816
56
- magic_pdf/libs/draw_bbox.py,sha256=NhAfqib5HYuGjjrAG_SvJR-yOHZTy6tzDxLXdxKlULQ,17676
46
+ magic_pdf/libs/draw_bbox.py,sha256=2IXr4TUxm0-pXYIPkNaELWo9pOysZC6etpqzTE5eg-w,17588
57
47
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
58
48
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
59
49
  magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
60
50
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
61
- magic_pdf/libs/markdown_utils.py,sha256=cLxLXjRhrNp_wCHvtglrGA_FVdrvfd1KULeTtj1p18w,944
62
- magic_pdf/libs/nlp_utils.py,sha256=-X9W3-Ns5ZdDYFvyyEq6i6P2b5hCATaFEZeOjwNOH9M,6901
51
+ magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
63
52
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
64
53
  magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2145
65
- magic_pdf/libs/pdf_image_tools.py,sha256=sh8hgBQu_83R71qBLodOFdByBUuQujsOMfgpSD9mrhE,1981
54
+ magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
66
55
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
67
- magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
68
- magic_pdf/libs/version.py,sha256=v4zmKjsKOPZbp6BrWoz7iK4ST0sdZdUh9bQSJmluZ5o,23
69
- magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
56
+ magic_pdf/libs/version.py,sha256=A_AARqtxTOj_AQTpjpgOxNx-UOBio5wYFfZ2mrdMKfs,23
70
57
  magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
71
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=HOT6chGx2VPyH6O9WB0c6xGPeDs9m_6oZn3iOa745yw,7125
72
- magic_pdf/model/magic_model.py,sha256=8nJLzNCa0Ag4JhMAQbjj5qrkj617qKPCXVJAiT9DnaA,43472
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
+ magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
73
60
  magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
74
61
  magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
75
62
  magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
@@ -107,8 +94,8 @@ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1
107
94
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
95
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
96
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=UP7fADPGoxAMj2SUKmeW-fe_AcAQxlT9Mfy4WF6vHmU,9796
111
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=a6xkQHqLMUL4NCaORp8oo4Tfa8GB8PN9MVvG7Pj6jIE,7316
97
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=bya-KGr5OPCmE8KC8K5Pp6OlGigCmUmB9xpm59nExaM,9056
98
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=Deoth86bltlLz1Y-1jpyhLCwCaRfq-KKI0tiFyKKqA8,7268
112
99
  magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
113
100
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
101
  magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -123,64 +110,26 @@ magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-u
123
110
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
124
111
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=AdH3UGu4BEoII0uFjPKUf61W7HmG4fDlWgR1xxMeFlE,2775
125
112
  magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
126
- magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
127
- magic_pdf/para/block_termination_processor.py,sha256=YU3ZYqJy9e3OQmOuQYZrR6AUpmAlQ0mhj0PgZZPZ_fM,17957
128
- magic_pdf/para/commons.py,sha256=VdJ8SY9qJTtcRyx8HH-PFeZSJwL4Tsf50197RD_-dwc,5414
129
- magic_pdf/para/denoise.py,sha256=J7dM2KNnbdzAd2A3agB04U6L1GL9RrhAs-MLrq-_Ftg,10443
130
- magic_pdf/para/draw.py,sha256=KyWc03do_WuBKQ028HYzepYwbIkel9ID0uqRhuPVOHc,5643
131
- magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,4978
132
- magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
133
- magic_pdf/para/para_split.py,sha256=z7nYeg86BjZOAdJNMwYKSu51W9evurtl3cy1ZUcQLlw,33222
134
- magic_pdf/para/para_split_v2.py,sha256=vJJqqMMKbv8D702nODThL-5hjkgZ7Vl2BTmEIdwmmDw,39051
135
- magic_pdf/para/para_split_v3.py,sha256=atfELVRx-90paAS3nZptgP0qG8UpTTaj3LG_2x3NAlQ,15977
136
- magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
137
- magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
138
- magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
113
+ magic_pdf/para/para_split_v3.py,sha256=x6nfjyt38W-wdxXjo6Chd18eiqLzmhbTNyGHhBQcEHs,16459
139
114
  magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
140
115
  magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
141
116
  magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
142
117
  magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
143
118
  magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
144
- magic_pdf/post_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
145
- magic_pdf/post_proc/detect_para.py,sha256=5LX86ueHQGOV9CNimAxqZH4R3KTi78leum1de_Na0pw,126181
146
- magic_pdf/post_proc/pdf_post_filter.py,sha256=3EJDovQPckPKJaBY1wvAty-LGKyRG63WICY_bA_Kfbs,2501
147
- magic_pdf/post_proc/remove_footnote.py,sha256=701P7xRu6gzLaEHfb2xkYpLZI4CwK2FAo7Ggho4bOTI,7596
148
119
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
- magic_pdf/pre_proc/citationmarker_remove.py,sha256=IitOERaK9fGaktsYMyiaaL_71uMIrlG5ZdmpZaR6dsA,6640
150
- magic_pdf/pre_proc/construct_page_dict.py,sha256=lp3zBmInlWYYIcGC1-NSqT9s44AjDvlnWxDPeZoBVSY,3043
151
- magic_pdf/pre_proc/cut_image.py,sha256=TghshkDTgdUbyLSbKZoFI9-n-xaFub02IYPyu0IAnRY,2761
152
- magic_pdf/pre_proc/detect_equation.py,sha256=9omDHKTI8QO9Qd46eVFHWhZeMmTNx7XDuWRgjXI-KFA,6627
153
- magic_pdf/pre_proc/detect_footer_by_model.py,sha256=_EghAM_zWBcqVY8XBkbSoprKqKUa0mlN1U8YNWxNNLI,2848
154
- magic_pdf/pre_proc/detect_footer_header_by_statistics.py,sha256=924soXZ51QVpitPgVgnwbC7BqOZI30j5hGW5zP86y-w,11250
155
- magic_pdf/pre_proc/detect_footnote.py,sha256=UxFuTCRwXdAv3wKCgRQJJVt12hM9O9oPTwzPAChQXoM,8309
156
- magic_pdf/pre_proc/detect_header.py,sha256=KOmRehgKMuMqNa_2weXkdNSiRVWMFgLMQE4e1itbY7g,2848
157
- magic_pdf/pre_proc/detect_images.py,sha256=8DwGGTb5IjxqADZDTc_ngwJrTYXxK2qpRqI2FBoPr00,30432
158
- magic_pdf/pre_proc/detect_page_number.py,sha256=qvYrBbCtBbREvw-MySL_p7byCRvcm1fkLJ5ZB4TP8OM,2848
159
- magic_pdf/pre_proc/detect_tables.py,sha256=srJzgLVeVuOsqnESqfdJfVukTF84K8qmI5mgFX_BZGs,2800
160
- magic_pdf/pre_proc/equations_replace.py,sha256=7mexRPwD9C_UJ-SbvO_-XnpcnN7YtGUUznmPjHbjhnw,20526
161
- magic_pdf/pre_proc/fix_image.py,sha256=5MOfkXc8abfIp49g-68vll40wwTUZ5tcQ2gtsJuFmvs,11486
162
- magic_pdf/pre_proc/fix_table.py,sha256=20sqJe27fAXcL7_C0qQ9mpsggmH37WuX-wPYWyRgACA,13227
163
- magic_pdf/pre_proc/main_text_font.py,sha256=1gkjvPuBdKC4oVFkLvnRm2zghsLtVlfAEMKXouyVonM,1048
164
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=DMc2H2xGqVePBReZu5AQbPdvDw3sxOssmujCLlNW3Vs,14143
165
- magic_pdf/pre_proc/ocr_detect_layout.py,sha256=DW0_HXzmcbW22cXKIYFsyZNFh8mEjSHXIFVjXndJsvQ,5878
166
- magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Au8y1NBhbWpq_VuPLg3b9dAMUhyPS71xtTghtd21K5M,14273
167
- magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=9DxEyy1pH87g4T_JEgI3cTVCL2TVrEBl38wsmqhQM4k,12758
168
- magic_pdf/pre_proc/pdf_pre_filter.py,sha256=qvNlNyj4Mc3qa73mgfkp0PMR-ucABbx3mMcyVipaEpQ,2776
169
- magic_pdf/pre_proc/post_layout_split.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
+ magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
+ magic_pdf/pre_proc/cut_image.py,sha256=U-ttnl3lAhhmgtkR1GGyPAVm0i0-6VscXf3E2EDy3lE,1187
122
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=xQ1L6pwQjN4xBSKEXslheip1aMFaiB0grqlX3BF-kh0,9282
123
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
170
125
  magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
171
- magic_pdf/pre_proc/remove_colored_strip_bbox.py,sha256=WVKhgeWifRdO-u2ETYffkcMOFVYIbiaZu5pMr1RpEdA,4090
172
- magic_pdf/pre_proc/remove_footer_header.py,sha256=Igdr4jH7BUGuTcapWPiKEGKxhWH12c3VVmX5xwUVn7w,5680
173
- magic_pdf/pre_proc/remove_rotate_bbox.py,sha256=di7geS7AFhSaAvkWZHT6J3dlXEq8uu9Z4oBYtolQjl0,8803
174
- magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=ABl0vo8kkcCPSTI8dpXQTOH1b9R-lbzsJDDFONU6ELk,7313
175
- magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
176
- magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
177
126
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
178
127
  magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
179
128
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
180
129
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
181
130
  magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
182
131
  magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
183
- magic_pdf/rw/S3ReaderWriter.py,sha256=_DmL45Ubio-_VsKD84KrqOQ-VNDUTzcXSrXfNMb5vww,5310
132
+ magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
184
133
  magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
185
134
  magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
186
135
  magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
@@ -190,9 +139,9 @@ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,39
190
139
  magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
191
140
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
192
141
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
193
- magic_pdf-0.10.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
194
- magic_pdf-0.10.0.dist-info/METADATA,sha256=U_TtQjdODFjAADoZro_ipfGiasBCVq2_zZlF2DFyNpM,40300
195
- magic_pdf-0.10.0.dist-info/WHEEL,sha256=bFJAMchF8aTQGUgMZzHJyDDMPTO3ToJ7x23SLJa1SVo,92
196
- magic_pdf-0.10.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
197
- magic_pdf-0.10.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
198
- magic_pdf-0.10.0.dist-info/RECORD,,
142
+ magic_pdf-0.10.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
+ magic_pdf-0.10.2.dist-info/METADATA,sha256=5pYglDeTXZaIsMRAHSfNl57Yq3gPXdcexNxt1zdvmu4,37030
144
+ magic_pdf-0.10.2.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
+ magic_pdf-0.10.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
+ magic_pdf-0.10.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
+ magic_pdf-0.10.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.0)
2
+ Generator: bdist_wheel (0.45.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,438 +0,0 @@
1
- import math
2
-
3
- from loguru import logger
4
-
5
- from magic_pdf.config.ocr_content_type import ContentType
6
- from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
7
- find_top_nearest_text_bbox)
8
- from magic_pdf.libs.commons import join_path
9
-
10
- TYPE_INLINE_EQUATION = ContentType.InlineEquation
11
- TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
12
- UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
13
-
14
-
15
- @DeprecationWarning
16
- def mk_nlp_markdown_1(para_dict: dict):
17
- """对排序后的bboxes拼接内容."""
18
- content_lst = []
19
- for _, page_info in para_dict.items():
20
- para_blocks = page_info.get('para_blocks')
21
- if not para_blocks:
22
- continue
23
-
24
- for block in para_blocks:
25
- item = block['paras']
26
- for _, p in item.items():
27
- para_text = p['para_text']
28
- is_title = p['is_para_title']
29
- title_level = p['para_title_level']
30
- md_title_prefix = '#' * title_level
31
- if is_title:
32
- content_lst.append(f'{md_title_prefix} {para_text}')
33
- else:
34
- content_lst.append(para_text)
35
-
36
- content_text = '\n\n'.join(content_lst)
37
-
38
- return content_text
39
-
40
-
41
- # 找到目标字符串在段落中的索引
42
- def __find_index(paragraph, target):
43
- index = paragraph.find(target)
44
- if index != -1:
45
- return index
46
- else:
47
- return None
48
-
49
-
50
- def __insert_string(paragraph, target, position):
51
- new_paragraph = paragraph[:position] + target + paragraph[position:]
52
- return new_paragraph
53
-
54
-
55
- def __insert_after(content, image_content, target):
56
- """在content中找到target,将image_content插入到target后面."""
57
- index = content.find(target)
58
- if index != -1:
59
- content = (
60
- content[: index + len(target)]
61
- + '\n\n'
62
- + image_content
63
- + '\n\n'
64
- + content[index + len(target) :]
65
- )
66
- else:
67
- logger.error(
68
- f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
69
- )
70
- return content
71
-
72
-
73
- def __insert_before(content, image_content, target):
74
- """在content中找到target,将image_content插入到target前面."""
75
- index = content.find(target)
76
- if index != -1:
77
- content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
78
- else:
79
- logger.error(
80
- f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
81
- )
82
- return content
83
-
84
-
85
- @DeprecationWarning
86
- def mk_mm_markdown_1(para_dict: dict):
87
- """拼装多模态markdown."""
88
- content_lst = []
89
- for _, page_info in para_dict.items():
90
- page_lst = [] # 一个page内的段落列表
91
- para_blocks = page_info.get('para_blocks')
92
- pymu_raw_blocks = page_info.get('preproc_blocks')
93
-
94
- all_page_images = []
95
- all_page_images.extend(page_info.get('images', []))
96
- all_page_images.extend(page_info.get('image_backup', []))
97
- all_page_images.extend(page_info.get('tables', []))
98
- all_page_images.extend(page_info.get('table_backup', []))
99
-
100
- if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
101
- for img in all_page_images:
102
- page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
103
- page_md = '\n\n'.join(page_lst)
104
-
105
- else:
106
- for block in para_blocks:
107
- item = block['paras']
108
- for _, p in item.items():
109
- para_text = p['para_text']
110
- is_title = p['is_para_title']
111
- title_level = p['para_title_level']
112
- md_title_prefix = '#' * title_level
113
- if is_title:
114
- page_lst.append(f'{md_title_prefix} {para_text}')
115
- else:
116
- page_lst.append(para_text)
117
-
118
- """拼装成一个页面的文本"""
119
- page_md = '\n\n'.join(page_lst)
120
- """插入图片"""
121
- for img in all_page_images:
122
- imgbox = img['bbox']
123
- img_content = f"![]({img['image_path']})"
124
- # 先看在哪个block内
125
- for block in pymu_raw_blocks:
126
- bbox = block['bbox']
127
- if (
128
- bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
129
- and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
130
- ): # 确定在block内
131
- for l in block['lines']: # noqa: E741
132
- line_box = l['bbox']
133
- if (
134
- line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
135
- and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
136
- ): # 在line内的,插入line前面
137
- line_txt = ''.join([s['text'] for s in l['spans']])
138
- page_md = __insert_before(
139
- page_md, img_content, line_txt
140
- )
141
- break
142
- break
143
- else: # 在行与行之间
144
- # 找到图片x0,y0与line的x0,y0最近的line
145
- min_distance = 100000
146
- min_line = None
147
- for l in block['lines']: # noqa: E741
148
- line_box = l['bbox']
149
- distance = math.sqrt(
150
- (line_box[0] - imgbox[0]) ** 2
151
- + (line_box[1] - imgbox[1]) ** 2
152
- )
153
- if distance < min_distance:
154
- min_distance = distance
155
- min_line = l
156
- if min_line:
157
- line_txt = ''.join(
158
- [s['text'] for s in min_line['spans']]
159
- )
160
- img_h = imgbox[3] - imgbox[1]
161
- if min_distance < img_h: # 文字在图片前面
162
- page_md = __insert_after(
163
- page_md, img_content, line_txt
164
- )
165
- else:
166
- page_md = __insert_before(
167
- page_md, img_content, line_txt
168
- )
169
- else:
170
- logger.error(
171
- f"Can't find the location of image {img['image_path']} in the markdown file #1"
172
- )
173
- else: # 应当在两个block之间
174
- # 找到上方最近的block,如果上方没有就找大下方最近的block
175
- top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
176
- if top_txt_block:
177
- line_txt = ''.join(
178
- [s['text'] for s in top_txt_block['lines'][-1]['spans']]
179
- )
180
- page_md = __insert_after(page_md, img_content, line_txt)
181
- else:
182
- bottom_txt_block = find_bottom_nearest_text_bbox(
183
- pymu_raw_blocks, imgbox
184
- )
185
- if bottom_txt_block:
186
- line_txt = ''.join(
187
- [
188
- s['text']
189
- for s in bottom_txt_block['lines'][0]['spans']
190
- ]
191
- )
192
- page_md = __insert_before(page_md, img_content, line_txt)
193
- else:
194
- logger.error(
195
- f"Can't find the location of image {img['image_path']} in the markdown file #2"
196
- )
197
-
198
- content_lst.append(page_md)
199
-
200
- """拼装成全部页面的文本"""
201
- content_text = '\n\n'.join(content_lst)
202
-
203
- return content_text
204
-
205
-
206
- def __insert_after_para(text, type, element, content_list):
207
- """在content_list中找到text,将image_path作为一个新的node插入到text后面."""
208
- for i, c in enumerate(content_list):
209
- content_type = c.get('type')
210
- if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
211
- if type == 'image':
212
- content_node = {
213
- 'type': 'image',
214
- 'img_path': element.get('image_path'),
215
- 'img_alt': '',
216
- 'img_title': '',
217
- 'img_caption': '',
218
- }
219
- elif type == 'table':
220
- content_node = {
221
- 'type': 'table',
222
- 'img_path': element.get('image_path'),
223
- 'table_latex': element.get('text'),
224
- 'table_title': '',
225
- 'table_caption': '',
226
- 'table_quality': element.get('quality'),
227
- }
228
- content_list.insert(i + 1, content_node)
229
- break
230
- else:
231
- logger.error(
232
- f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
233
- )
234
-
235
-
236
- def __insert_before_para(text, type, element, content_list):
237
- """在content_list中找到text,将image_path作为一个新的node插入到text前面."""
238
- for i, c in enumerate(content_list):
239
- content_type = c.get('type')
240
- if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
241
- if type == 'image':
242
- content_node = {
243
- 'type': 'image',
244
- 'img_path': element.get('image_path'),
245
- 'img_alt': '',
246
- 'img_title': '',
247
- 'img_caption': '',
248
- }
249
- elif type == 'table':
250
- content_node = {
251
- 'type': 'table',
252
- 'img_path': element.get('image_path'),
253
- 'table_latex': element.get('text'),
254
- 'table_title': '',
255
- 'table_caption': '',
256
- 'table_quality': element.get('quality'),
257
- }
258
- content_list.insert(i, content_node)
259
- break
260
- else:
261
- logger.error(
262
- f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
263
- )
264
-
265
-
266
- def mk_universal_format(pdf_info_list: list, img_buket_path):
267
- """构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
268
- content_lst = []
269
- for page_info in pdf_info_list:
270
- page_lst = [] # 一个page内的段落列表
271
- para_blocks = page_info.get('para_blocks')
272
- pymu_raw_blocks = page_info.get('preproc_blocks')
273
-
274
- all_page_images = []
275
- all_page_images.extend(page_info.get('images', []))
276
- all_page_images.extend(page_info.get('image_backup', []))
277
- # all_page_images.extend(page_info.get("tables",[]))
278
- # all_page_images.extend(page_info.get("table_backup",[]) )
279
- all_page_tables = []
280
- all_page_tables.extend(page_info.get('tables', []))
281
-
282
- if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
283
- for img in all_page_images:
284
- content_node = {
285
- 'type': 'image',
286
- 'img_path': join_path(img_buket_path, img['image_path']),
287
- 'img_alt': '',
288
- 'img_title': '',
289
- 'img_caption': '',
290
- }
291
- page_lst.append(content_node) # TODO 图片顺序
292
- for table in all_page_tables:
293
- content_node = {
294
- 'type': 'table',
295
- 'img_path': join_path(img_buket_path, table['image_path']),
296
- 'table_latex': table.get('text'),
297
- 'table_title': '',
298
- 'table_caption': '',
299
- 'table_quality': table.get('quality'),
300
- }
301
- page_lst.append(content_node) # TODO 图片顺序
302
- else:
303
- for block in para_blocks:
304
- item = block['paras']
305
- for _, p in item.items():
306
- font_type = p[
307
- 'para_font_type'
308
- ] # 对于文本来说,要么是普通文本,要么是个行间公式
309
- if font_type == TYPE_INTERLINE_EQUATION:
310
- content_node = {'type': 'equation', 'latex': p['para_text']}
311
- page_lst.append(content_node)
312
- else:
313
- para_text = p['para_text']
314
- is_title = p['is_para_title']
315
- title_level = p['para_title_level']
316
-
317
- if is_title:
318
- content_node = {
319
- 'type': f'h{title_level}',
320
- 'text': para_text,
321
- }
322
- page_lst.append(content_node)
323
- else:
324
- content_node = {'type': 'text', 'text': para_text}
325
- page_lst.append(content_node)
326
-
327
- content_lst.extend(page_lst)
328
-
329
- """插入图片"""
330
- for img in all_page_images:
331
- insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
332
-
333
- """插入表格"""
334
- for table in all_page_tables:
335
- insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
336
- # end for
337
- return content_lst
338
-
339
-
340
- def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
341
- element_bbox = element['bbox']
342
- # 先看在哪个block内
343
- for block in pymu_raw_blocks:
344
- bbox = block['bbox']
345
- if (
346
- bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
347
- and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
348
- ): # 确定在这个大的block内,然后进入逐行比较距离
349
- for l in block['lines']: # noqa: E741
350
- line_box = l['bbox']
351
- if (
352
- line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
353
- and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
354
- ): # 在line内的,插入line前面
355
- line_txt = ''.join([s['text'] for s in l['spans']])
356
- __insert_before_para(line_txt, type, element, content_lst)
357
- break
358
- break
359
- else: # 在行与行之间
360
- # 找到图片x0,y0与line的x0,y0最近的line
361
- min_distance = 100000
362
- min_line = None
363
- for l in block['lines']: # noqa: E741
364
- line_box = l['bbox']
365
- distance = math.sqrt(
366
- (line_box[0] - element_bbox[0]) ** 2
367
- + (line_box[1] - element_bbox[1]) ** 2
368
- )
369
- if distance < min_distance:
370
- min_distance = distance
371
- min_line = l
372
- if min_line:
373
- line_txt = ''.join([s['text'] for s in min_line['spans']])
374
- img_h = element_bbox[3] - element_bbox[1]
375
- if min_distance < img_h: # 文字在图片前面
376
- __insert_after_para(line_txt, type, element, content_lst)
377
- else:
378
- __insert_before_para(line_txt, type, element, content_lst)
379
- break
380
- else:
381
- logger.error(
382
- f"Can't find the location of image {element.get('image_path')} in the markdown file #1"
383
- )
384
- else: # 应当在两个block之间
385
- # 找到上方最近的block,如果上方没有就找大下方最近的block
386
- top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
387
- if top_txt_block:
388
- line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
389
- __insert_after_para(line_txt, type, element, content_lst)
390
- else:
391
- bottom_txt_block = find_bottom_nearest_text_bbox(
392
- pymu_raw_blocks, element_bbox
393
- )
394
- if bottom_txt_block:
395
- line_txt = ''.join(
396
- [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
397
- )
398
- __insert_before_para(line_txt, type, element, content_lst)
399
- else: # TODO ,图片可能独占一列,这种情况上下是没有图片的
400
- logger.error(
401
- f"Can't find the location of image {element.get('image_path')} in the markdown file #2"
402
- )
403
-
404
-
405
- def mk_mm_markdown(content_list):
406
- """基于同一格式的内容列表,构造markdown,含图片."""
407
- content_md = []
408
- for c in content_list:
409
- content_type = c.get('type')
410
- if content_type == 'text':
411
- content_md.append(c.get('text'))
412
- elif content_type == 'equation':
413
- content = c.get('latex')
414
- if content.startswith('$$') and content.endswith('$$'):
415
- content_md.append(content)
416
- else:
417
- content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
418
- elif content_type in UNI_FORMAT_TEXT_TYPE:
419
- content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
420
- elif content_type == 'image':
421
- content_md.append(f"![]({c.get('img_path')})")
422
- return '\n\n'.join(content_md)
423
-
424
-
425
- def mk_nlp_markdown(content_list):
426
- """基于同一格式的内容列表,构造markdown,不含图片."""
427
- content_md = []
428
- for c in content_list:
429
- content_type = c.get('type')
430
- if content_type == 'text':
431
- content_md.append(c.get('text'))
432
- elif content_type == 'equation':
433
- content_md.append(f"$$\n{c.get('latex')}\n$$")
434
- elif content_type == 'table':
435
- content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
436
- elif content_type in UNI_FORMAT_TEXT_TYPE:
437
- content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
438
- return '\n\n'.join(content_md)
File without changes