magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. magic_pdf/config/constants.py +7 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/base.py +13 -1
  4. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  5. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  6. magic_pdf/data/dataset.py +188 -5
  7. magic_pdf/data/read_api.py +59 -12
  8. magic_pdf/data/utils.py +35 -0
  9. magic_pdf/dict2md/ocr_mkcontent.py +16 -15
  10. magic_pdf/filter/__init__.py +32 -0
  11. magic_pdf/filter/pdf_meta_scan.py +3 -2
  12. magic_pdf/libs/clean_memory.py +11 -4
  13. magic_pdf/libs/config_reader.py +9 -0
  14. magic_pdf/libs/draw_bbox.py +19 -22
  15. magic_pdf/libs/language.py +3 -0
  16. magic_pdf/libs/pdf_check.py +30 -30
  17. magic_pdf/libs/version.py +1 -1
  18. magic_pdf/model/__init__.py +1 -1
  19. magic_pdf/model/batch_analyze.py +275 -0
  20. magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
  21. magic_pdf/model/magic_model.py +4 -435
  22. magic_pdf/model/model_list.py +1 -0
  23. magic_pdf/model/pdf_extract_kit.py +35 -5
  24. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  25. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  26. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  27. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  29. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  30. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  31. magic_pdf/model/sub_modules/model_init.py +43 -7
  32. magic_pdf/model/sub_modules/model_utils.py +17 -5
  33. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  34. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  35. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  36. magic_pdf/operators/__init__.py +94 -0
  37. magic_pdf/operators/models.py +154 -0
  38. magic_pdf/operators/pipes.py +191 -0
  39. magic_pdf/pdf_parse_union_core_v2.py +77 -27
  40. magic_pdf/post_proc/__init__.py +1 -0
  41. magic_pdf/post_proc/llm_aided.py +133 -0
  42. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  43. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  44. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  45. magic_pdf/tools/cli.py +36 -11
  46. magic_pdf/tools/common.py +120 -61
  47. magic_pdf/utils/office_to_pdf.py +29 -0
  48. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
  49. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
  50. magic_pdf/para/__init__.py +0 -0
  51. magic_pdf/pdf_parse_by_ocr.py +0 -23
  52. magic_pdf/pdf_parse_by_txt.py +0 -24
  53. magic_pdf/pipe/AbsPipe.py +0 -98
  54. magic_pdf/pipe/OCRPipe.py +0 -41
  55. magic_pdf/pipe/TXTPipe.py +0 -41
  56. magic_pdf/pipe/UNIPipe.py +0 -98
  57. magic_pdf/pipe/__init__.py +0 -0
  58. magic_pdf/rw/AbsReaderWriter.py +0 -17
  59. magic_pdf/rw/DiskReaderWriter.py +0 -74
  60. magic_pdf/rw/S3ReaderWriter.py +0 -142
  61. magic_pdf/rw/__init__.py +0 -0
  62. magic_pdf/user_api.py +0 -121
  63. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  64. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
  65. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
  66. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
  67. {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,36 +1,33 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=WTaLVSU2wRpgtldasnqbrw1B0OvVi8VvcB_t-dAIfmw,880
3
- magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
4
- magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
5
- magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
2
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=uZiTwyVT5iE4YRHzUfN5kifHtYuEuKIqQHnAZQofWuM,33292
6
3
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- magic_pdf/config/constants.py,sha256=gqhUEtso7rCop-k-VvEPAMW_6pA6Tv2Y9smrr_0Iajo,1173
4
+ magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
8
5
  magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
9
6
  magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
10
7
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
11
- magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
8
+ magic_pdf/config/exceptions.py,sha256=2tsJxYUebVeimyYBGQkc9Nd1kIakTmWmz3SDcfJWy54,784
12
9
  magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
13
10
  magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
14
11
  magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
15
12
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
17
- magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
13
+ magic_pdf/data/dataset.py,sha256=q7wfX99HTVjKCFVpf1mnYn55rK6oF5Dz8O9w4C9cYhw,11196
14
+ magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
18
15
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
19
- magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
16
+ magic_pdf/data/utils.py,sha256=aMeQB3soGUJyoI41hfgWeOZNzPj36SOrewUM7z51AOU,2305
20
17
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
21
- magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
22
- magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
23
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
18
+ magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
19
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=VbNAxLyo0Io0j7iprJERt_TqxzHAtA7cUyPIaJstToU,2146
20
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=4pEJ8PPd3nX7sccHobCs0mbDM8BiqDP_sAEz7CIvpNI,5938
24
21
  magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
25
22
  magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
26
23
  magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
27
24
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
25
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
26
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=pE-lEUsYAhZC3nSmbgYO42Kvk_bW8Ds-AL-QMPHFu8c,12941
31
- magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=RQ47F2CT0Zgmg1rZoqYj5IW5msqoCTEF6GEHi3mVd8U,12989
28
+ magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
32
29
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
- magic_pdf/filter/pdf_meta_scan.py,sha256=3ba7SxXu1z2r5N97Dxmp_L10Lo7llsrBlvtEAJeIJBQ,17403
30
+ magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
34
31
  magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
32
  magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
33
  magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
@@ -38,33 +35,38 @@ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCal
38
35
  magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
39
36
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
37
  magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
41
- magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
38
+ magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
42
39
  magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
43
- magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
40
+ magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
44
41
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
45
42
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
46
- magic_pdf/libs/draw_bbox.py,sha256=2IXr4TUxm0-pXYIPkNaELWo9pOysZC6etpqzTE5eg-w,17588
43
+ magic_pdf/libs/draw_bbox.py,sha256=RX_ELX6P8mF0sIBx_h2A3BzhevcSPIzbbrboZTBlBik,17653
47
44
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
48
45
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
49
- magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
46
+ magic_pdf/libs/language.py,sha256=w1GVYmlocF7DQjtZrspgH6WacoWazOQBzz-iQx0mSBk,1135
50
47
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
51
48
  magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
52
49
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
53
- magic_pdf/libs/pdf_check.py,sha256=wCVOcwEPeMRcHW5OGN-GSQnPT5qNXUYHWWowoUknxF4,3178
50
+ magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
54
51
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
52
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=c61d5YjslqtpItkzB2NGlURm177H2racruHXV9G6u6s,23
57
- magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=YZwlhIgidy1_MUyTM_MRSLfKR_rpi508Bra6Vpj8PJ4,7125
59
- magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
60
- magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
61
- magic_pdf/model/pdf_extract_kit.py,sha256=ceYWlSU1BhakfsHPVM9SrUx35EvCBa20uJmgDO5PAtE,10933
53
+ magic_pdf/libs/version.py,sha256=J-j-u0itpEFT6irdmWmixQqYMadNl1X91TxUmoiLHMI,22
54
+ magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
55
+ magic_pdf/model/batch_analyze.py,sha256=A49qD5zY9G8nl6wnpMLATqS4_xOOgRvjo1Eq6v_mcUE,11551
56
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=AZSzqzGz5utLwuysF5JY5k5pYcJGtcppNP2BbvExLnA,5989
57
+ magic_pdf/model/magic_model.py,sha256=Nt74oZGYUcbm4qdOQtN-hbKhXxlWO2LVv3K9yXvteWY,25204
58
+ magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
59
+ magic_pdf/model/pdf_extract_kit.py,sha256=Z7zzU_lkVR0vgycpeqVe1pwLc4svYThIUSEdTJVLVNM,12287
62
60
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
61
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
- magic_pdf/model/sub_modules/model_init.py,sha256=CnlZLsiSOmGJXQRASH-hMmuPiF6hYKCNfmzDTjQqy5g,5073
65
- magic_pdf/model/sub_modules/model_utils.py,sha256=ToiuwXbrvH_CPIwW2AXzz9miadUN5FA7lthwBljtIco,2118
62
+ magic_pdf/model/sub_modules/model_init.py,sha256=-sVlsOhSjLakOOmw675iDdSQSBW6Py5U0K0XiM3UpvU,6423
63
+ magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
64
+ magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
65
+ magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
66
+ magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=GW_9WkqIzpJm1MFJexZ2ZvA6AjoqM-6yh8p4LupJhas,4762
67
+ magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
66
68
  magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
67
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
69
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=5DXZhbkLrycF3FGF8OMuuHGJtTHLSkTGetxxi5KWDgw,2189
68
70
  magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
69
71
  magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
72
  magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
@@ -87,15 +89,15 @@ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/model
87
89
  magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
88
90
  magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
89
91
  magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
90
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
92
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
91
93
  magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
94
  magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
95
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=Jgi38JDo6D6sVVnBJ1XZ-iAT9qjj5jW__NL-8GKJb78,5290
94
96
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
95
97
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
96
98
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=rwKphio9SZgiNgqASWOBWZIf6PPi3kvgQO_qJLc_diE,10726
98
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=d__xICejA_Q-Cz4cfajwroDjfA0dT4TL18XAFYYc4OQ,7265
99
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=nT47hOH0rG9_dm4FMU_UNWvoX1IRW0t7TPKQw5XfMQ4,12324
100
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=QBBeFN1iF7nj5gqQ5sQXjhpwy8lB4c96gubnRDBuDNU,8424
99
101
  magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
100
102
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
101
103
  magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -104,44 +106,41 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
104
106
  magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
107
  magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
106
108
  magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
107
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
109
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=YsfgKEM0ETBBYsYmPdR9DAJIGeguK-oy9Pn25vS07CE,1953
108
110
  magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
109
111
  magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
110
112
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
113
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
112
- magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
113
- magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
114
- magic_pdf/pipe/AbsPipe.py,sha256=jPtAa0pz_vPddya3ZpUk6UrGqp8PcBdLONO1spzavQo,4371
115
- magic_pdf/pipe/OCRPipe.py,sha256=nuN-zpUzu--gyrC0_vsvvilAyK7Mp3Tom_UOnsur1ps,2158
116
- magic_pdf/pipe/TXTPipe.py,sha256=5OFo2e8U5Y24wJrFDEJghBDpklnKFEnzKTYVnnhQssE,2159
117
- magic_pdf/pipe/UNIPipe.py,sha256=ik0xXPdsHo7Un0gFpLC5ul04BP3Omd2mp5gqem40deE,4807
118
- magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
115
+ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
116
+ magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
117
+ magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
118
+ magic_pdf/post_proc/llm_aided.py,sha256=6eKZAfc0Vk_wX7NyYDle71rf1WWa2-7ZKXQ_Vm7Pem8,4722
119
+ magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
119
120
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
121
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
121
122
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
122
123
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
123
124
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
124
- magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
125
- magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
125
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=MVRO_GLsOtmsnj77veH3_QToU9A3gjq7qC6zt73Af1s,3101
126
+ magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
126
127
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
127
128
  magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
128
129
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
129
130
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
130
- magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
131
- magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
132
- magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
133
- magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
+ magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt,sha256=dV4lcudF8wimEbAooYbvISvFhrXjp9i0rMRqv9VW6hY,3204667
134
132
  magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
135
133
  magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
136
134
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
- magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
135
+ magic_pdf/tools/cli.py,sha256=YiX9LU4UeU3yYIpblGO1cbO95Tbo3A8cmWFK_1WvqfU,4134
138
136
  magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
139
- magic_pdf/tools/common.py,sha256=ILTv8YjnK-XTVV5nzak3Sm-EJJXjG1hJJghlYKgYVBQ,6809
137
+ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,8381
140
138
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
141
139
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
142
- magic_pdf-0.10.5.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
143
- magic_pdf-0.10.5.dist-info/METADATA,sha256=TIb8C_MrpU0_XwZc2dLfKpH5wQtE8G8Q0w56OPWYG30,36992
144
- magic_pdf-0.10.5.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
145
- magic_pdf-0.10.5.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
146
- magic_pdf-0.10.5.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
147
- magic_pdf-0.10.5.dist-info/RECORD,,
140
+ magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
141
+ magic_pdf-1.0.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
142
+ magic_pdf-1.0.0.dist-info/METADATA,sha256=itfWxjtkwtZt0xdUfRuDu0v6aXQUB8yEhA5tCimZkgc,40499
143
+ magic_pdf-1.0.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
144
+ magic_pdf-1.0.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
145
+ magic_pdf-1.0.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
146
+ magic_pdf-1.0.0.dist-info/RECORD,,
File without changes
magic_pdf/pdf_parse_by_ocr.py DELETED
@@ -1,23 +0,0 @@
1
- from magic_pdf.config.enums import SupportedPdfParseMethod
2
- from magic_pdf.data.dataset import PymuDocDataset
3
- from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
4
-
5
-
6
- def parse_pdf_by_ocr(pdf_bytes,
7
- model_list,
8
- imageWriter,
9
- start_page_id=0,
10
- end_page_id=None,
11
- debug_mode=False,
12
- lang=None,
13
- ):
14
- dataset = PymuDocDataset(pdf_bytes)
15
- return pdf_parse_union(dataset,
16
- model_list,
17
- imageWriter,
18
- SupportedPdfParseMethod.OCR,
19
- start_page_id=start_page_id,
20
- end_page_id=end_page_id,
21
- debug_mode=debug_mode,
22
- lang=lang,
23
- )
magic_pdf/pdf_parse_by_txt.py DELETED
@@ -1,24 +0,0 @@
1
- from magic_pdf.config.enums import SupportedPdfParseMethod
2
- from magic_pdf.data.dataset import PymuDocDataset
3
- from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
4
-
5
-
6
- def parse_pdf_by_txt(
7
- pdf_bytes,
8
- model_list,
9
- imageWriter,
10
- start_page_id=0,
11
- end_page_id=None,
12
- debug_mode=False,
13
- lang=None,
14
- ):
15
- dataset = PymuDocDataset(pdf_bytes)
16
- return pdf_parse_union(dataset,
17
- model_list,
18
- imageWriter,
19
- SupportedPdfParseMethod.TXT,
20
- start_page_id=start_page_id,
21
- end_page_id=end_page_id,
22
- debug_mode=debug_mode,
23
- lang=lang,
24
- )
magic_pdf/pipe/AbsPipe.py DELETED
@@ -1,98 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
- from magic_pdf.config.drop_reason import DropReason
4
- from magic_pdf.config.make_content_config import DropMode, MakeMode
5
- from magic_pdf.data.data_reader_writer import DataWriter
6
- from magic_pdf.dict2md.ocr_mkcontent import union_make
7
- from magic_pdf.filter.pdf_classify_by_type import classify
8
- from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
9
- from magic_pdf.libs.json_compressor import JsonCompressor
10
-
11
-
12
- class AbsPipe(ABC):
13
- """txt和ocr处理的抽象类."""
14
- PIP_OCR = 'ocr'
15
- PIP_TXT = 'txt'
16
-
17
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
18
- start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
19
- self.pdf_bytes = pdf_bytes
20
- self.model_list = model_list
21
- self.image_writer = image_writer
22
- self.pdf_mid_data = None # 未压缩
23
- self.is_debug = is_debug
24
- self.start_page_id = start_page_id
25
- self.end_page_id = end_page_id
26
- self.lang = lang
27
- self.layout_model = layout_model
28
- self.formula_enable = formula_enable
29
- self.table_enable = table_enable
30
-
31
- def get_compress_pdf_mid_data(self):
32
- return JsonCompressor.compress_json(self.pdf_mid_data)
33
-
34
- @abstractmethod
35
- def pipe_classify(self):
36
- """有状态的分类."""
37
- raise NotImplementedError
38
-
39
- @abstractmethod
40
- def pipe_analyze(self):
41
- """有状态的跑模型分析."""
42
- raise NotImplementedError
43
-
44
- @abstractmethod
45
- def pipe_parse(self):
46
- """有状态的解析."""
47
- raise NotImplementedError
48
-
49
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
50
- content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
51
- return content_list
52
-
53
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
54
- md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
55
- return md_content
56
-
57
- @staticmethod
58
- def classify(pdf_bytes: bytes) -> str:
59
- """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
60
- pdf_meta = pdf_meta_scan(pdf_bytes)
61
- if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
62
- raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
63
- else:
64
- is_encrypted = pdf_meta['is_encrypted']
65
- is_needs_password = pdf_meta['is_needs_password']
66
- if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
67
- raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
68
- else:
69
- is_text_pdf, results = classify(
70
- pdf_meta['total_page'],
71
- pdf_meta['page_width_pts'],
72
- pdf_meta['page_height_pts'],
73
- pdf_meta['image_info_per_page'],
74
- pdf_meta['text_len_per_page'],
75
- pdf_meta['imgs_per_page'],
76
- pdf_meta['text_layout_per_page'],
77
- pdf_meta['invalid_chars'],
78
- )
79
- if is_text_pdf:
80
- return AbsPipe.PIP_TXT
81
- else:
82
- return AbsPipe.PIP_OCR
83
-
84
- @staticmethod
85
- def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
86
- """根据pdf类型,生成统一格式content_list."""
87
- pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
88
- pdf_info_list = pdf_mid_data['pdf_info']
89
- content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
90
- return content_list
91
-
92
- @staticmethod
93
- def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
94
- """根据pdf类型,markdown."""
95
- pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
96
- pdf_info_list = pdf_mid_data['pdf_info']
97
- md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
98
- return md_content
magic_pdf/pipe/OCRPipe.py DELETED
@@ -1,41 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.make_content_config import DropMode, MakeMode
4
- from magic_pdf.data.data_reader_writer import DataWriter
5
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
6
- from magic_pdf.pipe.AbsPipe import AbsPipe
7
- from magic_pdf.user_api import parse_ocr_pdf
8
-
9
-
10
- class OCRPipe(AbsPipe):
11
-
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
- start_page_id=0, end_page_id=None, lang=None,
14
- layout_model=None, formula_enable=None, table_enable=None):
15
- super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
16
- layout_model, formula_enable, table_enable)
17
-
18
- def pipe_classify(self):
19
- pass
20
-
21
- def pipe_analyze(self):
22
- self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
23
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
24
- lang=self.lang, layout_model=self.layout_model,
25
- formula_enable=self.formula_enable, table_enable=self.table_enable)
26
-
27
- def pipe_parse(self):
28
- self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
29
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
30
- lang=self.lang, layout_model=self.layout_model,
31
- formula_enable=self.formula_enable, table_enable=self.table_enable)
32
-
33
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
34
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
35
- logger.info('ocr_pipe mk content list finished')
36
- return result
37
-
38
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
39
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
40
- logger.info(f'ocr_pipe mk {md_make_mode} finished')
41
- return result
magic_pdf/pipe/TXTPipe.py DELETED
@@ -1,41 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.make_content_config import DropMode, MakeMode
4
- from magic_pdf.data.data_reader_writer import DataWriter
5
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
6
- from magic_pdf.pipe.AbsPipe import AbsPipe
7
- from magic_pdf.user_api import parse_txt_pdf
8
-
9
-
10
- class TXTPipe(AbsPipe):
11
-
12
- def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
13
- start_page_id=0, end_page_id=None, lang=None,
14
- layout_model=None, formula_enable=None, table_enable=None):
15
- super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
16
- layout_model, formula_enable, table_enable)
17
-
18
- def pipe_classify(self):
19
- pass
20
-
21
- def pipe_analyze(self):
22
- self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
23
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
24
- lang=self.lang, layout_model=self.layout_model,
25
- formula_enable=self.formula_enable, table_enable=self.table_enable)
26
-
27
- def pipe_parse(self):
28
- self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
29
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
30
- lang=self.lang, layout_model=self.layout_model,
31
- formula_enable=self.formula_enable, table_enable=self.table_enable)
32
-
33
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
34
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
35
- logger.info('txt_pipe mk content list finished')
36
- return result
37
-
38
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
39
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
40
- logger.info(f'txt_pipe mk {md_make_mode} finished')
41
- return result
magic_pdf/pipe/UNIPipe.py DELETED
@@ -1,98 +0,0 @@
1
- import json
2
-
3
- from loguru import logger
4
-
5
- from magic_pdf.config.make_content_config import DropMode, MakeMode
6
- from magic_pdf.data.data_reader_writer import DataWriter
7
- from magic_pdf.libs.commons import join_path
8
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
9
- from magic_pdf.pipe.AbsPipe import AbsPipe
10
- from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
11
-
12
-
13
- class UNIPipe(AbsPipe):
14
-
15
- def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
16
- start_page_id=0, end_page_id=None, lang=None,
17
- layout_model=None, formula_enable=None, table_enable=None):
18
- self.pdf_type = jso_useful_key['_pdf_type']
19
- super().__init__(pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug, start_page_id, end_page_id,
20
- lang, layout_model, formula_enable, table_enable)
21
- if len(self.model_list) == 0:
22
- self.input_model_is_empty = True
23
- else:
24
- self.input_model_is_empty = False
25
-
26
- def pipe_classify(self):
27
- self.pdf_type = AbsPipe.classify(self.pdf_bytes)
28
-
29
- def pipe_analyze(self):
30
- if self.pdf_type == self.PIP_TXT:
31
- self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
32
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
33
- lang=self.lang, layout_model=self.layout_model,
34
- formula_enable=self.formula_enable, table_enable=self.table_enable)
35
- elif self.pdf_type == self.PIP_OCR:
36
- self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
37
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
38
- lang=self.lang, layout_model=self.layout_model,
39
- formula_enable=self.formula_enable, table_enable=self.table_enable)
40
-
41
- def pipe_parse(self):
42
- if self.pdf_type == self.PIP_TXT:
43
- self.pdf_mid_data = parse_union_pdf(self.pdf_bytes, self.model_list, self.image_writer,
44
- is_debug=self.is_debug, input_model_is_empty=self.input_model_is_empty,
45
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
46
- lang=self.lang, layout_model=self.layout_model,
47
- formula_enable=self.formula_enable, table_enable=self.table_enable)
48
- elif self.pdf_type == self.PIP_OCR:
49
- self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer,
50
- is_debug=self.is_debug,
51
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
52
- lang=self.lang)
53
-
54
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
55
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
56
- logger.info('uni_pipe mk content list finished')
57
- return result
58
-
59
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
60
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
61
- logger.info(f'uni_pipe mk {md_make_mode} finished')
62
- return result
63
-
64
-
65
- if __name__ == '__main__':
66
- # 测试
67
- from magic_pdf.data.data_reader_writer import DataReader
68
- drw = DataReader(r'D:/project/20231108code-clean')
69
-
70
- pdf_file_path = r'linshixuqiu\19983-00.pdf'
71
- model_file_path = r'linshixuqiu\19983-00.json'
72
- pdf_bytes = drw.read(pdf_file_path)
73
- model_json_txt = drw.read(model_file_path).decode()
74
- model_list = json.loads(model_json_txt)
75
- write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
76
- img_bucket_path = 'imgs'
77
- img_writer = DataWriter(join_path(write_path, img_bucket_path))
78
-
79
- # pdf_type = UNIPipe.classify(pdf_bytes)
80
- # jso_useful_key = {
81
- # "_pdf_type": pdf_type,
82
- # "model_list": model_list
83
- # }
84
-
85
- jso_useful_key = {
86
- '_pdf_type': '',
87
- 'model_list': model_list
88
- }
89
- pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
90
- pipe.pipe_classify()
91
- pipe.pipe_parse()
92
- md_content = pipe.pipe_mk_markdown(img_bucket_path)
93
- content_list = pipe.pipe_mk_uni_format(img_bucket_path)
94
-
95
- md_writer = DataWriter(write_path)
96
- md_writer.write_string('19983-00.md', md_content)
97
- md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
98
- md_writer.write_string('19983-00.txt', str(content_list))
File without changes
@@ -1,17 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class AbsReaderWriter(ABC):
5
- MODE_TXT = "text"
6
- MODE_BIN = "binary"
7
- @abstractmethod
8
- def read(self, path: str, mode=MODE_TXT):
9
- raise NotImplementedError
10
-
11
- @abstractmethod
12
- def write(self, content: str, path: str, mode=MODE_TXT):
13
- raise NotImplementedError
14
-
15
- @abstractmethod
16
- def read_offset(self, path: str, offset=0, limit=None) -> bytes:
17
- raise NotImplementedError
@@ -1,74 +0,0 @@
1
- import os
2
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
3
- from loguru import logger
4
-
5
-
6
- class DiskReaderWriter(AbsReaderWriter):
7
- def __init__(self, parent_path, encoding="utf-8"):
8
- self.path = parent_path
9
- self.encoding = encoding
10
-
11
- def read(self, path, mode=AbsReaderWriter.MODE_TXT):
12
- if os.path.isabs(path):
13
- abspath = path
14
- else:
15
- abspath = os.path.join(self.path, path)
16
- if not os.path.exists(abspath):
17
- logger.error(f"file {abspath} not exists")
18
- raise Exception(f"file {abspath} no exists")
19
- if mode == AbsReaderWriter.MODE_TXT:
20
- with open(abspath, "r", encoding=self.encoding) as f:
21
- return f.read()
22
- elif mode == AbsReaderWriter.MODE_BIN:
23
- with open(abspath, "rb") as f:
24
- return f.read()
25
- else:
26
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
27
-
28
- def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
29
- if os.path.isabs(path):
30
- abspath = path
31
- else:
32
- abspath = os.path.join(self.path, path)
33
- directory_path = os.path.dirname(abspath)
34
- if not os.path.exists(directory_path):
35
- os.makedirs(directory_path)
36
- if mode == AbsReaderWriter.MODE_TXT:
37
- with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
38
- f.write(content)
39
-
40
- elif mode == AbsReaderWriter.MODE_BIN:
41
- with open(abspath, "wb") as f:
42
- f.write(content)
43
- else:
44
- raise ValueError("Invalid mode. Use 'text' or 'binary'.")
45
-
46
- def read_offset(self, path: str, offset=0, limit=None):
47
- abspath = path
48
- if not os.path.isabs(path):
49
- abspath = os.path.join(self.path, path)
50
- with open(abspath, "rb") as f:
51
- f.seek(offset)
52
- return f.read(limit)
53
-
54
-
55
- if __name__ == "__main__":
56
- if 0:
57
- file_path = "io/test/example.txt"
58
- drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
59
-
60
- # 写入内容到文件
61
- drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
62
-
63
- # 从文件读取内容
64
- content = drw.read(path=file_path)
65
- if content:
66
- logger.info(f"从 {file_path} 读取的内容: {content}")
67
- if 1:
68
- drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
69
- content_bin = drw.read_offset("1.txt")
70
- assert content_bin == b"ABCD!"
71
-
72
- content_bin = drw.read_offset("1.txt", offset=1, limit=2)
73
- assert content_bin == b"BC"
74
-