magic-pdf 0.10.6__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. magic_pdf/config/constants.py +2 -0
  2. magic_pdf/config/exceptions.py +7 -0
  3. magic_pdf/data/data_reader_writer/filebase.py +1 -1
  4. magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
  5. magic_pdf/data/dataset.py +13 -1
  6. magic_pdf/data/read_api.py +59 -12
  7. magic_pdf/data/utils.py +35 -0
  8. magic_pdf/dict2md/ocr_mkcontent.py +14 -13
  9. magic_pdf/libs/clean_memory.py +11 -4
  10. magic_pdf/libs/config_reader.py +9 -0
  11. magic_pdf/libs/draw_bbox.py +8 -12
  12. magic_pdf/libs/language.py +3 -0
  13. magic_pdf/libs/version.py +1 -1
  14. magic_pdf/model/__init__.py +1 -125
  15. magic_pdf/model/batch_analyze.py +275 -0
  16. magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
  17. magic_pdf/model/magic_model.py +4 -435
  18. magic_pdf/model/model_list.py +1 -0
  19. magic_pdf/model/pdf_extract_kit.py +33 -22
  20. magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
  21. magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
  22. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
  23. magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
  24. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
  25. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
  26. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
  27. magic_pdf/model/sub_modules/model_init.py +30 -4
  28. magic_pdf/model/sub_modules/model_utils.py +8 -2
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
  31. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
  32. magic_pdf/operators/__init__.py +94 -0
  33. magic_pdf/{model/operators.py → operators/models.py} +2 -38
  34. magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
  35. magic_pdf/pdf_parse_union_core_v2.py +71 -17
  36. magic_pdf/post_proc/__init__.py +1 -0
  37. magic_pdf/post_proc/llm_aided.py +133 -0
  38. magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
  39. magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
  40. magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
  41. magic_pdf/tools/cli.py +36 -11
  42. magic_pdf/tools/common.py +28 -18
  43. magic_pdf/utils/office_to_pdf.py +29 -0
  44. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/METADATA +73 -23
  45. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/RECORD +50 -53
  46. magic_pdf/para/__init__.py +0 -0
  47. magic_pdf/pdf_parse_by_ocr.py +0 -22
  48. magic_pdf/pdf_parse_by_txt.py +0 -23
  49. magic_pdf/pipe/AbsPipe.py +0 -99
  50. magic_pdf/pipe/OCRPipe.py +0 -80
  51. magic_pdf/pipe/TXTPipe.py +0 -42
  52. magic_pdf/pipe/UNIPipe.py +0 -150
  53. magic_pdf/pipe/__init__.py +0 -0
  54. magic_pdf/rw/AbsReaderWriter.py +0 -17
  55. magic_pdf/rw/DiskReaderWriter.py +0 -74
  56. magic_pdf/rw/S3ReaderWriter.py +0 -142
  57. magic_pdf/rw/__init__.py +0 -0
  58. magic_pdf/user_api.py +0 -144
  59. /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
  60. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/LICENSE.md +0 -0
  61. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/WHEEL +0 -0
  62. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,33 +1,30 @@
1
1
  magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- magic_pdf/pdf_parse_by_ocr.py,sha256=WFk6jhHSGvy8-hU2Qlpo5q-VORdSK_5Erh9IA_H7ZbQ,840
3
- magic_pdf/pdf_parse_by_txt.py,sha256=1-xieVOP8qmAC957ftzSzaeviv0-QC4yL6Lv6Pcg_6Y,722
4
- magic_pdf/pdf_parse_union_core_v2.py,sha256=Hl8PSJOJFHAGCdTwX4YY2_MMgjAuat47yALLb_E-DYg,30879
5
- magic_pdf/user_api.py,sha256=EAalk3WfQTfBq4qKMcISuHSjQg2Ku61ox_WiOPeFfuY,4060
2
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=wjc9Ev7F-FV3UQQkpY_hRcBMC8xczZqUrUd7s6qJrLk,33473
6
3
  magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
- magic_pdf/config/constants.py,sha256=CEhNtP8o_2zcK6DesO6cNDlpS9fUdRv-QUyHw0_vsso,1222
4
+ magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
8
5
  magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
9
6
  magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
10
7
  magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
11
- magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zkE,622
8
+ magic_pdf/config/exceptions.py,sha256=2tsJxYUebVeimyYBGQkc9Nd1kIakTmWmz3SDcfJWy54,784
12
9
  magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
13
10
  magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
14
11
  magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
15
12
  magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
- magic_pdf/data/dataset.py,sha256=NpljxcttgRk4_Rl8Rf191t_vNIdbqIpK5x1xHAGE2iI,10686
17
- magic_pdf/data/read_api.py,sha256=hGpSVg9EcyM2mIlOsDIwsl7Y_ybWf9kkoxRumIXSzQQ,3566
13
+ magic_pdf/data/dataset.py,sha256=q7wfX99HTVjKCFVpf1mnYn55rK6oF5Dz8O9w4C9cYhw,11196
14
+ magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
18
15
  magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
19
- magic_pdf/data/utils.py,sha256=uaSHprh80D_puPUmd1slQDoE4uecNn4zZMzYWY0-a-8,917
16
+ magic_pdf/data/utils.py,sha256=aMeQB3soGUJyoI41hfgWeOZNzPj36SOrewUM7z51AOU,2305
20
17
  magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
21
18
  magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
22
- magic_pdf/data/data_reader_writer/filebase.py,sha256=iVjBT1M_89F5HB2uMsBzLfCNhsTVnmurJk4VUuAA2tw,2111
23
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
19
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=VbNAxLyo0Io0j7iprJERt_TqxzHAtA7cUyPIaJstToU,2146
20
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=4pEJ8PPd3nX7sccHobCs0mbDM8BiqDP_sAEz7CIvpNI,5938
24
21
  magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
25
22
  magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
26
23
  magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
27
24
  magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
28
25
  magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
29
26
  magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
- magic_pdf/dict2md/ocr_mkcontent.py,sha256=zmEbxuIdFPfy3W72Zx_EEgyYtIOKcTa-0JoXHgXkEJ8,13046
27
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=RQ47F2CT0Zgmg1rZoqYj5IW5msqoCTEF6GEHi3mVd8U,12989
31
28
  magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
32
29
  magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
33
30
  magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
@@ -38,34 +35,38 @@ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCal
38
35
  magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
39
36
  magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
37
  magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
41
- magic_pdf/libs/clean_memory.py,sha256=BIOmEWuwR7c_p4OwTSW2muE3PRaGhmOplS-wTXt_EXk,211
38
+ magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
42
39
  magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
43
- magic_pdf/libs/config_reader.py,sha256=vDsxw2xbW7Gb1mKqERTSlttbXFNtVU0BDdae2dG7wEI,4068
40
+ magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
44
41
  magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
45
42
  magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
46
- magic_pdf/libs/draw_bbox.py,sha256=Z7-OOETUo90yj3tCV8MwbiJwckThcC0bjs4MXI9ocac,17561
43
+ magic_pdf/libs/draw_bbox.py,sha256=RX_ELX6P8mF0sIBx_h2A3BzhevcSPIzbbrboZTBlBik,17653
47
44
  magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
48
45
  magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
49
- magic_pdf/libs/language.py,sha256=Hj5-lrGoNExxdHLbkcNG-c27U4AjJ9AZPdZblaNSehU,1099
46
+ magic_pdf/libs/language.py,sha256=w1GVYmlocF7DQjtZrspgH6WacoWazOQBzz-iQx0mSBk,1135
50
47
  magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
51
48
  magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
52
49
  magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
53
50
  magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
54
51
  magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
55
52
  magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
56
- magic_pdf/libs/version.py,sha256=7qmFu9Qmzy5OxKJPN-LQOkzV_2T4cJYrUSLTfq7F3kE,23
57
- magic_pdf/model/__init__.py,sha256=R6uhAQucHJa87V81ahYHWEffG0-3F1792J4kaSxZpi8,3698
58
- magic_pdf/model/doc_analyze_by_custom_model.py,sha256=KAPRDgWUAzsXbofZ6i0ll9eaanPdPnfjM1nn4Pl8-Zo,7588
59
- magic_pdf/model/magic_model.py,sha256=ppMkMqtP7sKncHTZ2SbXuPOoR988iRPexBEMA6QeiIc,42208
60
- magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
61
- magic_pdf/model/operators.py,sha256=qcacETf6j-gDUj9g0zYJgBrkq0YWe6ZlfoPjJhCMUYU,6628
62
- magic_pdf/model/pdf_extract_kit.py,sha256=6JdWkdKOgL9UyAlI5znPMexs0AMZzn1SgrIpJUxWiGs,11839
53
+ magic_pdf/libs/version.py,sha256=d4QHYmS_30j0hPN8NmNPnQ_Z0TphDRbu4MtQj9cT9e8,22
54
+ magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
55
+ magic_pdf/model/batch_analyze.py,sha256=A49qD5zY9G8nl6wnpMLATqS4_xOOgRvjo1Eq6v_mcUE,11551
56
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=AZSzqzGz5utLwuysF5JY5k5pYcJGtcppNP2BbvExLnA,5989
57
+ magic_pdf/model/magic_model.py,sha256=Nt74oZGYUcbm4qdOQtN-hbKhXxlWO2LVv3K9yXvteWY,25204
58
+ magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
59
+ magic_pdf/model/pdf_extract_kit.py,sha256=Z7zzU_lkVR0vgycpeqVe1pwLc4svYThIUSEdTJVLVNM,12287
63
60
  magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
64
61
  magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
65
- magic_pdf/model/sub_modules/model_init.py,sha256=Sp4I2tQ2oFsTIBRHXv8-44WU1PvPSx4L3VfwnQUaRFo,5438
66
- magic_pdf/model/sub_modules/model_utils.py,sha256=svV5bn_Xw3QqSa22h7OrmlQQQySSqe3DdE6KMEURr2c,2219
62
+ magic_pdf/model/sub_modules/model_init.py,sha256=-sVlsOhSjLakOOmw675iDdSQSBW6Py5U0K0XiM3UpvU,6423
63
+ magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
64
+ magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
65
+ magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
66
+ magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=GW_9WkqIzpJm1MFJexZ2ZvA6AjoqM-6yh8p4LupJhas,4762
67
+ magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
67
68
  magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
68
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=roe6Rth6cvBrCw0MWXcj1CBjvK3S_Ni7GC4DxY4-yBQ,886
69
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=5DXZhbkLrycF3FGF8OMuuHGJtTHLSkTGetxxi5KWDgw,2189
69
70
  magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
71
  magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
72
  magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
@@ -88,15 +89,15 @@ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/model
88
89
  magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
89
90
  magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
90
91
  magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
91
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=A0eABWvJLyRH6kENWU31g66D2QQos12S0hEmbOuoB0g,347
92
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
92
93
  magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
93
94
  magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=jeJkqID6L1ZivPMdK1PgpFrE0RcmCRl19oXbudxwgXc,3528
95
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=Jgi38JDo6D6sVVnBJ1XZ-iAT9qjj5jW__NL-8GKJb78,5290
95
96
  magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
96
97
  magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
98
  magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
98
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=rwKphio9SZgiNgqASWOBWZIf6PPi3kvgQO_qJLc_diE,10726
99
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=d__xICejA_Q-Cz4cfajwroDjfA0dT4TL18XAFYYc4OQ,7265
99
+ magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=nT47hOH0rG9_dm4FMU_UNWvoX1IRW0t7TPKQw5XfMQ4,12324
100
+ magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=QBBeFN1iF7nj5gqQ5sQXjhpwy8lB4c96gubnRDBuDNU,8424
100
101
  magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
101
102
  magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
102
103
  magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -105,45 +106,41 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
105
106
  magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
106
107
  magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
107
108
  magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=_FKKOSKeceusx6DCnhqYzP-4b1zSWptrefimxFTmy8Q,583
109
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=YsfgKEM0ETBBYsYmPdR9DAJIGeguK-oy9Pn25vS07CE,1953
109
110
  magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
111
  magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
111
112
  magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
112
113
  magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
113
- magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
- magic_pdf/para/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
115
- magic_pdf/pipe/AbsPipe.py,sha256=_Lx4Ags5suEvmJEvgHEvg6n0RP4Yqjc1VBWaCP0la2o,4410
116
- magic_pdf/pipe/OCRPipe.py,sha256=nH21Rq7mQEw7pS7AVD2MRFdSE0DxGc1wk9VXB6T0m3A,2396
117
- magic_pdf/pipe/TXTPipe.py,sha256=JXJ7hzD7TNq5VnCt33dck2FM15GpozJoHibaRlYD14s,2196
118
- magic_pdf/pipe/UNIPipe.py,sha256=i0kWflZ5BFHrx8p8vDntRcN6jecaxOfGq11ANtYvrZY,5011
119
- magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
120
- magic_pdf/pipe/operators.py,sha256=5z7kF95IWyBGxs4tIhqJml2YMlfDkU9B5xy__NiUxz0,4962
114
+ magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
115
+ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
116
+ magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
117
+ magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
118
+ magic_pdf/post_proc/llm_aided.py,sha256=6eKZAfc0Vk_wX7NyYDle71rf1WWa2-7ZKXQ_Vm7Pem8,4722
119
+ magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
121
120
  magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
122
121
  magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
123
122
  magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
124
123
  magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
125
124
  magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
126
- magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=a5OmIwtkXkz6fTQg6p8R-f1nA_w0rgMwKFQjfs_HwrE,2864
127
- magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=8eXNdsz9s06LX0kS0AxYSkaY1tWQQMkJfVtVSdjTQNE,3090
125
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=MVRO_GLsOtmsnj77veH3_QToU9A3gjq7qC6zt73Af1s,3101
126
+ magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
128
127
  magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
129
128
  magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
130
129
  magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
131
130
  magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
132
- magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
133
- magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
134
- magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
135
- magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
131
+ magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt,sha256=dV4lcudF8wimEbAooYbvISvFhrXjp9i0rMRqv9VW6hY,3204667
136
132
  magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
137
133
  magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
138
134
  magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
139
- magic_pdf/tools/cli.py,sha256=83a8p4_DvVdDOTuviE6WqexSXsDE_MUY-af3QDxXeoU,3067
135
+ magic_pdf/tools/cli.py,sha256=YiX9LU4UeU3yYIpblGO1cbO95Tbo3A8cmWFK_1WvqfU,4134
140
136
  magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
141
- magic_pdf/tools/common.py,sha256=x3dNHT9wEpdmkkEb4Y70DmUMMPavre5C82T0v9OmA2g,7894
137
+ magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,8381
142
138
  magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
139
  magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
144
- magic_pdf-0.10.6.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
145
- magic_pdf-0.10.6.dist-info/METADATA,sha256=CbT8tghajhhMHEawiHakbU-ndjeJ_J9J1011PFoYDbA,37144
146
- magic_pdf-0.10.6.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
147
- magic_pdf-0.10.6.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
148
- magic_pdf-0.10.6.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
149
- magic_pdf-0.10.6.dist-info/RECORD,,
140
+ magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
141
+ magic_pdf-1.0.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
142
+ magic_pdf-1.0.1.dist-info/METADATA,sha256=DC6l1zz6GIZxALaEdhdjyqzxDZ8784tTZYut_l2FFoQ,40499
143
+ magic_pdf-1.0.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
144
+ magic_pdf-1.0.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
145
+ magic_pdf-1.0.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
146
+ magic_pdf-1.0.1.dist-info/RECORD,,
File without changes
@@ -1,22 +0,0 @@
1
- from magic_pdf.config.enums import SupportedPdfParseMethod
2
- from magic_pdf.data.dataset import Dataset
3
- from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
4
-
5
-
6
- def parse_pdf_by_ocr(dataset: Dataset,
7
- model_list,
8
- imageWriter,
9
- start_page_id=0,
10
- end_page_id=None,
11
- debug_mode=False,
12
- lang=None,
13
- ):
14
- return pdf_parse_union(model_list,
15
- dataset,
16
- imageWriter,
17
- SupportedPdfParseMethod.OCR,
18
- start_page_id=start_page_id,
19
- end_page_id=end_page_id,
20
- debug_mode=debug_mode,
21
- lang=lang,
22
- )
@@ -1,23 +0,0 @@
1
- from magic_pdf.config.enums import SupportedPdfParseMethod
2
- from magic_pdf.data.dataset import Dataset
3
- from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
4
-
5
-
6
- def parse_pdf_by_txt(
7
- dataset: Dataset,
8
- model_list,
9
- imageWriter,
10
- start_page_id=0,
11
- end_page_id=None,
12
- debug_mode=False,
13
- lang=None,
14
- ):
15
- return pdf_parse_union(model_list,
16
- dataset,
17
- imageWriter,
18
- SupportedPdfParseMethod.TXT,
19
- start_page_id=start_page_id,
20
- end_page_id=end_page_id,
21
- debug_mode=debug_mode,
22
- lang=lang,
23
- )
magic_pdf/pipe/AbsPipe.py DELETED
@@ -1,99 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
- from magic_pdf.config.drop_reason import DropReason
4
- from magic_pdf.config.make_content_config import DropMode, MakeMode
5
- from magic_pdf.data.data_reader_writer import DataWriter
6
- from magic_pdf.data.dataset import Dataset
7
- from magic_pdf.dict2md.ocr_mkcontent import union_make
8
- from magic_pdf.filter.pdf_classify_by_type import classify
9
- from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
10
- from magic_pdf.libs.json_compressor import JsonCompressor
11
-
12
-
13
- class AbsPipe(ABC):
14
- """txt和ocr处理的抽象类."""
15
- PIP_OCR = 'ocr'
16
- PIP_TXT = 'txt'
17
-
18
- def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
19
- start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
20
- self.dataset = Dataset
21
- self.model_list = model_list
22
- self.image_writer = image_writer
23
- self.pdf_mid_data = None # 未压缩
24
- self.is_debug = is_debug
25
- self.start_page_id = start_page_id
26
- self.end_page_id = end_page_id
27
- self.lang = lang
28
- self.layout_model = layout_model
29
- self.formula_enable = formula_enable
30
- self.table_enable = table_enable
31
-
32
- def get_compress_pdf_mid_data(self):
33
- return JsonCompressor.compress_json(self.pdf_mid_data)
34
-
35
- @abstractmethod
36
- def pipe_classify(self):
37
- """有状态的分类."""
38
- raise NotImplementedError
39
-
40
- @abstractmethod
41
- def pipe_analyze(self):
42
- """有状态的跑模型分析."""
43
- raise NotImplementedError
44
-
45
- @abstractmethod
46
- def pipe_parse(self):
47
- """有状态的解析."""
48
- raise NotImplementedError
49
-
50
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
51
- content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
52
- return content_list
53
-
54
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
55
- md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
56
- return md_content
57
-
58
- @staticmethod
59
- def classify(pdf_bytes: bytes) -> str:
60
- """根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
61
- pdf_meta = pdf_meta_scan(pdf_bytes)
62
- if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
63
- raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
64
- else:
65
- is_encrypted = pdf_meta['is_encrypted']
66
- is_needs_password = pdf_meta['is_needs_password']
67
- if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
68
- raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
69
- else:
70
- is_text_pdf, results = classify(
71
- pdf_meta['total_page'],
72
- pdf_meta['page_width_pts'],
73
- pdf_meta['page_height_pts'],
74
- pdf_meta['image_info_per_page'],
75
- pdf_meta['text_len_per_page'],
76
- pdf_meta['imgs_per_page'],
77
- pdf_meta['text_layout_per_page'],
78
- pdf_meta['invalid_chars'],
79
- )
80
- if is_text_pdf:
81
- return AbsPipe.PIP_TXT
82
- else:
83
- return AbsPipe.PIP_OCR
84
-
85
- @staticmethod
86
- def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
87
- """根据pdf类型,生成统一格式content_list."""
88
- pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
89
- pdf_info_list = pdf_mid_data['pdf_info']
90
- content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
91
- return content_list
92
-
93
- @staticmethod
94
- def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
95
- """根据pdf类型,markdown."""
96
- pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
97
- pdf_info_list = pdf_mid_data['pdf_info']
98
- md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
99
- return md_content
magic_pdf/pipe/OCRPipe.py DELETED
@@ -1,80 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.make_content_config import DropMode, MakeMode
4
- from magic_pdf.data.data_reader_writer import DataWriter
5
- from magic_pdf.data.dataset import Dataset
6
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
7
- from magic_pdf.pipe.AbsPipe import AbsPipe
8
- from magic_pdf.user_api import parse_ocr_pdf
9
-
10
-
11
- class OCRPipe(AbsPipe):
12
- def __init__(
13
- self,
14
- dataset: Dataset,
15
- model_list: list,
16
- image_writer: DataWriter,
17
- is_debug: bool = False,
18
- start_page_id=0,
19
- end_page_id=None,
20
- lang=None,
21
- layout_model=None,
22
- formula_enable=None,
23
- table_enable=None,
24
- ):
25
- super().__init__(
26
- dataset,
27
- model_list,
28
- image_writer,
29
- is_debug,
30
- start_page_id,
31
- end_page_id,
32
- lang,
33
- layout_model,
34
- formula_enable,
35
- table_enable,
36
- )
37
-
38
- def pipe_classify(self):
39
- pass
40
-
41
- def pipe_analyze(self):
42
- self.infer_res = doc_analyze(
43
- self.dataset,
44
- ocr=True,
45
- start_page_id=self.start_page_id,
46
- end_page_id=self.end_page_id,
47
- lang=self.lang,
48
- layout_model=self.layout_model,
49
- formula_enable=self.formula_enable,
50
- table_enable=self.table_enable,
51
- )
52
-
53
- def pipe_parse(self):
54
- self.pdf_mid_data = parse_ocr_pdf(
55
- self.dataset,
56
- self.infer_res,
57
- self.image_writer,
58
- is_debug=self.is_debug,
59
- start_page_id=self.start_page_id,
60
- end_page_id=self.end_page_id,
61
- lang=self.lang,
62
- layout_model=self.layout_model,
63
- formula_enable=self.formula_enable,
64
- table_enable=self.table_enable,
65
- )
66
-
67
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
68
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
69
- logger.info('ocr_pipe mk content list finished')
70
- return result
71
-
72
- def pipe_mk_markdown(
73
- self,
74
- img_parent_path: str,
75
- drop_mode=DropMode.WHOLE_PDF,
76
- md_make_mode=MakeMode.MM_MD,
77
- ):
78
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
79
- logger.info(f'ocr_pipe mk {md_make_mode} finished')
80
- return result
magic_pdf/pipe/TXTPipe.py DELETED
@@ -1,42 +0,0 @@
1
- from loguru import logger
2
-
3
- from magic_pdf.config.make_content_config import DropMode, MakeMode
4
- from magic_pdf.data.data_reader_writer import DataWriter
5
- from magic_pdf.data.dataset import Dataset
6
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
7
- from magic_pdf.pipe.AbsPipe import AbsPipe
8
- from magic_pdf.user_api import parse_txt_pdf
9
-
10
-
11
- class TXTPipe(AbsPipe):
12
-
13
- def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
14
- start_page_id=0, end_page_id=None, lang=None,
15
- layout_model=None, formula_enable=None, table_enable=None):
16
- super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
17
- layout_model, formula_enable, table_enable)
18
-
19
- def pipe_classify(self):
20
- pass
21
-
22
- def pipe_analyze(self):
23
- self.model_list = doc_analyze(self.dataset, ocr=False,
24
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
25
- lang=self.lang, layout_model=self.layout_model,
26
- formula_enable=self.formula_enable, table_enable=self.table_enable)
27
-
28
- def pipe_parse(self):
29
- self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
30
- start_page_id=self.start_page_id, end_page_id=self.end_page_id,
31
- lang=self.lang, layout_model=self.layout_model,
32
- formula_enable=self.formula_enable, table_enable=self.table_enable)
33
-
34
- def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
35
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
36
- logger.info('txt_pipe mk content list finished')
37
- return result
38
-
39
- def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
40
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
41
- logger.info(f'txt_pipe mk {md_make_mode} finished')
42
- return result
magic_pdf/pipe/UNIPipe.py DELETED
@@ -1,150 +0,0 @@
1
- import json
2
-
3
- from loguru import logger
4
-
5
- from magic_pdf.config.make_content_config import DropMode, MakeMode
6
- from magic_pdf.data.data_reader_writer import DataWriter
7
- from magic_pdf.data.dataset import Dataset
8
- from magic_pdf.libs.commons import join_path
9
- from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
10
- from magic_pdf.pipe.AbsPipe import AbsPipe
11
- from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
12
-
13
-
14
- class UNIPipe(AbsPipe):
15
-
16
- def __init__(
17
- self,
18
- dataset: Dataset,
19
- jso_useful_key: dict,
20
- image_writer: DataWriter,
21
- is_debug: bool = False,
22
- start_page_id=0,
23
- end_page_id=None,
24
- lang=None,
25
- layout_model=None,
26
- formula_enable=None,
27
- table_enable=None,
28
- ):
29
- self.pdf_type = jso_useful_key['_pdf_type']
30
- super().__init__(
31
- dataset,
32
- jso_useful_key['model_list'],
33
- image_writer,
34
- is_debug,
35
- start_page_id,
36
- end_page_id,
37
- lang,
38
- layout_model,
39
- formula_enable,
40
- table_enable,
41
- )
42
- if len(self.model_list) == 0:
43
- self.input_model_is_empty = True
44
- else:
45
- self.input_model_is_empty = False
46
-
47
- def pipe_classify(self):
48
- self.pdf_type = AbsPipe.classify(self.pdf_bytes)
49
-
50
- def pipe_analyze(self):
51
- if self.pdf_type == self.PIP_TXT:
52
- self.model_list = doc_analyze(
53
- self.dataset,
54
- ocr=False,
55
- start_page_id=self.start_page_id,
56
- end_page_id=self.end_page_id,
57
- lang=self.lang,
58
- layout_model=self.layout_model,
59
- formula_enable=self.formula_enable,
60
- table_enable=self.table_enable,
61
- )
62
- elif self.pdf_type == self.PIP_OCR:
63
- self.model_list = doc_analyze(
64
- self.dataset,
65
- ocr=True,
66
- start_page_id=self.start_page_id,
67
- end_page_id=self.end_page_id,
68
- lang=self.lang,
69
- layout_model=self.layout_model,
70
- formula_enable=self.formula_enable,
71
- table_enable=self.table_enable,
72
- )
73
-
74
- def pipe_parse(self):
75
- if self.pdf_type == self.PIP_TXT:
76
- self.pdf_mid_data = parse_union_pdf(
77
- self.dataset,
78
- self.model_list,
79
- self.image_writer,
80
- is_debug=self.is_debug,
81
- start_page_id=self.start_page_id,
82
- end_page_id=self.end_page_id,
83
- lang=self.lang,
84
- layout_model=self.layout_model,
85
- formula_enable=self.formula_enable,
86
- table_enable=self.table_enable,
87
- )
88
- elif self.pdf_type == self.PIP_OCR:
89
- self.pdf_mid_data = parse_ocr_pdf(
90
- self.dataset,
91
- self.model_list,
92
- self.image_writer,
93
- is_debug=self.is_debug,
94
- start_page_id=self.start_page_id,
95
- end_page_id=self.end_page_id,
96
- lang=self.lang,
97
- )
98
-
99
- def pipe_mk_uni_format(
100
- self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
101
- ):
102
- result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
103
- logger.info('uni_pipe mk content list finished')
104
- return result
105
-
106
- def pipe_mk_markdown(
107
- self,
108
- img_parent_path: str,
109
- drop_mode=DropMode.WHOLE_PDF,
110
- md_make_mode=MakeMode.MM_MD,
111
- ):
112
- result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
113
- logger.info(f'uni_pipe mk {md_make_mode} finished')
114
- return result
115
-
116
-
117
- if __name__ == '__main__':
118
- # 测试
119
- from magic_pdf.data.data_reader_writer import DataReader
120
-
121
- drw = DataReader(r'D:/project/20231108code-clean')
122
-
123
- pdf_file_path = r'linshixuqiu\19983-00.pdf'
124
- model_file_path = r'linshixuqiu\19983-00.json'
125
- pdf_bytes = drw.read(pdf_file_path)
126
- model_json_txt = drw.read(model_file_path).decode()
127
- model_list = json.loads(model_json_txt)
128
- write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
129
- img_bucket_path = 'imgs'
130
- img_writer = DataWriter(join_path(write_path, img_bucket_path))
131
-
132
- # pdf_type = UNIPipe.classify(pdf_bytes)
133
- # jso_useful_key = {
134
- # "_pdf_type": pdf_type,
135
- # "model_list": model_list
136
- # }
137
-
138
- jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
139
- pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
140
- pipe.pipe_classify()
141
- pipe.pipe_parse()
142
- md_content = pipe.pipe_mk_markdown(img_bucket_path)
143
- content_list = pipe.pipe_mk_uni_format(img_bucket_path)
144
-
145
- md_writer = DataWriter(write_path)
146
- md_writer.write_string('19983-00.md', md_content)
147
- md_writer.write_string(
148
- '19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
149
- )
150
- md_writer.write_string('19983-00.txt', str(content_list))
File without changes
@@ -1,17 +0,0 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class AbsReaderWriter(ABC):
5
- MODE_TXT = "text"
6
- MODE_BIN = "binary"
7
- @abstractmethod
8
- def read(self, path: str, mode=MODE_TXT):
9
- raise NotImplementedError
10
-
11
- @abstractmethod
12
- def write(self, content: str, path: str, mode=MODE_TXT):
13
- raise NotImplementedError
14
-
15
- @abstractmethod
16
- def read_offset(self, path: str, offset=0, limit=None) -> bytes:
17
- raise NotImplementedError