magic-pdf 0.10.6__py3-none-any.whl → 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +2 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +13 -1
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +14 -13
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +8 -12
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -125
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +4 -51
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +33 -22
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +30 -4
- magic_pdf/model/sub_modules/model_utils.py +8 -2
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/{model/operators.py → operators/models.py} +2 -38
- magic_pdf/{pipe/operators.py → operators/pipes.py} +70 -17
- magic_pdf/pdf_parse_union_core_v2.py +71 -17
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +28 -18
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/METADATA +73 -23
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/RECORD +50 -53
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -22
- magic_pdf/pdf_parse_by_txt.py +0 -23
- magic_pdf/pipe/AbsPipe.py +0 -99
- magic_pdf/pipe/OCRPipe.py +0 -80
- magic_pdf/pipe/TXTPipe.py +0 -42
- magic_pdf/pipe/UNIPipe.py +0 -150
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -144
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.6.dist-info → magic_pdf-1.0.1.dist-info}/top_level.txt +0 -0
@@ -1,33 +1,30 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/
|
3
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=1-xieVOP8qmAC957ftzSzaeviv0-QC4yL6Lv6Pcg_6Y,722
|
4
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=Hl8PSJOJFHAGCdTwX4YY2_MMgjAuat47yALLb_E-DYg,30879
|
5
|
-
magic_pdf/user_api.py,sha256=EAalk3WfQTfBq4qKMcISuHSjQg2Ku61ox_WiOPeFfuY,4060
|
2
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=wjc9Ev7F-FV3UQQkpY_hRcBMC8xczZqUrUd7s6qJrLk,33473
|
6
3
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
magic_pdf/config/constants.py,sha256=
|
4
|
+
magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
|
8
5
|
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
9
6
|
magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
|
10
7
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
11
|
-
magic_pdf/config/exceptions.py,sha256=
|
8
|
+
magic_pdf/config/exceptions.py,sha256=2tsJxYUebVeimyYBGQkc9Nd1kIakTmWmz3SDcfJWy54,784
|
12
9
|
magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
|
13
10
|
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
14
11
|
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
15
12
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
magic_pdf/data/dataset.py,sha256=
|
17
|
-
magic_pdf/data/read_api.py,sha256=
|
13
|
+
magic_pdf/data/dataset.py,sha256=q7wfX99HTVjKCFVpf1mnYn55rK6oF5Dz8O9w4C9cYhw,11196
|
14
|
+
magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
|
18
15
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
19
|
-
magic_pdf/data/utils.py,sha256=
|
16
|
+
magic_pdf/data/utils.py,sha256=aMeQB3soGUJyoI41hfgWeOZNzPj36SOrewUM7z51AOU,2305
|
20
17
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
21
18
|
magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
|
22
|
-
magic_pdf/data/data_reader_writer/filebase.py,sha256=
|
23
|
-
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=
|
19
|
+
magic_pdf/data/data_reader_writer/filebase.py,sha256=VbNAxLyo0Io0j7iprJERt_TqxzHAtA7cUyPIaJstToU,2146
|
20
|
+
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=4pEJ8PPd3nX7sccHobCs0mbDM8BiqDP_sAEz7CIvpNI,5938
|
24
21
|
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
25
22
|
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
26
23
|
magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
|
27
24
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
28
25
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
29
26
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
27
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RQ47F2CT0Zgmg1rZoqYj5IW5msqoCTEF6GEHi3mVd8U,12989
|
31
28
|
magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
|
32
29
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
33
30
|
magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
|
@@ -38,34 +35,38 @@ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCal
|
|
38
35
|
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
39
36
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
37
|
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
41
|
-
magic_pdf/libs/clean_memory.py,sha256=
|
38
|
+
magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
|
42
39
|
magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
|
43
|
-
magic_pdf/libs/config_reader.py,sha256=
|
40
|
+
magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
|
44
41
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
45
42
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
46
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
43
|
+
magic_pdf/libs/draw_bbox.py,sha256=RX_ELX6P8mF0sIBx_h2A3BzhevcSPIzbbrboZTBlBik,17653
|
47
44
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
48
45
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
49
|
-
magic_pdf/libs/language.py,sha256=
|
46
|
+
magic_pdf/libs/language.py,sha256=w1GVYmlocF7DQjtZrspgH6WacoWazOQBzz-iQx0mSBk,1135
|
50
47
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
51
48
|
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
52
49
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
53
50
|
magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
|
54
51
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
52
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
57
|
-
magic_pdf/model/__init__.py,sha256=
|
58
|
-
magic_pdf/model/
|
59
|
-
magic_pdf/model/
|
60
|
-
magic_pdf/model/
|
61
|
-
magic_pdf/model/
|
62
|
-
magic_pdf/model/pdf_extract_kit.py,sha256=
|
53
|
+
magic_pdf/libs/version.py,sha256=d4QHYmS_30j0hPN8NmNPnQ_Z0TphDRbu4MtQj9cT9e8,22
|
54
|
+
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
55
|
+
magic_pdf/model/batch_analyze.py,sha256=A49qD5zY9G8nl6wnpMLATqS4_xOOgRvjo1Eq6v_mcUE,11551
|
56
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=AZSzqzGz5utLwuysF5JY5k5pYcJGtcppNP2BbvExLnA,5989
|
57
|
+
magic_pdf/model/magic_model.py,sha256=Nt74oZGYUcbm4qdOQtN-hbKhXxlWO2LVv3K9yXvteWY,25204
|
58
|
+
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
59
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=Z7zzU_lkVR0vgycpeqVe1pwLc4svYThIUSEdTJVLVNM,12287
|
63
60
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
64
61
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
65
|
-
magic_pdf/model/sub_modules/model_init.py,sha256
|
66
|
-
magic_pdf/model/sub_modules/model_utils.py,sha256=
|
62
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=-sVlsOhSjLakOOmw675iDdSQSBW6Py5U0K0XiM3UpvU,6423
|
63
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
|
64
|
+
magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
65
|
+
magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
|
66
|
+
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=GW_9WkqIzpJm1MFJexZ2ZvA6AjoqM-6yh8p4LupJhas,4762
|
67
|
+
magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
67
68
|
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
68
|
-
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=
|
69
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=5DXZhbkLrycF3FGF8OMuuHGJtTHLSkTGetxxi5KWDgw,2189
|
69
70
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
70
71
|
magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
71
72
|
magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
@@ -88,15 +89,15 @@ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/model
|
|
88
89
|
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
89
90
|
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
90
91
|
magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
91
|
-
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=
|
92
|
+
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
|
92
93
|
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
93
94
|
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
94
|
-
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=
|
95
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=Jgi38JDo6D6sVVnBJ1XZ-iAT9qjj5jW__NL-8GKJb78,5290
|
95
96
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
96
97
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
97
98
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
98
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
99
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
99
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=nT47hOH0rG9_dm4FMU_UNWvoX1IRW0t7TPKQw5XfMQ4,12324
|
100
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=QBBeFN1iF7nj5gqQ5sQXjhpwy8lB4c96gubnRDBuDNU,8424
|
100
101
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
101
102
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
102
103
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -105,45 +106,41 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
|
|
105
106
|
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
106
107
|
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
107
108
|
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
108
|
-
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=
|
109
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=YsfgKEM0ETBBYsYmPdR9DAJIGeguK-oy9Pn25vS07CE,1953
|
109
110
|
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
110
111
|
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
111
112
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
112
113
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
113
|
-
magic_pdf/
|
114
|
-
magic_pdf/
|
115
|
-
magic_pdf/
|
116
|
-
magic_pdf/
|
117
|
-
magic_pdf/
|
118
|
-
magic_pdf/
|
119
|
-
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
|
-
magic_pdf/pipe/operators.py,sha256=5z7kF95IWyBGxs4tIhqJml2YMlfDkU9B5xy__NiUxz0,4962
|
114
|
+
magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
|
115
|
+
magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
|
116
|
+
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
117
|
+
magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
118
|
+
magic_pdf/post_proc/llm_aided.py,sha256=6eKZAfc0Vk_wX7NyYDle71rf1WWa2-7ZKXQ_Vm7Pem8,4722
|
119
|
+
magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
121
120
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
122
121
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
123
122
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
124
123
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
125
124
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
126
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=
|
127
|
-
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=
|
125
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=MVRO_GLsOtmsnj77veH3_QToU9A3gjq7qC6zt73Af1s,3101
|
126
|
+
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
|
128
127
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
129
128
|
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
130
129
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
131
130
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
132
|
-
magic_pdf/
|
133
|
-
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
134
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
|
135
|
-
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
131
|
+
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt,sha256=dV4lcudF8wimEbAooYbvISvFhrXjp9i0rMRqv9VW6hY,3204667
|
136
132
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
133
|
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
138
134
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
139
|
-
magic_pdf/tools/cli.py,sha256=
|
135
|
+
magic_pdf/tools/cli.py,sha256=YiX9LU4UeU3yYIpblGO1cbO95Tbo3A8cmWFK_1WvqfU,4134
|
140
136
|
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
141
|
-
magic_pdf/tools/common.py,sha256=
|
137
|
+
magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,8381
|
142
138
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
143
139
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
144
|
-
magic_pdf
|
145
|
-
magic_pdf-0.
|
146
|
-
magic_pdf-0.
|
147
|
-
magic_pdf-0.
|
148
|
-
magic_pdf-0.
|
149
|
-
magic_pdf-0.
|
140
|
+
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
141
|
+
magic_pdf-1.0.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
142
|
+
magic_pdf-1.0.1.dist-info/METADATA,sha256=DC6l1zz6GIZxALaEdhdjyqzxDZ8784tTZYut_l2FFoQ,40499
|
143
|
+
magic_pdf-1.0.1.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
144
|
+
magic_pdf-1.0.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
145
|
+
magic_pdf-1.0.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
146
|
+
magic_pdf-1.0.1.dist-info/RECORD,,
|
magic_pdf/para/__init__.py
DELETED
File without changes
|
magic_pdf/pdf_parse_by_ocr.py
DELETED
@@ -1,22 +0,0 @@
|
|
1
|
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
2
|
-
from magic_pdf.data.dataset import Dataset
|
3
|
-
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
4
|
-
|
5
|
-
|
6
|
-
def parse_pdf_by_ocr(dataset: Dataset,
|
7
|
-
model_list,
|
8
|
-
imageWriter,
|
9
|
-
start_page_id=0,
|
10
|
-
end_page_id=None,
|
11
|
-
debug_mode=False,
|
12
|
-
lang=None,
|
13
|
-
):
|
14
|
-
return pdf_parse_union(model_list,
|
15
|
-
dataset,
|
16
|
-
imageWriter,
|
17
|
-
SupportedPdfParseMethod.OCR,
|
18
|
-
start_page_id=start_page_id,
|
19
|
-
end_page_id=end_page_id,
|
20
|
-
debug_mode=debug_mode,
|
21
|
-
lang=lang,
|
22
|
-
)
|
magic_pdf/pdf_parse_by_txt.py
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
2
|
-
from magic_pdf.data.dataset import Dataset
|
3
|
-
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
4
|
-
|
5
|
-
|
6
|
-
def parse_pdf_by_txt(
|
7
|
-
dataset: Dataset,
|
8
|
-
model_list,
|
9
|
-
imageWriter,
|
10
|
-
start_page_id=0,
|
11
|
-
end_page_id=None,
|
12
|
-
debug_mode=False,
|
13
|
-
lang=None,
|
14
|
-
):
|
15
|
-
return pdf_parse_union(model_list,
|
16
|
-
dataset,
|
17
|
-
imageWriter,
|
18
|
-
SupportedPdfParseMethod.TXT,
|
19
|
-
start_page_id=start_page_id,
|
20
|
-
end_page_id=end_page_id,
|
21
|
-
debug_mode=debug_mode,
|
22
|
-
lang=lang,
|
23
|
-
)
|
magic_pdf/pipe/AbsPipe.py
DELETED
@@ -1,99 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
|
3
|
-
from magic_pdf.config.drop_reason import DropReason
|
4
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
5
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
-
from magic_pdf.data.dataset import Dataset
|
7
|
-
from magic_pdf.dict2md.ocr_mkcontent import union_make
|
8
|
-
from magic_pdf.filter.pdf_classify_by_type import classify
|
9
|
-
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
10
|
-
from magic_pdf.libs.json_compressor import JsonCompressor
|
11
|
-
|
12
|
-
|
13
|
-
class AbsPipe(ABC):
|
14
|
-
"""txt和ocr处理的抽象类."""
|
15
|
-
PIP_OCR = 'ocr'
|
16
|
-
PIP_TXT = 'txt'
|
17
|
-
|
18
|
-
def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
19
|
-
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
|
20
|
-
self.dataset = Dataset
|
21
|
-
self.model_list = model_list
|
22
|
-
self.image_writer = image_writer
|
23
|
-
self.pdf_mid_data = None # 未压缩
|
24
|
-
self.is_debug = is_debug
|
25
|
-
self.start_page_id = start_page_id
|
26
|
-
self.end_page_id = end_page_id
|
27
|
-
self.lang = lang
|
28
|
-
self.layout_model = layout_model
|
29
|
-
self.formula_enable = formula_enable
|
30
|
-
self.table_enable = table_enable
|
31
|
-
|
32
|
-
def get_compress_pdf_mid_data(self):
|
33
|
-
return JsonCompressor.compress_json(self.pdf_mid_data)
|
34
|
-
|
35
|
-
@abstractmethod
|
36
|
-
def pipe_classify(self):
|
37
|
-
"""有状态的分类."""
|
38
|
-
raise NotImplementedError
|
39
|
-
|
40
|
-
@abstractmethod
|
41
|
-
def pipe_analyze(self):
|
42
|
-
"""有状态的跑模型分析."""
|
43
|
-
raise NotImplementedError
|
44
|
-
|
45
|
-
@abstractmethod
|
46
|
-
def pipe_parse(self):
|
47
|
-
"""有状态的解析."""
|
48
|
-
raise NotImplementedError
|
49
|
-
|
50
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
51
|
-
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
|
52
|
-
return content_list
|
53
|
-
|
54
|
-
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
55
|
-
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
|
56
|
-
return md_content
|
57
|
-
|
58
|
-
@staticmethod
|
59
|
-
def classify(pdf_bytes: bytes) -> str:
|
60
|
-
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
|
61
|
-
pdf_meta = pdf_meta_scan(pdf_bytes)
|
62
|
-
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
|
63
|
-
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
|
64
|
-
else:
|
65
|
-
is_encrypted = pdf_meta['is_encrypted']
|
66
|
-
is_needs_password = pdf_meta['is_needs_password']
|
67
|
-
if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
|
68
|
-
raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
|
69
|
-
else:
|
70
|
-
is_text_pdf, results = classify(
|
71
|
-
pdf_meta['total_page'],
|
72
|
-
pdf_meta['page_width_pts'],
|
73
|
-
pdf_meta['page_height_pts'],
|
74
|
-
pdf_meta['image_info_per_page'],
|
75
|
-
pdf_meta['text_len_per_page'],
|
76
|
-
pdf_meta['imgs_per_page'],
|
77
|
-
pdf_meta['text_layout_per_page'],
|
78
|
-
pdf_meta['invalid_chars'],
|
79
|
-
)
|
80
|
-
if is_text_pdf:
|
81
|
-
return AbsPipe.PIP_TXT
|
82
|
-
else:
|
83
|
-
return AbsPipe.PIP_OCR
|
84
|
-
|
85
|
-
@staticmethod
|
86
|
-
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
|
87
|
-
"""根据pdf类型,生成统一格式content_list."""
|
88
|
-
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
89
|
-
pdf_info_list = pdf_mid_data['pdf_info']
|
90
|
-
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
|
91
|
-
return content_list
|
92
|
-
|
93
|
-
@staticmethod
|
94
|
-
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
|
95
|
-
"""根据pdf类型,markdown."""
|
96
|
-
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
97
|
-
pdf_info_list = pdf_mid_data['pdf_info']
|
98
|
-
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
|
99
|
-
return md_content
|
magic_pdf/pipe/OCRPipe.py
DELETED
@@ -1,80 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
5
|
-
from magic_pdf.data.dataset import Dataset
|
6
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
7
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
8
|
-
from magic_pdf.user_api import parse_ocr_pdf
|
9
|
-
|
10
|
-
|
11
|
-
class OCRPipe(AbsPipe):
|
12
|
-
def __init__(
|
13
|
-
self,
|
14
|
-
dataset: Dataset,
|
15
|
-
model_list: list,
|
16
|
-
image_writer: DataWriter,
|
17
|
-
is_debug: bool = False,
|
18
|
-
start_page_id=0,
|
19
|
-
end_page_id=None,
|
20
|
-
lang=None,
|
21
|
-
layout_model=None,
|
22
|
-
formula_enable=None,
|
23
|
-
table_enable=None,
|
24
|
-
):
|
25
|
-
super().__init__(
|
26
|
-
dataset,
|
27
|
-
model_list,
|
28
|
-
image_writer,
|
29
|
-
is_debug,
|
30
|
-
start_page_id,
|
31
|
-
end_page_id,
|
32
|
-
lang,
|
33
|
-
layout_model,
|
34
|
-
formula_enable,
|
35
|
-
table_enable,
|
36
|
-
)
|
37
|
-
|
38
|
-
def pipe_classify(self):
|
39
|
-
pass
|
40
|
-
|
41
|
-
def pipe_analyze(self):
|
42
|
-
self.infer_res = doc_analyze(
|
43
|
-
self.dataset,
|
44
|
-
ocr=True,
|
45
|
-
start_page_id=self.start_page_id,
|
46
|
-
end_page_id=self.end_page_id,
|
47
|
-
lang=self.lang,
|
48
|
-
layout_model=self.layout_model,
|
49
|
-
formula_enable=self.formula_enable,
|
50
|
-
table_enable=self.table_enable,
|
51
|
-
)
|
52
|
-
|
53
|
-
def pipe_parse(self):
|
54
|
-
self.pdf_mid_data = parse_ocr_pdf(
|
55
|
-
self.dataset,
|
56
|
-
self.infer_res,
|
57
|
-
self.image_writer,
|
58
|
-
is_debug=self.is_debug,
|
59
|
-
start_page_id=self.start_page_id,
|
60
|
-
end_page_id=self.end_page_id,
|
61
|
-
lang=self.lang,
|
62
|
-
layout_model=self.layout_model,
|
63
|
-
formula_enable=self.formula_enable,
|
64
|
-
table_enable=self.table_enable,
|
65
|
-
)
|
66
|
-
|
67
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
68
|
-
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
69
|
-
logger.info('ocr_pipe mk content list finished')
|
70
|
-
return result
|
71
|
-
|
72
|
-
def pipe_mk_markdown(
|
73
|
-
self,
|
74
|
-
img_parent_path: str,
|
75
|
-
drop_mode=DropMode.WHOLE_PDF,
|
76
|
-
md_make_mode=MakeMode.MM_MD,
|
77
|
-
):
|
78
|
-
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
79
|
-
logger.info(f'ocr_pipe mk {md_make_mode} finished')
|
80
|
-
return result
|
magic_pdf/pipe/TXTPipe.py
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
5
|
-
from magic_pdf.data.dataset import Dataset
|
6
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
7
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
8
|
-
from magic_pdf.user_api import parse_txt_pdf
|
9
|
-
|
10
|
-
|
11
|
-
class TXTPipe(AbsPipe):
|
12
|
-
|
13
|
-
def __init__(self, dataset: Dataset, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
14
|
-
start_page_id=0, end_page_id=None, lang=None,
|
15
|
-
layout_model=None, formula_enable=None, table_enable=None):
|
16
|
-
super().__init__(dataset, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
17
|
-
layout_model, formula_enable, table_enable)
|
18
|
-
|
19
|
-
def pipe_classify(self):
|
20
|
-
pass
|
21
|
-
|
22
|
-
def pipe_analyze(self):
|
23
|
-
self.model_list = doc_analyze(self.dataset, ocr=False,
|
24
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
25
|
-
lang=self.lang, layout_model=self.layout_model,
|
26
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
27
|
-
|
28
|
-
def pipe_parse(self):
|
29
|
-
self.pdf_mid_data = parse_txt_pdf(self.dataset, self.model_list, self.image_writer, is_debug=self.is_debug,
|
30
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
31
|
-
lang=self.lang, layout_model=self.layout_model,
|
32
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
33
|
-
|
34
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
35
|
-
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
36
|
-
logger.info('txt_pipe mk content list finished')
|
37
|
-
return result
|
38
|
-
|
39
|
-
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
40
|
-
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
41
|
-
logger.info(f'txt_pipe mk {md_make_mode} finished')
|
42
|
-
return result
|
magic_pdf/pipe/UNIPipe.py
DELETED
@@ -1,150 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
7
|
-
from magic_pdf.data.dataset import Dataset
|
8
|
-
from magic_pdf.libs.commons import join_path
|
9
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
10
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
11
|
-
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
|
12
|
-
|
13
|
-
|
14
|
-
class UNIPipe(AbsPipe):
|
15
|
-
|
16
|
-
def __init__(
|
17
|
-
self,
|
18
|
-
dataset: Dataset,
|
19
|
-
jso_useful_key: dict,
|
20
|
-
image_writer: DataWriter,
|
21
|
-
is_debug: bool = False,
|
22
|
-
start_page_id=0,
|
23
|
-
end_page_id=None,
|
24
|
-
lang=None,
|
25
|
-
layout_model=None,
|
26
|
-
formula_enable=None,
|
27
|
-
table_enable=None,
|
28
|
-
):
|
29
|
-
self.pdf_type = jso_useful_key['_pdf_type']
|
30
|
-
super().__init__(
|
31
|
-
dataset,
|
32
|
-
jso_useful_key['model_list'],
|
33
|
-
image_writer,
|
34
|
-
is_debug,
|
35
|
-
start_page_id,
|
36
|
-
end_page_id,
|
37
|
-
lang,
|
38
|
-
layout_model,
|
39
|
-
formula_enable,
|
40
|
-
table_enable,
|
41
|
-
)
|
42
|
-
if len(self.model_list) == 0:
|
43
|
-
self.input_model_is_empty = True
|
44
|
-
else:
|
45
|
-
self.input_model_is_empty = False
|
46
|
-
|
47
|
-
def pipe_classify(self):
|
48
|
-
self.pdf_type = AbsPipe.classify(self.pdf_bytes)
|
49
|
-
|
50
|
-
def pipe_analyze(self):
|
51
|
-
if self.pdf_type == self.PIP_TXT:
|
52
|
-
self.model_list = doc_analyze(
|
53
|
-
self.dataset,
|
54
|
-
ocr=False,
|
55
|
-
start_page_id=self.start_page_id,
|
56
|
-
end_page_id=self.end_page_id,
|
57
|
-
lang=self.lang,
|
58
|
-
layout_model=self.layout_model,
|
59
|
-
formula_enable=self.formula_enable,
|
60
|
-
table_enable=self.table_enable,
|
61
|
-
)
|
62
|
-
elif self.pdf_type == self.PIP_OCR:
|
63
|
-
self.model_list = doc_analyze(
|
64
|
-
self.dataset,
|
65
|
-
ocr=True,
|
66
|
-
start_page_id=self.start_page_id,
|
67
|
-
end_page_id=self.end_page_id,
|
68
|
-
lang=self.lang,
|
69
|
-
layout_model=self.layout_model,
|
70
|
-
formula_enable=self.formula_enable,
|
71
|
-
table_enable=self.table_enable,
|
72
|
-
)
|
73
|
-
|
74
|
-
def pipe_parse(self):
|
75
|
-
if self.pdf_type == self.PIP_TXT:
|
76
|
-
self.pdf_mid_data = parse_union_pdf(
|
77
|
-
self.dataset,
|
78
|
-
self.model_list,
|
79
|
-
self.image_writer,
|
80
|
-
is_debug=self.is_debug,
|
81
|
-
start_page_id=self.start_page_id,
|
82
|
-
end_page_id=self.end_page_id,
|
83
|
-
lang=self.lang,
|
84
|
-
layout_model=self.layout_model,
|
85
|
-
formula_enable=self.formula_enable,
|
86
|
-
table_enable=self.table_enable,
|
87
|
-
)
|
88
|
-
elif self.pdf_type == self.PIP_OCR:
|
89
|
-
self.pdf_mid_data = parse_ocr_pdf(
|
90
|
-
self.dataset,
|
91
|
-
self.model_list,
|
92
|
-
self.image_writer,
|
93
|
-
is_debug=self.is_debug,
|
94
|
-
start_page_id=self.start_page_id,
|
95
|
-
end_page_id=self.end_page_id,
|
96
|
-
lang=self.lang,
|
97
|
-
)
|
98
|
-
|
99
|
-
def pipe_mk_uni_format(
|
100
|
-
self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON
|
101
|
-
):
|
102
|
-
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
103
|
-
logger.info('uni_pipe mk content list finished')
|
104
|
-
return result
|
105
|
-
|
106
|
-
def pipe_mk_markdown(
|
107
|
-
self,
|
108
|
-
img_parent_path: str,
|
109
|
-
drop_mode=DropMode.WHOLE_PDF,
|
110
|
-
md_make_mode=MakeMode.MM_MD,
|
111
|
-
):
|
112
|
-
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
113
|
-
logger.info(f'uni_pipe mk {md_make_mode} finished')
|
114
|
-
return result
|
115
|
-
|
116
|
-
|
117
|
-
if __name__ == '__main__':
|
118
|
-
# 测试
|
119
|
-
from magic_pdf.data.data_reader_writer import DataReader
|
120
|
-
|
121
|
-
drw = DataReader(r'D:/project/20231108code-clean')
|
122
|
-
|
123
|
-
pdf_file_path = r'linshixuqiu\19983-00.pdf'
|
124
|
-
model_file_path = r'linshixuqiu\19983-00.json'
|
125
|
-
pdf_bytes = drw.read(pdf_file_path)
|
126
|
-
model_json_txt = drw.read(model_file_path).decode()
|
127
|
-
model_list = json.loads(model_json_txt)
|
128
|
-
write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
|
129
|
-
img_bucket_path = 'imgs'
|
130
|
-
img_writer = DataWriter(join_path(write_path, img_bucket_path))
|
131
|
-
|
132
|
-
# pdf_type = UNIPipe.classify(pdf_bytes)
|
133
|
-
# jso_useful_key = {
|
134
|
-
# "_pdf_type": pdf_type,
|
135
|
-
# "model_list": model_list
|
136
|
-
# }
|
137
|
-
|
138
|
-
jso_useful_key = {'_pdf_type': '', 'model_list': model_list}
|
139
|
-
pipe = UNIPipe(pdf_bytes, jso_useful_key, img_writer)
|
140
|
-
pipe.pipe_classify()
|
141
|
-
pipe.pipe_parse()
|
142
|
-
md_content = pipe.pipe_mk_markdown(img_bucket_path)
|
143
|
-
content_list = pipe.pipe_mk_uni_format(img_bucket_path)
|
144
|
-
|
145
|
-
md_writer = DataWriter(write_path)
|
146
|
-
md_writer.write_string('19983-00.md', md_content)
|
147
|
-
md_writer.write_string(
|
148
|
-
'19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4)
|
149
|
-
)
|
150
|
-
md_writer.write_string('19983-00.txt', str(content_list))
|
magic_pdf/pipe/__init__.py
DELETED
File without changes
|
magic_pdf/rw/AbsReaderWriter.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
|
3
|
-
|
4
|
-
class AbsReaderWriter(ABC):
|
5
|
-
MODE_TXT = "text"
|
6
|
-
MODE_BIN = "binary"
|
7
|
-
@abstractmethod
|
8
|
-
def read(self, path: str, mode=MODE_TXT):
|
9
|
-
raise NotImplementedError
|
10
|
-
|
11
|
-
@abstractmethod
|
12
|
-
def write(self, content: str, path: str, mode=MODE_TXT):
|
13
|
-
raise NotImplementedError
|
14
|
-
|
15
|
-
@abstractmethod
|
16
|
-
def read_offset(self, path: str, offset=0, limit=None) -> bytes:
|
17
|
-
raise NotImplementedError
|