magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +7 -0
- magic_pdf/config/exceptions.py +7 -0
- magic_pdf/data/data_reader_writer/base.py +13 -1
- magic_pdf/data/data_reader_writer/filebase.py +1 -1
- magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
- magic_pdf/data/dataset.py +188 -5
- magic_pdf/data/read_api.py +59 -12
- magic_pdf/data/utils.py +35 -0
- magic_pdf/dict2md/ocr_mkcontent.py +16 -15
- magic_pdf/filter/__init__.py +32 -0
- magic_pdf/filter/pdf_meta_scan.py +3 -2
- magic_pdf/libs/clean_memory.py +11 -4
- magic_pdf/libs/config_reader.py +9 -0
- magic_pdf/libs/draw_bbox.py +19 -22
- magic_pdf/libs/language.py +3 -0
- magic_pdf/libs/pdf_check.py +30 -30
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -1
- magic_pdf/model/batch_analyze.py +275 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
- magic_pdf/model/magic_model.py +4 -435
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +35 -5
- magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
- magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
- magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
- magic_pdf/model/sub_modules/model_init.py +43 -7
- magic_pdf/model/sub_modules/model_utils.py +17 -5
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
- magic_pdf/operators/__init__.py +94 -0
- magic_pdf/operators/models.py +154 -0
- magic_pdf/operators/pipes.py +191 -0
- magic_pdf/pdf_parse_union_core_v2.py +77 -27
- magic_pdf/post_proc/__init__.py +1 -0
- magic_pdf/post_proc/llm_aided.py +133 -0
- magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
- magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
- magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
- magic_pdf/tools/cli.py +36 -11
- magic_pdf/tools/common.py +120 -61
- magic_pdf/utils/office_to_pdf.py +29 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
- magic_pdf/para/__init__.py +0 -0
- magic_pdf/pdf_parse_by_ocr.py +0 -23
- magic_pdf/pdf_parse_by_txt.py +0 -24
- magic_pdf/pipe/AbsPipe.py +0 -98
- magic_pdf/pipe/OCRPipe.py +0 -41
- magic_pdf/pipe/TXTPipe.py +0 -41
- magic_pdf/pipe/UNIPipe.py +0 -98
- magic_pdf/pipe/__init__.py +0 -0
- magic_pdf/rw/AbsReaderWriter.py +0 -17
- magic_pdf/rw/DiskReaderWriter.py +0 -74
- magic_pdf/rw/S3ReaderWriter.py +0 -142
- magic_pdf/rw/__init__.py +0 -0
- magic_pdf/user_api.py +0 -121
- /magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,36 +1,33 @@
|
|
1
1
|
magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
2
|
-
magic_pdf/
|
3
|
-
magic_pdf/pdf_parse_by_txt.py,sha256=dh3ZM6BVrFzwbH4137BPUdKhgacGlpS2N4mn74_-UaA,762
|
4
|
-
magic_pdf/pdf_parse_union_core_v2.py,sha256=6Apku7-pW450HbHNTtbVLDyroRSKlQ57w9f0ScOaZv4,30879
|
5
|
-
magic_pdf/user_api.py,sha256=Sh6U7iD5VsH7Qkav_0o5GTx-Rlj7vhmhHQHZSBKR5T8,4006
|
2
|
+
magic_pdf/pdf_parse_union_core_v2.py,sha256=uZiTwyVT5iE4YRHzUfN5kifHtYuEuKIqQHnAZQofWuM,33292
|
6
3
|
magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
magic_pdf/config/constants.py,sha256=
|
4
|
+
magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
|
8
5
|
magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
|
9
6
|
magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
|
10
7
|
magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
|
11
|
-
magic_pdf/config/exceptions.py,sha256=
|
8
|
+
magic_pdf/config/exceptions.py,sha256=2tsJxYUebVeimyYBGQkc9Nd1kIakTmWmz3SDcfJWy54,784
|
12
9
|
magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
|
13
10
|
magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
|
14
11
|
magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
|
15
12
|
magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
16
|
-
magic_pdf/data/dataset.py,sha256=
|
17
|
-
magic_pdf/data/read_api.py,sha256=
|
13
|
+
magic_pdf/data/dataset.py,sha256=q7wfX99HTVjKCFVpf1mnYn55rK6oF5Dz8O9w4C9cYhw,11196
|
14
|
+
magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
|
18
15
|
magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
|
19
|
-
magic_pdf/data/utils.py,sha256=
|
16
|
+
magic_pdf/data/utils.py,sha256=aMeQB3soGUJyoI41hfgWeOZNzPj36SOrewUM7z51AOU,2305
|
20
17
|
magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
|
21
|
-
magic_pdf/data/data_reader_writer/base.py,sha256=
|
22
|
-
magic_pdf/data/data_reader_writer/filebase.py,sha256=
|
23
|
-
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=
|
18
|
+
magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
|
19
|
+
magic_pdf/data/data_reader_writer/filebase.py,sha256=VbNAxLyo0Io0j7iprJERt_TqxzHAtA7cUyPIaJstToU,2146
|
20
|
+
magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=4pEJ8PPd3nX7sccHobCs0mbDM8BiqDP_sAEz7CIvpNI,5938
|
24
21
|
magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
|
25
22
|
magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
|
26
23
|
magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
|
27
24
|
magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
|
28
25
|
magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
|
29
26
|
magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
|
-
magic_pdf/dict2md/ocr_mkcontent.py,sha256=
|
31
|
-
magic_pdf/filter/__init__.py,sha256=
|
27
|
+
magic_pdf/dict2md/ocr_mkcontent.py,sha256=RQ47F2CT0Zgmg1rZoqYj5IW5msqoCTEF6GEHi3mVd8U,12989
|
28
|
+
magic_pdf/filter/__init__.py,sha256=rV4dvUxfKyVErDx9ZbUp8DVq_fRIlv0lfSXp1ND4STc,1503
|
32
29
|
magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
|
33
|
-
magic_pdf/filter/pdf_meta_scan.py,sha256=
|
30
|
+
magic_pdf/filter/pdf_meta_scan.py,sha256=rqTuStW2_ICr3HmV_9IQ5jnsl4JnSh7-bL11vbtH3i0,17470
|
34
31
|
magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
35
32
|
magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
33
|
magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
|
@@ -38,33 +35,38 @@ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCal
|
|
38
35
|
magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
|
39
36
|
magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
37
|
magic_pdf/libs/boxbase.py,sha256=ELMHWolgWROxOAQDgwmL7VS5kveZp4ifvEzRmPul2Ws,16925
|
41
|
-
magic_pdf/libs/clean_memory.py,sha256=
|
38
|
+
magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
|
42
39
|
magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
|
43
|
-
magic_pdf/libs/config_reader.py,sha256=
|
40
|
+
magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
|
44
41
|
magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
|
45
42
|
magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
|
46
|
-
magic_pdf/libs/draw_bbox.py,sha256=
|
43
|
+
magic_pdf/libs/draw_bbox.py,sha256=RX_ELX6P8mF0sIBx_h2A3BzhevcSPIzbbrboZTBlBik,17653
|
47
44
|
magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
|
48
45
|
magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
|
49
|
-
magic_pdf/libs/language.py,sha256=
|
46
|
+
magic_pdf/libs/language.py,sha256=w1GVYmlocF7DQjtZrspgH6WacoWazOQBzz-iQx0mSBk,1135
|
50
47
|
magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
|
51
48
|
magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
|
52
49
|
magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
|
53
|
-
magic_pdf/libs/pdf_check.py,sha256=
|
50
|
+
magic_pdf/libs/pdf_check.py,sha256=zBwUThKKBtnrNPmgE10lYsTy1Kq7j_6IejO7JR0J4pA,3118
|
54
51
|
magic_pdf/libs/pdf_image_tools.py,sha256=kjzSEbm7K0yiHv8kJ4VbZ9HHktM8qvAv3LhxRyDZEQk,1987
|
55
52
|
magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
|
56
|
-
magic_pdf/libs/version.py,sha256=
|
57
|
-
magic_pdf/model/__init__.py,sha256=
|
58
|
-
magic_pdf/model/
|
59
|
-
magic_pdf/model/
|
60
|
-
magic_pdf/model/
|
61
|
-
magic_pdf/model/
|
53
|
+
magic_pdf/libs/version.py,sha256=J-j-u0itpEFT6irdmWmixQqYMadNl1X91TxUmoiLHMI,22
|
54
|
+
magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
|
55
|
+
magic_pdf/model/batch_analyze.py,sha256=A49qD5zY9G8nl6wnpMLATqS4_xOOgRvjo1Eq6v_mcUE,11551
|
56
|
+
magic_pdf/model/doc_analyze_by_custom_model.py,sha256=AZSzqzGz5utLwuysF5JY5k5pYcJGtcppNP2BbvExLnA,5989
|
57
|
+
magic_pdf/model/magic_model.py,sha256=Nt74oZGYUcbm4qdOQtN-hbKhXxlWO2LVv3K9yXvteWY,25204
|
58
|
+
magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
|
59
|
+
magic_pdf/model/pdf_extract_kit.py,sha256=Z7zzU_lkVR0vgycpeqVe1pwLc4svYThIUSEdTJVLVNM,12287
|
62
60
|
magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
|
63
61
|
magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
64
|
-
magic_pdf/model/sub_modules/model_init.py,sha256
|
65
|
-
magic_pdf/model/sub_modules/model_utils.py,sha256=
|
62
|
+
magic_pdf/model/sub_modules/model_init.py,sha256=-sVlsOhSjLakOOmw675iDdSQSBW6Py5U0K0XiM3UpvU,6423
|
63
|
+
magic_pdf/model/sub_modules/model_utils.py,sha256=2pI1Xcr2zCF3b64e4WoFtIbjSmTVYBE4zjyHB23gvmE,2488
|
64
|
+
magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
65
|
+
magic_pdf/model/sub_modules/language_detection/utils.py,sha256=5nec_loLyYCJ5o6n38AYLz2SKmRvHDCBdt6ka84EaGM,3096
|
66
|
+
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=GW_9WkqIzpJm1MFJexZ2ZvA6AjoqM-6yh8p4LupJhas,4762
|
67
|
+
magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
66
68
|
magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
67
|
-
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=
|
69
|
+
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=5DXZhbkLrycF3FGF8OMuuHGJtTHLSkTGetxxi5KWDgw,2189
|
68
70
|
magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
69
71
|
magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
70
72
|
magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
|
@@ -87,15 +89,15 @@ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/model
|
|
87
89
|
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
|
88
90
|
magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
|
89
91
|
magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
90
|
-
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=
|
92
|
+
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=QfHbMr1br0pOJUu1NJEMgA6yw11G0yFImJv_AfW48_c,1008
|
91
93
|
magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
92
94
|
magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
93
|
-
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=
|
95
|
+
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=Jgi38JDo6D6sVVnBJ1XZ-iAT9qjj5jW__NL-8GKJb78,5290
|
94
96
|
magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
95
97
|
magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
96
98
|
magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
97
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=
|
98
|
-
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=
|
99
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py,sha256=nT47hOH0rG9_dm4FMU_UNWvoX1IRW0t7TPKQw5XfMQ4,12324
|
100
|
+
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py,sha256=QBBeFN1iF7nj5gqQ5sQXjhpwy8lB4c96gubnRDBuDNU,8424
|
99
101
|
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py,sha256=VouMTvi6M5TV6pQdlpusgfyZapxiZ_Wi7Ff53eMC3rE,8996
|
100
102
|
magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
101
103
|
magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -104,44 +106,41 @@ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4
|
|
104
106
|
magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
105
107
|
magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
|
106
108
|
magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
107
|
-
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=
|
109
|
+
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=YsfgKEM0ETBBYsYmPdR9DAJIGeguK-oy9Pn25vS07CE,1953
|
108
110
|
magic_pdf/model/sub_modules/table/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
109
111
|
magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py,sha256=SrNPm-uOFEvN5muFGbXTAuwzXm-rCiaihVdqbydIBIA,1131
|
110
112
|
magic_pdf/model/sub_modules/table/tablemaster/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
111
113
|
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py,sha256=QEQ-56AzoIAU7UWsEidWW_KDOY5r16qm2kSpox8cxq4,2755
|
112
|
-
magic_pdf/
|
113
|
-
magic_pdf/
|
114
|
-
magic_pdf/
|
115
|
-
magic_pdf/
|
116
|
-
magic_pdf/
|
117
|
-
magic_pdf/
|
118
|
-
magic_pdf/pipe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
114
|
+
magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
|
115
|
+
magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
|
116
|
+
magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
|
117
|
+
magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
|
118
|
+
magic_pdf/post_proc/llm_aided.py,sha256=6eKZAfc0Vk_wX7NyYDle71rf1WWa2-7ZKXQ_Vm7Pem8,4722
|
119
|
+
magic_pdf/post_proc/para_split_v3.py,sha256=v4SdQn4OZdHRXpWQMfQ-FGJz_tglQ88uFUqpwY542Fo,16922
|
119
120
|
magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
120
121
|
magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
|
121
122
|
magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
|
122
123
|
magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
|
123
124
|
magic_pdf/pre_proc/ocr_dict_merge.py,sha256=Ycgz2whzotL7kwl0-mHNV48QOQ2j4tRXqLSQrJRojYg,4847
|
124
|
-
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=
|
125
|
-
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=
|
125
|
+
magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=MVRO_GLsOtmsnj77veH3_QToU9A3gjq7qC6zt73Af1s,3101
|
126
|
+
magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
|
126
127
|
magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
|
127
128
|
magic_pdf/resources/model_config/model_configs.yaml,sha256=S2BnVQxPd0xsZswn9WqJKTfnqd7ayY5lRwDVifTEAfw,290
|
128
129
|
magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=Jdaim2D2lAYrV9rhc1X5Sy2_IacGOrfysJhxEUgSElo,827
|
129
130
|
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
|
130
|
-
magic_pdf/
|
131
|
-
magic_pdf/rw/DiskReaderWriter.py,sha256=7ZAekH8V6xlBo_1WeSZ6sNwAj2WGPtjNl50zq1CoMDY,2614
|
132
|
-
magic_pdf/rw/S3ReaderWriter.py,sha256=LmbtA-pZlC745nnSUs67C1iqSrBDS7IzE6QC8YMB644,5293
|
133
|
-
magic_pdf/rw/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
131
|
+
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt,sha256=dV4lcudF8wimEbAooYbvISvFhrXjp9i0rMRqv9VW6hY,3204667
|
134
132
|
magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
135
133
|
magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
|
136
134
|
magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
137
|
-
magic_pdf/tools/cli.py,sha256=
|
135
|
+
magic_pdf/tools/cli.py,sha256=YiX9LU4UeU3yYIpblGO1cbO95Tbo3A8cmWFK_1WvqfU,4134
|
138
136
|
magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
|
139
|
-
magic_pdf/tools/common.py,sha256=
|
137
|
+
magic_pdf/tools/common.py,sha256=1LfMeXBBsb3WlGeNAze_pPOYXQ8Qbfh-JgRXweojHKo,8381
|
140
138
|
magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
141
139
|
magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
|
142
|
-
magic_pdf
|
143
|
-
magic_pdf-0.
|
144
|
-
magic_pdf-0.
|
145
|
-
magic_pdf-0.
|
146
|
-
magic_pdf-0.
|
147
|
-
magic_pdf-0.
|
140
|
+
magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
|
141
|
+
magic_pdf-1.0.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
|
142
|
+
magic_pdf-1.0.0.dist-info/METADATA,sha256=itfWxjtkwtZt0xdUfRuDu0v6aXQUB8yEhA5tCimZkgc,40499
|
143
|
+
magic_pdf-1.0.0.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
|
144
|
+
magic_pdf-1.0.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
|
145
|
+
magic_pdf-1.0.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
|
146
|
+
magic_pdf-1.0.0.dist-info/RECORD,,
|
magic_pdf/para/__init__.py
DELETED
File without changes
|
magic_pdf/pdf_parse_by_ocr.py
DELETED
@@ -1,23 +0,0 @@
|
|
1
|
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
2
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
3
|
-
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
4
|
-
|
5
|
-
|
6
|
-
def parse_pdf_by_ocr(pdf_bytes,
|
7
|
-
model_list,
|
8
|
-
imageWriter,
|
9
|
-
start_page_id=0,
|
10
|
-
end_page_id=None,
|
11
|
-
debug_mode=False,
|
12
|
-
lang=None,
|
13
|
-
):
|
14
|
-
dataset = PymuDocDataset(pdf_bytes)
|
15
|
-
return pdf_parse_union(dataset,
|
16
|
-
model_list,
|
17
|
-
imageWriter,
|
18
|
-
SupportedPdfParseMethod.OCR,
|
19
|
-
start_page_id=start_page_id,
|
20
|
-
end_page_id=end_page_id,
|
21
|
-
debug_mode=debug_mode,
|
22
|
-
lang=lang,
|
23
|
-
)
|
magic_pdf/pdf_parse_by_txt.py
DELETED
@@ -1,24 +0,0 @@
|
|
1
|
-
from magic_pdf.config.enums import SupportedPdfParseMethod
|
2
|
-
from magic_pdf.data.dataset import PymuDocDataset
|
3
|
-
from magic_pdf.pdf_parse_union_core_v2 import pdf_parse_union
|
4
|
-
|
5
|
-
|
6
|
-
def parse_pdf_by_txt(
|
7
|
-
pdf_bytes,
|
8
|
-
model_list,
|
9
|
-
imageWriter,
|
10
|
-
start_page_id=0,
|
11
|
-
end_page_id=None,
|
12
|
-
debug_mode=False,
|
13
|
-
lang=None,
|
14
|
-
):
|
15
|
-
dataset = PymuDocDataset(pdf_bytes)
|
16
|
-
return pdf_parse_union(dataset,
|
17
|
-
model_list,
|
18
|
-
imageWriter,
|
19
|
-
SupportedPdfParseMethod.TXT,
|
20
|
-
start_page_id=start_page_id,
|
21
|
-
end_page_id=end_page_id,
|
22
|
-
debug_mode=debug_mode,
|
23
|
-
lang=lang,
|
24
|
-
)
|
magic_pdf/pipe/AbsPipe.py
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
|
3
|
-
from magic_pdf.config.drop_reason import DropReason
|
4
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
5
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
6
|
-
from magic_pdf.dict2md.ocr_mkcontent import union_make
|
7
|
-
from magic_pdf.filter.pdf_classify_by_type import classify
|
8
|
-
from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan
|
9
|
-
from magic_pdf.libs.json_compressor import JsonCompressor
|
10
|
-
|
11
|
-
|
12
|
-
class AbsPipe(ABC):
|
13
|
-
"""txt和ocr处理的抽象类."""
|
14
|
-
PIP_OCR = 'ocr'
|
15
|
-
PIP_TXT = 'txt'
|
16
|
-
|
17
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
18
|
-
start_page_id=0, end_page_id=None, lang=None, layout_model=None, formula_enable=None, table_enable=None):
|
19
|
-
self.pdf_bytes = pdf_bytes
|
20
|
-
self.model_list = model_list
|
21
|
-
self.image_writer = image_writer
|
22
|
-
self.pdf_mid_data = None # 未压缩
|
23
|
-
self.is_debug = is_debug
|
24
|
-
self.start_page_id = start_page_id
|
25
|
-
self.end_page_id = end_page_id
|
26
|
-
self.lang = lang
|
27
|
-
self.layout_model = layout_model
|
28
|
-
self.formula_enable = formula_enable
|
29
|
-
self.table_enable = table_enable
|
30
|
-
|
31
|
-
def get_compress_pdf_mid_data(self):
|
32
|
-
return JsonCompressor.compress_json(self.pdf_mid_data)
|
33
|
-
|
34
|
-
@abstractmethod
|
35
|
-
def pipe_classify(self):
|
36
|
-
"""有状态的分类."""
|
37
|
-
raise NotImplementedError
|
38
|
-
|
39
|
-
@abstractmethod
|
40
|
-
def pipe_analyze(self):
|
41
|
-
"""有状态的跑模型分析."""
|
42
|
-
raise NotImplementedError
|
43
|
-
|
44
|
-
@abstractmethod
|
45
|
-
def pipe_parse(self):
|
46
|
-
"""有状态的解析."""
|
47
|
-
raise NotImplementedError
|
48
|
-
|
49
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
50
|
-
content_list = AbsPipe.mk_uni_format(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode)
|
51
|
-
return content_list
|
52
|
-
|
53
|
-
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
54
|
-
md_content = AbsPipe.mk_markdown(self.get_compress_pdf_mid_data(), img_parent_path, drop_mode, md_make_mode)
|
55
|
-
return md_content
|
56
|
-
|
57
|
-
@staticmethod
|
58
|
-
def classify(pdf_bytes: bytes) -> str:
|
59
|
-
"""根据pdf的元数据,判断是文本pdf,还是ocr pdf."""
|
60
|
-
pdf_meta = pdf_meta_scan(pdf_bytes)
|
61
|
-
if pdf_meta.get('_need_drop', False): # 如果返回了需要丢弃的标志,则抛出异常
|
62
|
-
raise Exception(f"pdf meta_scan need_drop,reason is {pdf_meta['_drop_reason']}")
|
63
|
-
else:
|
64
|
-
is_encrypted = pdf_meta['is_encrypted']
|
65
|
-
is_needs_password = pdf_meta['is_needs_password']
|
66
|
-
if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理
|
67
|
-
raise Exception(f'pdf meta_scan need_drop,reason is {DropReason.ENCRYPTED}')
|
68
|
-
else:
|
69
|
-
is_text_pdf, results = classify(
|
70
|
-
pdf_meta['total_page'],
|
71
|
-
pdf_meta['page_width_pts'],
|
72
|
-
pdf_meta['page_height_pts'],
|
73
|
-
pdf_meta['image_info_per_page'],
|
74
|
-
pdf_meta['text_len_per_page'],
|
75
|
-
pdf_meta['imgs_per_page'],
|
76
|
-
pdf_meta['text_layout_per_page'],
|
77
|
-
pdf_meta['invalid_chars'],
|
78
|
-
)
|
79
|
-
if is_text_pdf:
|
80
|
-
return AbsPipe.PIP_TXT
|
81
|
-
else:
|
82
|
-
return AbsPipe.PIP_OCR
|
83
|
-
|
84
|
-
@staticmethod
|
85
|
-
def mk_uni_format(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF) -> list:
|
86
|
-
"""根据pdf类型,生成统一格式content_list."""
|
87
|
-
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
88
|
-
pdf_info_list = pdf_mid_data['pdf_info']
|
89
|
-
content_list = union_make(pdf_info_list, MakeMode.STANDARD_FORMAT, drop_mode, img_buket_path)
|
90
|
-
return content_list
|
91
|
-
|
92
|
-
@staticmethod
|
93
|
-
def mk_markdown(compressed_pdf_mid_data: str, img_buket_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD) -> list:
|
94
|
-
"""根据pdf类型,markdown."""
|
95
|
-
pdf_mid_data = JsonCompressor.decompress_json(compressed_pdf_mid_data)
|
96
|
-
pdf_info_list = pdf_mid_data['pdf_info']
|
97
|
-
md_content = union_make(pdf_info_list, md_make_mode, drop_mode, img_buket_path)
|
98
|
-
return md_content
|
magic_pdf/pipe/OCRPipe.py
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
5
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
6
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
7
|
-
from magic_pdf.user_api import parse_ocr_pdf
|
8
|
-
|
9
|
-
|
10
|
-
class OCRPipe(AbsPipe):
|
11
|
-
|
12
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
13
|
-
start_page_id=0, end_page_id=None, lang=None,
|
14
|
-
layout_model=None, formula_enable=None, table_enable=None):
|
15
|
-
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
16
|
-
layout_model, formula_enable, table_enable)
|
17
|
-
|
18
|
-
def pipe_classify(self):
|
19
|
-
pass
|
20
|
-
|
21
|
-
def pipe_analyze(self):
|
22
|
-
self.model_list = doc_analyze(self.pdf_bytes, ocr=True,
|
23
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
24
|
-
lang=self.lang, layout_model=self.layout_model,
|
25
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
26
|
-
|
27
|
-
def pipe_parse(self):
|
28
|
-
self.pdf_mid_data = parse_ocr_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
|
29
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
30
|
-
lang=self.lang, layout_model=self.layout_model,
|
31
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
32
|
-
|
33
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
34
|
-
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
35
|
-
logger.info('ocr_pipe mk content list finished')
|
36
|
-
return result
|
37
|
-
|
38
|
-
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
39
|
-
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
40
|
-
logger.info(f'ocr_pipe mk {md_make_mode} finished')
|
41
|
-
return result
|
magic_pdf/pipe/TXTPipe.py
DELETED
@@ -1,41 +0,0 @@
|
|
1
|
-
from loguru import logger
|
2
|
-
|
3
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
4
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
5
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
6
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
7
|
-
from magic_pdf.user_api import parse_txt_pdf
|
8
|
-
|
9
|
-
|
10
|
-
class TXTPipe(AbsPipe):
|
11
|
-
|
12
|
-
def __init__(self, pdf_bytes: bytes, model_list: list, image_writer: DataWriter, is_debug: bool = False,
|
13
|
-
start_page_id=0, end_page_id=None, lang=None,
|
14
|
-
layout_model=None, formula_enable=None, table_enable=None):
|
15
|
-
super().__init__(pdf_bytes, model_list, image_writer, is_debug, start_page_id, end_page_id, lang,
|
16
|
-
layout_model, formula_enable, table_enable)
|
17
|
-
|
18
|
-
def pipe_classify(self):
|
19
|
-
pass
|
20
|
-
|
21
|
-
def pipe_analyze(self):
|
22
|
-
self.model_list = doc_analyze(self.pdf_bytes, ocr=False,
|
23
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
24
|
-
lang=self.lang, layout_model=self.layout_model,
|
25
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
26
|
-
|
27
|
-
def pipe_parse(self):
|
28
|
-
self.pdf_mid_data = parse_txt_pdf(self.pdf_bytes, self.model_list, self.image_writer, is_debug=self.is_debug,
|
29
|
-
start_page_id=self.start_page_id, end_page_id=self.end_page_id,
|
30
|
-
lang=self.lang, layout_model=self.layout_model,
|
31
|
-
formula_enable=self.formula_enable, table_enable=self.table_enable)
|
32
|
-
|
33
|
-
def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF):
|
34
|
-
result = super().pipe_mk_uni_format(img_parent_path, drop_mode)
|
35
|
-
logger.info('txt_pipe mk content list finished')
|
36
|
-
return result
|
37
|
-
|
38
|
-
def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
|
39
|
-
result = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
|
40
|
-
logger.info(f'txt_pipe mk {md_make_mode} finished')
|
41
|
-
return result
|
magic_pdf/pipe/UNIPipe.py
DELETED
@@ -1,98 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
from magic_pdf.config.make_content_config import DropMode, MakeMode
|
6
|
-
from magic_pdf.data.data_reader_writer import DataWriter
|
7
|
-
from magic_pdf.libs.commons import join_path
|
8
|
-
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
|
9
|
-
from magic_pdf.pipe.AbsPipe import AbsPipe
|
10
|
-
from magic_pdf.user_api import parse_ocr_pdf, parse_union_pdf
|
11
|
-
|
12
|
-
|
13
|
-
class UNIPipe(AbsPipe):
    """Pipe that picks the text or OCR code path according to ``pdf_type``.

    ``pdf_type`` is compared against ``self.PIP_TXT`` and ``self.PIP_OCR``
    (not defined in this class; presumably inherited from ``AbsPipe``).
    When it matches neither, the analyze and parse steps do nothing.
    """

    def __init__(self, pdf_bytes: bytes, jso_useful_key: dict, image_writer: DataWriter, is_debug: bool = False,
                 start_page_id=0, end_page_id=None, lang=None,
                 layout_model=None, formula_enable=None, table_enable=None):
        """Initialise the pipe from raw PDF bytes and a small metadata dict.

        Args:
            pdf_bytes: Raw bytes of the PDF, forwarded to the base class.
            jso_useful_key: Dict that must contain ``'_pdf_type'`` and
                ``'model_list'``.
            image_writer: Writer forwarded to the base class.
            is_debug: Forwarded to the base class.
            start_page_id, end_page_id, lang, layout_model, formula_enable,
            table_enable: Forwarded positionally to the base class.
        """
        # pdf_type is assigned before the base initialiser runs, as in the
        # original statement order.
        self.pdf_type = jso_useful_key['_pdf_type']
        super().__init__(
            pdf_bytes, jso_useful_key['model_list'], image_writer, is_debug,
            start_page_id, end_page_id,
            lang, layout_model, formula_enable, table_enable,
        )
        # Remember whether the caller supplied any model output; the flag is
        # forwarded to parse_union_pdf in pipe_parse.
        self.input_model_is_empty = len(self.model_list) == 0

    def pipe_classify(self):
        """Overwrite ``pdf_type`` with the result of ``AbsPipe.classify``."""
        self.pdf_type = AbsPipe.classify(self.pdf_bytes)

    def pipe_analyze(self):
        """Run ``doc_analyze`` with OCR off for text PDFs and on for OCR PDFs."""
        if self.pdf_type == self.PIP_TXT:
            use_ocr = False
        elif self.pdf_type == self.PIP_OCR:
            use_ocr = True
        else:
            # Unrecognised type: leave model_list untouched.
            return
        self.model_list = doc_analyze(
            self.pdf_bytes,
            ocr=use_ocr,
            start_page_id=self.start_page_id,
            end_page_id=self.end_page_id,
            lang=self.lang,
            layout_model=self.layout_model,
            formula_enable=self.formula_enable,
            table_enable=self.table_enable,
        )

    def pipe_parse(self):
        """Populate ``pdf_mid_data`` using the parser matching ``pdf_type``."""
        if self.pdf_type == self.PIP_TXT:
            self.pdf_mid_data = parse_union_pdf(
                self.pdf_bytes, self.model_list, self.image_writer,
                is_debug=self.is_debug,
                input_model_is_empty=self.input_model_is_empty,
                start_page_id=self.start_page_id,
                end_page_id=self.end_page_id,
                lang=self.lang,
                layout_model=self.layout_model,
                formula_enable=self.formula_enable,
                table_enable=self.table_enable,
            )
        elif self.pdf_type == self.PIP_OCR:
            # The OCR parser is called without the layout/formula/table options.
            self.pdf_mid_data = parse_ocr_pdf(
                self.pdf_bytes, self.model_list, self.image_writer,
                is_debug=self.is_debug,
                start_page_id=self.start_page_id,
                end_page_id=self.end_page_id,
                lang=self.lang,
            )

    def pipe_mk_uni_format(self, img_parent_path: str, drop_mode=DropMode.NONE_WITH_REASON):
        """Delegate to the parent ``pipe_mk_uni_format`` and log completion.

        Note the default ``drop_mode`` here is ``DropMode.NONE_WITH_REASON``,
        unlike ``pipe_mk_markdown`` below.
        """
        content_list = super().pipe_mk_uni_format(img_parent_path, drop_mode)
        logger.info('uni_pipe mk content list finished')
        return content_list

    def pipe_mk_markdown(self, img_parent_path: str, drop_mode=DropMode.WHOLE_PDF, md_make_mode=MakeMode.MM_MD):
        """Delegate to the parent ``pipe_mk_markdown`` and log the make mode."""
        md_content = super().pipe_mk_markdown(img_parent_path, drop_mode, md_make_mode)
        logger.info(f'uni_pipe mk {md_make_mode} finished')
        return md_content
|
63
|
-
|
64
|
-
|
65
|
-
if __name__ == '__main__':
    # Ad-hoc manual test; the paths below point at a developer's local disk.
    from magic_pdf.data.data_reader_writer import DataReader

    drw = DataReader(r'D:/project/20231108code-clean')

    # Load the PDF and its pre-computed model output (a JSON document).
    pdf_bytes = drw.read(r'linshixuqiu\19983-00.pdf')
    model_list = json.loads(drw.read(r'linshixuqiu\19983-00.json').decode())

    write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
    img_bucket_path = 'imgs'
    img_writer = DataWriter(join_path(write_path, img_bucket_path))

    # '_pdf_type' is passed empty because pipe_classify() below overwrites it.
    # (Alternative: call UNIPipe.classify(pdf_bytes) first and pass its result
    # as '_pdf_type'.)
    pipe = UNIPipe(pdf_bytes, {'_pdf_type': '', 'model_list': model_list}, img_writer)
    pipe.pipe_classify()
    pipe.pipe_parse()
    md_content = pipe.pipe_mk_markdown(img_bucket_path)
    content_list = pipe.pipe_mk_uni_format(img_bucket_path)

    # Persist the markdown, the intermediate data and the content list.
    md_writer = DataWriter(write_path)
    md_writer.write_string('19983-00.md', md_content)
    md_writer.write_string('19983-00.json', json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4))
    md_writer.write_string('19983-00.txt', str(content_list))
|
magic_pdf/pipe/__init__.py
DELETED
File without changes
|
magic_pdf/rw/AbsReaderWriter.py
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
from abc import ABC, abstractmethod
|
2
|
-
|
3
|
-
|
4
|
-
class AbsReaderWriter(ABC):
    """Abstract interface for path-based readers/writers.

    Concrete subclasses must implement ``read``, ``write`` and
    ``read_offset``; the base versions only raise ``NotImplementedError``.
    """

    # Mode selectors used as the ``mode`` argument of ``read`` and ``write``.
    MODE_TXT, MODE_BIN = "text", "binary"

    @abstractmethod
    def read(self, path: str, mode=MODE_TXT):
        """Read the content at ``path`` in the given mode (default: text)."""
        raise NotImplementedError()

    @abstractmethod
    def write(self, content: str, path: str, mode=MODE_TXT):
        """Write ``content`` to ``path`` in the given mode (default: text)."""
        raise NotImplementedError()

    @abstractmethod
    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
        """Return bytes from ``path`` starting at ``offset``.

        ``limit`` defaults to ``None``; its exact meaning is left to the
        implementation (presumably "no upper bound" - confirm in subclasses).
        """
        raise NotImplementedError()
|
magic_pdf/rw/DiskReaderWriter.py
DELETED
@@ -1,74 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
|
3
|
-
from loguru import logger
|
4
|
-
|
5
|
-
|
6
|
-
class DiskReaderWriter(AbsReaderWriter):
    """Local-filesystem reader/writer rooted at ``parent_path``.

    Relative paths passed to the methods are joined onto ``parent_path``;
    absolute paths are used as given.
    """

    def __init__(self, parent_path, encoding="utf-8"):
        """Store the root directory and the encoding used for text mode."""
        self.path = parent_path
        self.encoding = encoding

    def _abspath(self, path):
        """Return ``path`` if it is absolute, else ``self.path`` joined with it."""
        if os.path.isabs(path):
            return path
        return os.path.join(self.path, path)

    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
        """Read a whole file as ``str`` (text mode) or ``bytes`` (binary mode).

        Raises:
            Exception: if the resolved path does not exist (also logged).
            ValueError: if ``mode`` is neither text nor binary.
        """
        abspath = self._abspath(path)
        if not os.path.exists(abspath):
            logger.error(f"file {abspath} not exists")
            raise Exception(f"file {abspath} no exists")
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "r", encoding=self.encoding) as f:
                return f.read()
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "rb") as f:
                return f.read()
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
        """Write ``content`` to a file, creating missing parent directories.

        Text mode encodes with ``self.encoding`` and replaces unencodable
        characters; binary mode writes ``content`` as-is.

        Raises:
            ValueError: if ``mode`` is neither text nor binary.
        """
        abspath = self._abspath(path)
        directory_path = os.path.dirname(abspath)
        # dirname is "" when the target is a bare filename; os.makedirs("")
        # would raise, so only create a directory when there is one to create.
        # exist_ok avoids the check-then-create race with concurrent writers.
        if directory_path:
            os.makedirs(directory_path, exist_ok=True)
        if mode == AbsReaderWriter.MODE_TXT:
            with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                f.write(content)
        elif mode == AbsReaderWriter.MODE_BIN:
            with open(abspath, "wb") as f:
                f.write(content)
        else:
            raise ValueError("Invalid mode. Use 'text' or 'binary'.")

    def read_offset(self, path: str, offset=0, limit=None):
        """Return up to ``limit`` bytes starting at byte ``offset``.

        With ``limit=None`` everything from ``offset`` to end of file is read.
        """
        with open(self._abspath(path), "rb") as f:
            f.seek(offset)
            return f.read(limit)
|
53
|
-
|
54
|
-
|
55
|
-
if __name__ == "__main__":
    # Manual smoke checks; flip the literal 0/1 guards to choose which runs.
    if 0:
        file_path = "io/test/example.txt"
        # Raw string: "\p", "\M" and "\m" are not valid escape sequences, and
        # a non-raw literal triggers an invalid-escape warning. The runtime
        # value is unchanged.
        drw = DiskReaderWriter(r"D:\projects\papayfork\Magic-PDF\magic_pdf")

        # Write content to the file.
        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")

        # Read the content back from the file.
        content = drw.read(path=file_path)
        if content:
            logger.info(f"从 {file_path} 读取的内容: {content}")
    if 1:
        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
        # Expects a fixture file 1.txt containing exactly b"ABCD!".
        content_bin = drw.read_offset("1.txt")
        assert content_bin == b"ABCD!"

        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
        assert content_bin == b"BC"
|
74
|
-
|