magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +56 -25
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +18 -12
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  82. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  83. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  84. magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
  85. magic_pdf/tools/cli.py +30 -12
  86. magic_pdf/tools/common.py +90 -12
  87. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
  88. magic_pdf-1.3.1.dist-info/RECORD +203 -0
  89. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
  90. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  91. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  92. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  93. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  94. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  95. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  96. magic_pdf-1.2.2.dist-info/RECORD +0 -147
  97. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  98. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  99. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  100. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
  101. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
  102. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,203 @@
1
+ magic_pdf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ magic_pdf/pdf_parse_union_core_v2.py,sha256=6b7EHzNJrKGBIIRnK8zqeWn8tcnNZpP-7hYGP9DA82I,40384
3
+ magic_pdf/config/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ magic_pdf/config/constants.py,sha256=fXGzANULnJWLPxwYp3BEFWx-rnorzpySMx63ffyEyq4,1272
5
+ magic_pdf/config/drop_reason.py,sha256=CqjMzBE96Qo8OeFvhhhItY8WhyqsKhE3DmyJLoQZNCc,2248
6
+ magic_pdf/config/drop_tag.py,sha256=CjveyzhAsHm_bfXB7ZZNKruw1NR-WdKD8Hz6OhQdG0A,680
7
+ magic_pdf/config/enums.py,sha256=CImYuw4sbKpq9zrj6zrrEvtdoGkjxDt8S8ByNVDpypU,89
8
+ magic_pdf/config/exceptions.py,sha256=2tsJxYUebVeimyYBGQkc9Nd1kIakTmWmz3SDcfJWy54,784
9
+ magic_pdf/config/make_content_config.py,sha256=J2eJIhVHBPGwX18zVQomQUOxs8LcfeGLxLgdBEeRvLg,248
10
+ magic_pdf/config/model_block_type.py,sha256=y5ie2ZLvo-h8OdVk8HOEha6qK0OJFtLmtOhYjrV680g,166
11
+ magic_pdf/config/ocr_content_type.py,sha256=e_7RBTdShaWvWhMO2SFou7GM521elMH_Jtn5usbHWdY,890
12
+ magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ magic_pdf/data/batch_build_dataset.py,sha256=rS4f50hBc7IvSqa_Gd84E_tSYpQ66BMaeZkCPd5Ajxw,4601
14
+ magic_pdf/data/dataset.py,sha256=nsS507s1lPyfjnzEhfsQiBy_CdScPy79h3Fvjk_VKp0,12237
15
+ magic_pdf/data/read_api.py,sha256=_faBnYE3iU_EiQLNFjVM6a8IQtOGAcSQNYBZsTSN1d8,5225
16
+ magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
17
+ magic_pdf/data/utils.py,sha256=dNWIJECPXaakKocI4z5Tq6vhDDSnR-bVWQV7DO2w_A8,5335
18
+ magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
19
+ magic_pdf/data/data_reader_writer/base.py,sha256=nqmAcdHOXMOJO6RAT3ILligDFaw8Op0STyCw5yOzAbI,1706
20
+ magic_pdf/data/data_reader_writer/filebase.py,sha256=VbNAxLyo0Io0j7iprJERt_TqxzHAtA7cUyPIaJstToU,2146
21
+ magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=4pEJ8PPd3nX7sccHobCs0mbDM8BiqDP_sAEz7CIvpNI,5938
22
+ magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
23
+ magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
24
+ magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
25
+ magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
26
+ magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
27
+ magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
28
+ magic_pdf/dict2md/ocr_mkcontent.py,sha256=U4DqKfD4dJ2S5Z8NEAGhuLYkEOIeC-BWuArMbwi7BJs,13784
29
+ magic_pdf/filter/__init__.py,sha256=_7lSez_myu4b6cdzPpQ-NfREuqeBSq_QdyBPKVLyq2U,1505
30
+ magic_pdf/filter/pdf_classify_by_type.py,sha256=YNYXamxYgEiSujwilCNHOtrwpgJGDiQ597qJfardDVc,42354
31
+ magic_pdf/filter/pdf_meta_scan.py,sha256=eOuM0-JgaXvHolSgepGoNDJDmv_uITWLQpH_0MfnVQw,17478
32
+ magic_pdf/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
33
+ magic_pdf/integrations/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
+ magic_pdf/integrations/rag/api.py,sha256=t38wvIBzLje4_JzTP3dewMLqV-tQJ-A3B92Sj2oyrfs,2507
35
+ magic_pdf/integrations/rag/type.py,sha256=Z_1g_ZIOCsb7-FmZBudReIXj8nzGrgj_BygCalhJdmk,3193
36
+ magic_pdf/integrations/rag/utils.py,sha256=DCb-UhC8TElb6Eq7_6NmmETreKEk5DVE18hNL8sTEBk,11762
37
+ magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
+ magic_pdf/libs/boxbase.py,sha256=DKZXhwpJd-HE2_Du7NmkeeYW5gG-iwX3GeSWL7rYGv0,16956
39
+ magic_pdf/libs/clean_memory.py,sha256=OsQexCjmBO2i-Hv-0uYQfn72dbUWR8sTW81nG2zlNQQ,479
40
+ magic_pdf/libs/commons.py,sha256=xD0fGA16KNB5rhbl4zRrOqdrNHYwaRablT_s9W2ZTbw,1174
41
+ magic_pdf/libs/config_reader.py,sha256=9GXK7jtDyA_jxXKWZAQ69rB02v5UW4mOmo1IaOYOkW0,4374
42
+ magic_pdf/libs/convert_utils.py,sha256=Ov-lsfCLBPz_15iSJXIslBNmrSf_E_1g_XDWJy8NgO8,143
43
+ magic_pdf/libs/coordinate_transform.py,sha256=Bbop2cP2uz2ZG0U0gwd7J6EKkgABq5Rv03qf2LMPw80,429
44
+ magic_pdf/libs/draw_bbox.py,sha256=hpUmpPiQVu7UgWQa3M49dS22G6A9gcG2jpq4dQjTjzA,18331
45
+ magic_pdf/libs/hash_utils.py,sha256=VEKK9WfFoZgrPfi8kfITjLpr8Ahufs8tXh9R1Y5lAL8,404
46
+ magic_pdf/libs/json_compressor.py,sha256=6-KCu0lb5ksmyqWtQGb4QqmP-FjRb5dP7P-Hevcn68g,875
47
+ magic_pdf/libs/language.py,sha256=7RT3mxSa7jdpoC5ySd7ZddHA7TO7UsnmDOWiYZAxuyg,1433
48
+ magic_pdf/libs/local_math.py,sha256=tqljQOgqh3fZc146HYhO88JXJaiXMVwArBkk_CSGICc,177
49
+ magic_pdf/libs/markdown_utils.py,sha256=86v2BmsSV4NkoRZrH4uQD1youJhYFF3vIKr_vDeg3z0,270
50
+ magic_pdf/libs/path_utils.py,sha256=Hykw_l5CU736b2egHV9P7B-qh3QNKO4nZSGCbsi0Z8E,1043
51
+ magic_pdf/libs/pdf_check.py,sha256=7GWWvDR6g_rj_fE6XJlbTq5AFVX11ngRIzT0N18F214,3396
52
+ magic_pdf/libs/pdf_image_tools.py,sha256=_au7plmKKctpPKozBumSKgP8689q4vH1mU8VMLO0IbM,2260
53
+ magic_pdf/libs/performance_stats.py,sha256=DW-c6nUTUnWKGTONRKfpucsYZm1ake016F9K7jJwbik,2136
54
+ magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
55
+ magic_pdf/libs/version.py,sha256=-ypEJktJToAL9by62JJKWEzDo_KPCQtmE5kwFgX24z4,22
56
+ magic_pdf/model/__init__.py,sha256=sa-dO2k-TLy25I2gRrzjm_cQeYfzMf-pLwBJHkIxGo0,51
57
+ magic_pdf/model/batch_analyze.py,sha256=6vRqGnZjDqznsifeDZhjD_v8RmDSdDNxOAci8GCFozo,11211
58
+ magic_pdf/model/doc_analyze_by_custom_model.py,sha256=z1JWvM24poMd2SsziRJRzeqJ9rKXbqSwJprCheuXSGg,10282
59
+ magic_pdf/model/magic_model.py,sha256=yZKWo_wRck_-YLyFGRiUHGar8sV1Y6458BFLbyBAt74,30682
60
+ magic_pdf/model/model_list.py,sha256=aqfEJlEfbib3D3ISrxc0Coh6SbffYh8Yq2FlQN35_zA,213
61
+ magic_pdf/model/pdf_extract_kit.py,sha256=C3sKqRkoD20Ldmo-cqGn1zRldEL-l5NYqcFvd05_fGU,10845
62
+ magic_pdf/model/pp_structure_v2.py,sha256=NcqFWL4nUtjl82MFak8HX_8V3i4Aw_fK4dATrIp5uGs,3840
63
+ magic_pdf/model/sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
+ magic_pdf/model/sub_modules/model_init.py,sha256=e2t95kxiuU47luOHByDokbQ2uob6oQhEA4b6UGVfMjY,8303
65
+ magic_pdf/model/sub_modules/model_utils.py,sha256=GGkVqdGPTmPUaYTuPHxjzzxIizg1kmYo8voIdE7ETdg,2653
66
+ magic_pdf/model/sub_modules/language_detection/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
67
+ magic_pdf/model/sub_modules/language_detection/utils.py,sha256=Q__v6DdNJztt8GhVSuSB0txahVq-aj8RLhWn2VScx4w,3047
68
+ magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py,sha256=7T8eFl8zlZe6F0j0jSB3jrwOapDft320JQ1fuWxpvAY,5230
69
+ magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
70
+ magic_pdf/model/sub_modules/layout/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
71
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py,sha256=c-OmqY9DOgkIWqiDrNGe0-E5RS6-diwnPPsN5pbPw-s,2346
72
+ magic_pdf/model/sub_modules/layout/doclayout_yolo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
73
+ magic_pdf/model/sub_modules/layout/layoutlmv3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
74
+ magic_pdf/model/sub_modules/layout/layoutlmv3/backbone.py,sha256=1cvSCczgvwOLdvzWyqttoYPMHsXmnzI3w9abJ1bAXoM,7106
75
+ magic_pdf/model/sub_modules/layout/layoutlmv3/beit.py,sha256=e-INve6bpEx_0FM5wYbQcEcelc79tzDlCljTVHaGt1w,30450
76
+ magic_pdf/model/sub_modules/layout/layoutlmv3/deit.py,sha256=Qyn5UWutZ-0GJczexCh-oMMSXtav_g3ovumMFJp8Om4,17000
77
+ magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py,sha256=PhWqqRwgSSmXTaUlLIjGqnBUNjzxwYDKgMzKjnxNy1k,4528
78
+ magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py,sha256=nI4G6AeLRmjavNhs5S2USKh0ozn-ftMuW0F0m_eVy3c,6649
79
+ magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py,sha256=H6UYeCCbaN2gbDjGthTkKkPoyWxfE3azRjsR7fVBwnw,49797
80
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/__init__.py,sha256=C4N9gXJr7is7uznvQefQ7dOhlzEhdp86Lgh-7p0Y-08,186
81
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/__init__.py,sha256=W7V62JOh12NdMZj2H1sde3Il0AqW2VKplmHEsLle6tg,76
82
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/cord.py,sha256=jR_lRZxy8SeEvTK3FdlXmQHF0kefJf7ZqwM_8pvyI5E,8153
83
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/data_collator.py,sha256=M2TE47BprHSuQJYcoMeWOSpqkr_nh8VK6t2l26XWmxg,6279
84
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/funsd.py,sha256=Ez9tMeruHncJlkKQ7iRGBB9Pk1uWtgxlGeqs-sOmIG0,5214
85
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/image_utils.py,sha256=vuNOMzYw_h7jmaD2XUqkGlrjDEPB7XUts16GRICBmG4,10334
86
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/data/xfund.py,sha256=6jLKyc_4VhbHY4YEzBXm5RkPdsd9ldnUGXFZBLiJ-_s,8270
87
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/__init__.py,sha256=d5bm3Rx-jTrgfJDWrzD7t5R5CdHfug9dCNvUEneIYW4,190
88
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py,sha256=a04w_C0B4P9jF-3I_tXCj3fLmfFQR5XSKGbhgGm--pM,1216
89
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py,sha256=CJBcAmmLeRFVMN1YjWefoUW7hk0KXek0Eb_tergKl4Y,2150
90
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
91
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
92
+ magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
93
+ magic_pdf/model/sub_modules/mfd/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
+ magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py,sha256=Wm6jh70l_q_P_LQxa_pmLbg9OnHZyEKF1Dfln7Y2c8w,1114
95
+ magic_pdf/model/sub_modules/mfd/yolov8/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
96
+ magic_pdf/model/sub_modules/mfr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
97
+ magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py,sha256=hq48TP1Ac8Y4FlK7GamnU-WZTQfdZotxBKuFhOIjrcM,5349
98
+ magic_pdf/model/sub_modules/mfr/unimernet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py,sha256=kHcISG8GS4TWJW34SCJCei1jxo6HxvO00aC0dqyNFgI,413
100
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py,sha256=F_rwlFytWOwOntNhhZoUqFAyXgzvq_HVjKyBizwlnjo,7913
101
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py,sha256=9T2rBpyGX5YFQYj89-mWujRokOuz4xgNreBuegcg1_c,228
102
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py,sha256=hwRar3pqN_cVs3TRTNSuhB4wacBncfJ-qvaTajRb0xc,7934
103
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py,sha256=Y5nwqDRzSb8zAbY1U0HGJGVtFggJW1zATP8RdnasNcA,113605
104
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
105
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py,sha256=8_1DKwDCDUBkeHYiJJ6MZnodZBsatHbqhygh11s9eEA,267
106
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py,sha256=OX3eRUKBnKCXtxJOG3sdNoB1IV-Z7efgWU-gaclYOGA,5780
107
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py,sha256=tlcCxOJVRus_35lCz4faMVZ8ulAjGxK5Yu9Y_IeHsDY,4406
108
+ magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py,sha256=Q_fdmFHUBtEoAfWp9aowdwTCE2MIFMOPbYjoSyXK2iU,48929
109
+ magic_pdf/model/sub_modules/ocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
110
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
111
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/ocr_utils.py,sha256=3qxu0lAjqzZQ2Ci-C_wz_YSakyq_5-KnckA3-5bICTM,12589
112
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py,sha256=FaPo02L1IJKybGYfydsohOiHstJIL8d5UKzGck2tYvk,7283
113
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
114
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py,sha256=5bI7MAu65r-vn28krwdJ6pjZMkEvWjspE7EQaTsRERw,1319
115
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py,sha256=YYu3c-W4fgEErxxDM98uQ3oWwPEh-6w75LY4zcj4VtM,199
116
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py,sha256=c4H0gXPRweQ0wMFnkrCLTR6MrtG-e4kUinxwq2G1V9U,1480
117
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py,sha256=uwS8t-6hVIBag3jJd3yiBM4DW_dEiynp22_WFmVppjA,14205
118
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
119
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py,sha256=3r2jTvPYQS4IgTvIqR4l6bBVwR7jn-87rSmpv3tlqxI,2294
120
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py,sha256=DCA9FS4mE5oCHDlBhUrkYLdxFeQIbhPj4P8oJ_gRZD8,832
121
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py,sha256=RhV2Dm-os08kCFylT57zRu72Hq_RJdFy3xQe1MPaCuU,3588
122
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py,sha256=TOLvLNeJhnAOus0D5jAq2TM67vLrtbzYlThTOhc-idc,1960
123
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py,sha256=r0gWnA1Xmt0Zw4FQLx7kf-WWwZd_26PfNzhM05drcuE,8334
124
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py,sha256=UsIbzqN_koyGoSh1TA9r27SggpHbeKS3HmmS-A2Aw04,8341
125
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py,sha256=3PNQG9B1cHe1hAg0NhcFR8p87rZnmH0jTBcfid-ZnKQ,15995
126
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py,sha256=mJmE6xGpjHZH2Vaw16LlIlqRFFm9R9yRsSJEa3Yn3nw,4822
127
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py,sha256=K4p9KFYNmltV3y3QsxHIASNxoqlGtxgAoCxeFofyCmw,6726
128
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py,sha256=AIaUZ3IWBkRz2pWmanBjS0QdJcYnimMSV4MWofNpQcg,20222
129
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py,sha256=7vl3hyn6Ug_DNtqdfUL1Hz9scA_ptch2FCDzNddpQgU,1282
130
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py,sha256=puIy5GlUtAKer6eS4HWKu07PzRd-HlDAqIz5WqjBHaA,596
131
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py,sha256=-k8bpuGQw_xIVDsumrfimOxg0O-oP2MOAyDJTjU70Ro,3633
132
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py,sha256=ywyk5RJgUITdXvrUZk2yBSWKsaZIqnTofdFbuQUtwjU,1311
133
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py,sha256=K40SMA8tAVWu-3fwgfh3jGWeVFAdVnMyHjeZeI9OO7Q,2016
134
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py,sha256=634L1y-QWv5P8opNiSmKvQEx3Uskc20RG8DYiCdbl8U,1030
135
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py,sha256=TLF2pSyvRC0oPzL0eVyNlg3W6Zvfr4J8fD1nziVB7uI,14146
136
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py,sha256=w2QdwdI9BpiW92VS4mqL31sVERIbY53TfbD5Q6okiaY,3410
137
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py,sha256=olZwnKQexVlMx0gJi0FVYAm38TxNn5BM6F-OrdHKEgk,7019
138
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py,sha256=iC1Ol6CTxRWZBUyQ_5IVMR6kIurv9WJPOWWo7NAuZBA,1183
139
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py,sha256=1VVWXT_b1vhGb7PGvqyfUQ3Ip7LupH62vPva98GtjTA,685
140
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py,sha256=EAc-cmhXtxLfFA6h5C871hIcDfXsjPDISiVCKwxh-qM,6339
141
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py,sha256=HiHNr4bhW5U1j4pYoyi8fPOaFsn8TUc4nSB6q8chfV4,26899
142
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
143
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml,sha256=eEzg5D5L3MHFL4H02gZnxdDiqtSCUzZDnt5pqDAmgCI,6980
144
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml,sha256=GOtAGMAretviqDXak409PPav7qHYMDBwSs9wxlSANRA,1388
145
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt,sha256=xbaXD14RWk0Vpc7fAHpephuszp1j-Qi3IWC4VrFKu70,407
146
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt,sha256=gyVR_uHy-8l1CHctgevcjboSwA3pejXHHJ3fQ92sGoM,33443
147
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt,sha256=NpqCxsjEeXhKXXJkSLg7Hq-1_vCkEppeqjkpYl3c0TI,410
148
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt,sha256=tfG-bYu_8aGfuWxdTKlqQjOAI0u30s4OB7WDittNGOo,508
149
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt,sha256=VmLfnS0D8OjKDTsGSdasurkEtqFLPTUhRjxxw3xmjOM,190
150
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt,sha256=Hc_LQe7JBXapRbMITyKt4RztUG4k8Uh5JFsHFpjzCOg,17332
151
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt,sha256=-tP3ZZQyde7CE0pvvJtSeFQmZBEE1OfbOhWdxz80Hd4,452
152
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt,sha256=qh_ciuj3zUCg7E7bRy6wQh4RQn5sz-6ZFUQHQsGLCiA,14480
153
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt,sha256=jm1ONil4jDXDH35TAofWFHtUm7eiZb1nCLsoETRCniw,468
154
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt,sha256=KLI2KtSrLcOHaapy_rU146nds_0qdYWgWSDmOTsdx_c,26249
155
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt,sha256=6T5pSBSv2f8ekYtvS7Qmf7TGWpNE7l10ZPkTW5DAonA,352
156
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt,sha256=7plGpg13AZd0dOiYg2lKTKIOqjhoojM0v3lA3NAI8Pk,429
157
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py,sha256=xEqR65Z8YOzOLorLjK0LCHos2zX-tCuxSrxndjU00hE,49
158
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
159
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py,sha256=8RmKl1vejnZl65caHZNV2ta6hMsg5B_LE-FuqCO8T8A,4225
160
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py,sha256=cRBKE0blzryj3Ar6yM0FKKgxmZdgMc44NDNl1S2wiRs,9136
161
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py,sha256=_fLTWjEmDZwXC-zzPT37PHO-nNlEvafemo2CyPJS7_w,19216
162
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py,sha256=hkegkn6hq2v2zqHVAP615-k-fkTS8swRYSbZeoqmSI8,3822
163
+ magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py,sha256=i1PFN-_kefJUUZ4Vk7igs1TU8gfErTDlDXY6-8Uaurw,9323
164
+ magic_pdf/model/sub_modules/reading_oreder/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
167
+ magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py,sha256=ezNSq_Y4UXiztB58hbXJsjTJlOBqWIjuW5A2uLSaZSo,7349
168
+ magic_pdf/model/sub_modules/table/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
169
+ magic_pdf/model/sub_modules/table/table_utils.py,sha256=B9BC4f5EEjlt2ldYxrIC8Wic2Tz3t3gTJeEyK3ggrOU,282
170
+ magic_pdf/model/sub_modules/table/rapidtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
171
+ magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py,sha256=w9nTdoTV5EJsG8ZlshNig0cdaMwlQ3XlQF1MKVuMwD8,2785
172
+ magic_pdf/operators/__init__.py,sha256=liU2-WYUvsQ1G4PYBppyvokS9z5IjrnlVMtoBAC1REI,2630
173
+ magic_pdf/operators/models.py,sha256=mRqbCVrxxaUVDpEBAsXaK7EL1M-goICkE1W0FYgewio,5305
174
+ magic_pdf/operators/pipes.py,sha256=XgBgisKQd_ruW-3Tw4v5LhqloZUHgn2aFcpi_q8LbCs,6767
175
+ magic_pdf/post_proc/__init__.py,sha256=8CRrCQVuExa0BttRFh3Z40lFy2K5jN0sp67KWjOlj5c,50
176
+ magic_pdf/post_proc/llm_aided.py,sha256=yzhu2cCpUZjdwf3v0swYDgSs9VWIfMAoXepYIP1EMZs,6367
177
+ magic_pdf/post_proc/para_split_v3.py,sha256=SPN_VVGvFX5KpFMGw9OzgoE-kTZq-FF036i0cIImGH8,16975
178
+ magic_pdf/pre_proc/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
179
+ magic_pdf/pre_proc/construct_page_dict.py,sha256=OFmq5XRKi6fYIo-lmGlL-NB16Sf0egzsfEx-fT2uYrc,660
180
+ magic_pdf/pre_proc/cut_image.py,sha256=NDzbxwD7z7Tb4uAxL4KR6LzURFdN1Tzr4nPvj-VmEqc,1225
181
+ magic_pdf/pre_proc/ocr_detect_all_bboxes.py,sha256=nt88ttXCEI_1ihAF7HU15SQjwM69V-iJmk-L_nyzA6o,9328
182
+ magic_pdf/pre_proc/ocr_dict_merge.py,sha256=PscKGF0uJIjMxZRM69FLUs1SZO_wOswDQQV1f0M2xAo,5627
183
+ magic_pdf/pre_proc/ocr_span_list_modify.py,sha256=bs5RLvk4kIyx9_Hqq0FU3AGPPxE8Sxs97Uwlf1sBryM,4725
184
+ magic_pdf/pre_proc/remove_bbox_overlap.py,sha256=mcdxAh4P56NZ3Ij8h3vW8qC_SrszfXflVWuWUuUiTNg,3089
185
+ magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
186
+ magic_pdf/resources/model_config/model_configs.yaml,sha256=SoyoDmXYQX4ltKclG6ZcpdSA8dX5auSgILBvaA0wPkg,325
187
+ magic_pdf/resources/slanet_plus/slanet-plus.onnx,sha256=1XqUKvai9X1qSgNyVzxpaiN5v1hXxF4qxpmT87M0UUs,7758305
188
+ magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt,sha256=dV4lcudF8wimEbAooYbvISvFhrXjp9i0rMRqv9VW6hY,3204667
189
+ magic_pdf/spark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
+ magic_pdf/spark/spark_api.py,sha256=BYO6zlRW0cEnIUB3ZzNQTu_LsPHEVitqiUN7gy3x_wo,1124
191
+ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
192
+ magic_pdf/tools/cli.py,sha256=_oa-M5Hcopa5RZudVzrEip2W8pa9422Lmat7tMBJO5M,5171
193
+ magic_pdf/tools/cli_dev.py,sha256=3RbubfTIagWoFYdu8wSDanr-BJDjFGeDet55jTy7He0,3948
194
+ magic_pdf/tools/common.py,sha256=LoUz6Y36_U2odZqzBNKXngFNa6plf01U7_5jlDAFXaQ,12313
195
+ magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
196
+ magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
197
+ magic_pdf/utils/office_to_pdf.py,sha256=7aj-Ls2v8saD-Rgu_t3FIc-J3Ka9wnmiEH5zY-H1Vxs,729
198
+ magic_pdf-1.3.1.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
199
+ magic_pdf-1.3.1.dist-info/METADATA,sha256=PGXFggL8ni7iXJ5qUXfZLGZqXrbEi9TUhLYzCVxduWw,43499
200
+ magic_pdf-1.3.1.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
201
+ magic_pdf-1.3.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
202
+ magic_pdf-1.3.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
203
+ magic_pdf-1.3.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: bdist_wheel (0.45.1)
2
+ Generator: bdist_wheel (0.42.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,204 +0,0 @@
1
- import copy
2
- import platform
3
- import time
4
- import cv2
5
- import numpy as np
6
- import torch
7
-
8
- from paddleocr import PaddleOCR
9
- from ppocr.utils.logging import get_logger
10
- from ppocr.utils.utility import alpha_to_color, binarize_img
11
- from tools.infer.predict_system import sorted_boxes
12
- from tools.infer.utility import get_rotate_crop_image, get_minarea_rect_crop
13
-
14
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes, merge_det_boxes, check_img, \
15
- ONNXModelSingleton
16
-
17
- logger = get_logger()
18
-
19
-
20
- class ModifiedPaddleOCR(PaddleOCR):
21
- def __init__(self, *args, **kwargs):
22
-
23
- super().__init__(*args, **kwargs)
24
- self.lang = kwargs.get('lang', 'ch')
25
- # 在cpu架构为arm且不支持cuda时调用onnx、
26
- if not torch.cuda.is_available() and platform.machine() in ['arm64', 'aarch64']:
27
- self.use_onnx = True
28
- onnx_model_manager = ONNXModelSingleton()
29
- self.additional_ocr = onnx_model_manager.get_onnx_model(**kwargs)
30
- else:
31
- self.use_onnx = False
32
-
33
- def ocr(self,
34
- img,
35
- det=True,
36
- rec=True,
37
- cls=True,
38
- bin=False,
39
- inv=False,
40
- alpha_color=(255, 255, 255),
41
- mfd_res=None,
42
- ):
43
- """
44
- OCR with PaddleOCR
45
- args:
46
- img: img for OCR, support ndarray, img_path and list or ndarray
47
- det: use text detection or not. If False, only rec will be exec. Default is True
48
- rec: use text recognition or not. If False, only det will be exec. Default is True
49
- cls: use angle classifier or not. Default is True. If True, the text with rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance. Text with rotation of 90 or 270 degrees can be recognized even if cls=False.
50
- bin: binarize image to black and white. Default is False.
51
- inv: invert image colors. Default is False.
52
- alpha_color: set RGB color Tuple for transparent parts replacement. Default is pure white.
53
- """
54
- assert isinstance(img, (np.ndarray, list, str, bytes))
55
- if isinstance(img, list) and det == True:
56
- logger.error('When input a list of images, det must be false')
57
- exit(0)
58
- if cls == True and self.use_angle_cls == False:
59
- pass
60
- # logger.warning(
61
- # 'Since the angle classifier is not initialized, it will not be used during the forward process'
62
- # )
63
-
64
- img = check_img(img)
65
- # for infer pdf file
66
- if isinstance(img, list):
67
- if self.page_num > len(img) or self.page_num == 0:
68
- self.page_num = len(img)
69
- imgs = img[:self.page_num]
70
- else:
71
- imgs = [img]
72
-
73
- def preprocess_image(_image):
74
- _image = alpha_to_color(_image, alpha_color)
75
- if inv:
76
- _image = cv2.bitwise_not(_image)
77
- if bin:
78
- _image = binarize_img(_image)
79
- return _image
80
-
81
- if det and rec:
82
- ocr_res = []
83
- for img in imgs:
84
- img = preprocess_image(img)
85
- dt_boxes, rec_res, _ = self.__call__(img, cls, mfd_res=mfd_res)
86
- if not dt_boxes and not rec_res:
87
- ocr_res.append(None)
88
- continue
89
- tmp_res = [[box.tolist(), res]
90
- for box, res in zip(dt_boxes, rec_res)]
91
- ocr_res.append(tmp_res)
92
- return ocr_res
93
- elif det and not rec:
94
- ocr_res = []
95
- for img in imgs:
96
- img = preprocess_image(img)
97
- if self.lang in ['ch'] and self.use_onnx:
98
- dt_boxes, elapse = self.additional_ocr.text_detector(img)
99
- else:
100
- dt_boxes, elapse = self.text_detector(img)
101
- if dt_boxes is None:
102
- ocr_res.append(None)
103
- continue
104
- dt_boxes = sorted_boxes(dt_boxes)
105
- # merge_det_boxes 和 update_det_boxes 都会把poly转成bbox再转回poly,因此需要过滤所有倾斜程度较大的文本框
106
- dt_boxes = merge_det_boxes(dt_boxes)
107
- if mfd_res:
108
- bef = time.time()
109
- dt_boxes = update_det_boxes(dt_boxes, mfd_res)
110
- aft = time.time()
111
- logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
112
- len(dt_boxes), aft - bef))
113
- tmp_res = [box.tolist() for box in dt_boxes]
114
- ocr_res.append(tmp_res)
115
- return ocr_res
116
- else:
117
- ocr_res = []
118
- cls_res = []
119
- for img in imgs:
120
- if not isinstance(img, list):
121
- img = preprocess_image(img)
122
- img = [img]
123
- if self.use_angle_cls and cls:
124
- img, cls_res_tmp, elapse = self.text_classifier(img)
125
- if not rec:
126
- cls_res.append(cls_res_tmp)
127
- if self.lang in ['ch'] and self.use_onnx:
128
- rec_res, elapse = self.additional_ocr.text_recognizer(img)
129
- else:
130
- rec_res, elapse = self.text_recognizer(img)
131
- ocr_res.append(rec_res)
132
- if not rec:
133
- return cls_res
134
- return ocr_res
135
-
136
- def __call__(self, img, cls=True, mfd_res=None):
137
- time_dict = {'det': 0, 'rec': 0, 'cls': 0, 'all': 0}
138
-
139
- if img is None:
140
- logger.debug("no valid image provided")
141
- return None, None, time_dict
142
-
143
- start = time.time()
144
- ori_im = img.copy()
145
- if self.lang in ['ch'] and self.use_onnx:
146
- dt_boxes, elapse = self.additional_ocr.text_detector(img)
147
- else:
148
- dt_boxes, elapse = self.text_detector(img)
149
- time_dict['det'] = elapse
150
-
151
- if dt_boxes is None:
152
- logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
153
- end = time.time()
154
- time_dict['all'] = end - start
155
- return None, None, time_dict
156
- else:
157
- logger.debug("dt_boxes num : {}, elapsed : {}".format(
158
- len(dt_boxes), elapse))
159
- img_crop_list = []
160
-
161
- dt_boxes = sorted_boxes(dt_boxes)
162
-
163
- # merge_det_boxes 和 update_det_boxes 都会把poly转成bbox再转回poly,因此需要过滤所有倾斜程度较大的文本框
164
- dt_boxes = merge_det_boxes(dt_boxes)
165
-
166
- if mfd_res:
167
- bef = time.time()
168
- dt_boxes = update_det_boxes(dt_boxes, mfd_res)
169
- aft = time.time()
170
- logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
171
- len(dt_boxes), aft - bef))
172
-
173
- for bno in range(len(dt_boxes)):
174
- tmp_box = copy.deepcopy(dt_boxes[bno])
175
- if self.args.det_box_type == "quad":
176
- img_crop = get_rotate_crop_image(ori_im, tmp_box)
177
- else:
178
- img_crop = get_minarea_rect_crop(ori_im, tmp_box)
179
- img_crop_list.append(img_crop)
180
- if self.use_angle_cls and cls:
181
- img_crop_list, angle_list, elapse = self.text_classifier(
182
- img_crop_list)
183
- time_dict['cls'] = elapse
184
- logger.debug("cls num : {}, elapsed : {}".format(
185
- len(img_crop_list), elapse))
186
- if self.lang in ['ch'] and self.use_onnx:
187
- rec_res, elapse = self.additional_ocr.text_recognizer(img_crop_list)
188
- else:
189
- rec_res, elapse = self.text_recognizer(img_crop_list)
190
- time_dict['rec'] = elapse
191
- logger.debug("rec_res num : {}, elapsed : {}".format(
192
- len(rec_res), elapse))
193
- if self.args.save_crop_res:
194
- self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list,
195
- rec_res)
196
- filter_boxes, filter_rec_res = [], []
197
- for box, rec_result in zip(dt_boxes, rec_res):
198
- text, score = rec_result
199
- if score >= self.drop_score:
200
- filter_boxes.append(box)
201
- filter_rec_res.append(rec_result)
202
- end = time.time()
203
- time_dict['all'] = end - start
204
- return filter_boxes, filter_rec_res, time_dict
@@ -1,213 +0,0 @@
1
- import copy
2
- import time
3
-
4
-
5
- import cv2
6
- import numpy as np
7
- from paddleocr import PaddleOCR
8
- from paddleocr.paddleocr import check_img, logger
9
- from paddleocr.ppocr.utils.utility import alpha_to_color, binarize_img
10
- from paddleocr.tools.infer.predict_system import sorted_boxes
11
- from paddleocr.tools.infer.utility import slice_generator, merge_fragmented, get_rotate_crop_image, \
12
- get_minarea_rect_crop
13
-
14
- from magic_pdf.model.sub_modules.ocr.paddleocr.ocr_utils import update_det_boxes
15
-
16
-
17
- class ModifiedPaddleOCR(PaddleOCR):
18
-
19
- def ocr(
20
- self,
21
- img,
22
- det=True,
23
- rec=True,
24
- cls=True,
25
- bin=False,
26
- inv=False,
27
- alpha_color=(255, 255, 255),
28
- slice={},
29
- mfd_res=None,
30
- ):
31
- """
32
- OCR with PaddleOCR
33
-
34
- Args:
35
- img: Image for OCR. It can be an ndarray, img_path, or a list of ndarrays.
36
- det: Use text detection or not. If False, only text recognition will be executed. Default is True.
37
- rec: Use text recognition or not. If False, only text detection will be executed. Default is True.
38
- cls: Use angle classifier or not. Default is True. If True, the text with a rotation of 180 degrees can be recognized. If no text is rotated by 180 degrees, use cls=False to get better performance.
39
- bin: Binarize image to black and white. Default is False.
40
- inv: Invert image colors. Default is False.
41
- alpha_color: Set RGB color Tuple for transparent parts replacement. Default is pure white.
42
- slice: Use sliding window inference for large images. Both det and rec must be True. Requires int values for slice["horizontal_stride"], slice["vertical_stride"], slice["merge_x_thres"], slice["merge_y_thres"] (See doc/doc_en/slice_en.md). Default is {}.
43
-
44
- Returns:
45
- If both det and rec are True, returns a list of OCR results for each image. Each OCR result is a list of bounding boxes and recognized text for each detected text region.
46
- If det is True and rec is False, returns a list of detected bounding boxes for each image.
47
- If det is False and rec is True, returns a list of recognized text for each image.
48
- If both det and rec are False, returns a list of angle classification results for each image.
49
-
50
- Raises:
51
- AssertionError: If the input image is not of type ndarray, list, str, or bytes.
52
- SystemExit: If det is True and the input is a list of images.
53
-
54
- Note:
55
- - If the angle classifier is not initialized (use_angle_cls=False), it will not be used during the forward process.
56
- - For PDF files, if the input is a list of images and the page_num is specified, only the first page_num images will be processed.
57
- - The preprocess_image function is used to preprocess the input image by applying alpha color replacement, inversion, and binarization if specified.
58
- """
59
- assert isinstance(img, (np.ndarray, list, str, bytes))
60
- if isinstance(img, list) and det == True:
61
- logger.error("When input a list of images, det must be false")
62
- exit(0)
63
- if cls == True and self.use_angle_cls == False:
64
- logger.warning(
65
- "Since the angle classifier is not initialized, it will not be used during the forward process"
66
- )
67
-
68
- img, flag_gif, flag_pdf = check_img(img, alpha_color)
69
- # for infer pdf file
70
- if isinstance(img, list) and flag_pdf:
71
- if self.page_num > len(img) or self.page_num == 0:
72
- imgs = img
73
- else:
74
- imgs = img[: self.page_num]
75
- else:
76
- imgs = [img]
77
-
78
- def preprocess_image(_image):
79
- _image = alpha_to_color(_image, alpha_color)
80
- if inv:
81
- _image = cv2.bitwise_not(_image)
82
- if bin:
83
- _image = binarize_img(_image)
84
- return _image
85
-
86
- if det and rec:
87
- ocr_res = []
88
- for img in imgs:
89
- img = preprocess_image(img)
90
- dt_boxes, rec_res, _ = self.__call__(img, cls, slice, mfd_res=mfd_res)
91
- if not dt_boxes and not rec_res:
92
- ocr_res.append(None)
93
- continue
94
- tmp_res = [[box.tolist(), res] for box, res in zip(dt_boxes, rec_res)]
95
- ocr_res.append(tmp_res)
96
- return ocr_res
97
- elif det and not rec:
98
- ocr_res = []
99
- for img in imgs:
100
- img = preprocess_image(img)
101
- dt_boxes, elapse = self.text_detector(img)
102
- if dt_boxes.size == 0:
103
- ocr_res.append(None)
104
- continue
105
- tmp_res = [box.tolist() for box in dt_boxes]
106
- ocr_res.append(tmp_res)
107
- return ocr_res
108
- else:
109
- ocr_res = []
110
- cls_res = []
111
- for img in imgs:
112
- if not isinstance(img, list):
113
- img = preprocess_image(img)
114
- img = [img]
115
- if self.use_angle_cls and cls:
116
- img, cls_res_tmp, elapse = self.text_classifier(img)
117
- if not rec:
118
- cls_res.append(cls_res_tmp)
119
- rec_res, elapse = self.text_recognizer(img)
120
- ocr_res.append(rec_res)
121
- if not rec:
122
- return cls_res
123
- return ocr_res
124
-
125
- def __call__(self, img, cls=True, slice={}, mfd_res=None):
126
- time_dict = {"det": 0, "rec": 0, "cls": 0, "all": 0}
127
-
128
- if img is None:
129
- logger.debug("no valid image provided")
130
- return None, None, time_dict
131
-
132
- start = time.time()
133
- ori_im = img.copy()
134
- if slice:
135
- slice_gen = slice_generator(
136
- img,
137
- horizontal_stride=slice["horizontal_stride"],
138
- vertical_stride=slice["vertical_stride"],
139
- )
140
- elapsed = []
141
- dt_slice_boxes = []
142
- for slice_crop, v_start, h_start in slice_gen:
143
- dt_boxes, elapse = self.text_detector(slice_crop, use_slice=True)
144
- if dt_boxes.size:
145
- dt_boxes[:, :, 0] += h_start
146
- dt_boxes[:, :, 1] += v_start
147
- dt_slice_boxes.append(dt_boxes)
148
- elapsed.append(elapse)
149
- dt_boxes = np.concatenate(dt_slice_boxes)
150
-
151
- dt_boxes = merge_fragmented(
152
- boxes=dt_boxes,
153
- x_threshold=slice["merge_x_thres"],
154
- y_threshold=slice["merge_y_thres"],
155
- )
156
- elapse = sum(elapsed)
157
- else:
158
- dt_boxes, elapse = self.text_detector(img)
159
-
160
- time_dict["det"] = elapse
161
-
162
- if dt_boxes is None:
163
- logger.debug("no dt_boxes found, elapsed : {}".format(elapse))
164
- end = time.time()
165
- time_dict["all"] = end - start
166
- return None, None, time_dict
167
- else:
168
- logger.debug(
169
- "dt_boxes num : {}, elapsed : {}".format(len(dt_boxes), elapse)
170
- )
171
- img_crop_list = []
172
-
173
- dt_boxes = sorted_boxes(dt_boxes)
174
-
175
- if mfd_res:
176
- bef = time.time()
177
- dt_boxes = update_det_boxes(dt_boxes, mfd_res)
178
- aft = time.time()
179
- logger.debug("split text box by formula, new dt_boxes num : {}, elapsed : {}".format(
180
- len(dt_boxes), aft - bef))
181
-
182
- for bno in range(len(dt_boxes)):
183
- tmp_box = copy.deepcopy(dt_boxes[bno])
184
- if self.args.det_box_type == "quad":
185
- img_crop = get_rotate_crop_image(ori_im, tmp_box)
186
- else:
187
- img_crop = get_minarea_rect_crop(ori_im, tmp_box)
188
- img_crop_list.append(img_crop)
189
- if self.use_angle_cls and cls:
190
- img_crop_list, angle_list, elapse = self.text_classifier(img_crop_list)
191
- time_dict["cls"] = elapse
192
- logger.debug(
193
- "cls num : {}, elapsed : {}".format(len(img_crop_list), elapse)
194
- )
195
- if len(img_crop_list) > 1000:
196
- logger.debug(
197
- f"rec crops num: {len(img_crop_list)}, time and memory cost may be large."
198
- )
199
-
200
- rec_res, elapse = self.text_recognizer(img_crop_list)
201
- time_dict["rec"] = elapse
202
- logger.debug("rec_res num : {}, elapsed : {}".format(len(rec_res), elapse))
203
- if self.args.save_crop_res:
204
- self.draw_crop_rec_res(self.args.crop_res_save_dir, img_crop_list, rec_res)
205
- filter_boxes, filter_rec_res = [], []
206
- for box, rec_result in zip(dt_boxes, rec_res):
207
- text, score = rec_result[0], rec_result[1]
208
- if score >= self.drop_score:
209
- filter_boxes.append(box)
210
- filter_rec_res.append(rec_result)
211
- end = time.time()
212
- time_dict["all"] = end - start
213
- return filter_boxes, filter_rec_res, time_dict