magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,10 @@
1
1
  import enum
2
2
  import json
3
3
 
4
+ from magic_pdf.config.model_block_type import ModelBlockTypeEnum
5
+ from magic_pdf.config.ocr_content_type import CategoryId, ContentType
6
+ from magic_pdf.data.data_reader_writer import (FileBasedDataReader,
7
+ FileBasedDataWriter)
4
8
  from magic_pdf.data.dataset import Dataset
5
9
  from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
6
10
  bbox_relative_pos, box_area, calculate_iou,
@@ -9,11 +13,7 @@ from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
9
13
  from magic_pdf.libs.commons import fitz, join_path
10
14
  from magic_pdf.libs.coordinate_transform import get_scale_ratio
11
15
  from magic_pdf.libs.local_math import float_gt
12
- from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
13
- from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
14
16
  from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
15
- from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16
- from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
17
17
 
18
18
  CAPATION_OVERLAP_AREA_RATIO = 0.6
19
19
  MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
@@ -1050,27 +1050,27 @@ class MagicModel:
1050
1050
 
1051
1051
 
1052
1052
  if __name__ == '__main__':
1053
- drw = DiskReaderWriter(r'D:/project/20231108code-clean')
1053
+ drw = FileBasedDataReader(r'D:/project/20231108code-clean')
1054
1054
  if 0:
1055
1055
  pdf_file_path = r'linshixuqiu\19983-00.pdf'
1056
1056
  model_file_path = r'linshixuqiu\19983-00_new.json'
1057
- pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
1058
- model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
1057
+ pdf_bytes = drw.read(pdf_file_path)
1058
+ model_json_txt = drw.read(model_file_path).decode()
1059
1059
  model_list = json.loads(model_json_txt)
1060
1060
  write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
1061
1061
  img_bucket_path = 'imgs'
1062
- img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
1062
+ img_writer = FileBasedDataWriter(join_path(write_path, img_bucket_path))
1063
1063
  pdf_docs = fitz.open('pdf', pdf_bytes)
1064
1064
  magic_model = MagicModel(model_list, pdf_docs)
1065
1065
 
1066
1066
  if 1:
1067
+ from magic_pdf.data.dataset import PymuDocDataset
1068
+
1067
1069
  model_list = json.loads(
1068
1070
  drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
1069
1071
  )
1070
- pdf_bytes = drw.read(
1071
- '/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
1072
- )
1073
- pdf_docs = fitz.open('pdf', pdf_bytes)
1074
- magic_model = MagicModel(model_list, pdf_docs)
1072
+ pdf_bytes = drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf')
1073
+
1074
+ magic_model = MagicModel(model_list, PymuDocDataset(pdf_bytes))
1075
1075
  for i in range(7):
1076
1076
  print(magic_model.get_imgs(i))