magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/read_api.py +1 -1
  7. magic_pdf/dict2md/mkcontent.py +226 -185
  8. magic_pdf/dict2md/ocr_mkcontent.py +12 -12
  9. magic_pdf/filter/pdf_meta_scan.py +101 -79
  10. magic_pdf/integrations/rag/utils.py +4 -5
  11. magic_pdf/libs/config_reader.py +6 -6
  12. magic_pdf/libs/draw_bbox.py +13 -6
  13. magic_pdf/libs/pdf_image_tools.py +36 -12
  14. magic_pdf/libs/version.py +1 -1
  15. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  16. magic_pdf/model/magic_model.py +13 -13
  17. magic_pdf/model/pdf_extract_kit.py +142 -351
  18. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +21 -0
  19. magic_pdf/model/sub_modules/mfd/__init__.py +0 -0
  20. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +12 -0
  21. magic_pdf/model/sub_modules/mfd/yolov8/__init__.py +0 -0
  22. magic_pdf/model/sub_modules/mfr/__init__.py +0 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +98 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/__init__.py +0 -0
  25. magic_pdf/model/sub_modules/model_init.py +149 -0
  26. magic_pdf/model/sub_modules/model_utils.py +51 -0
  27. magic_pdf/model/sub_modules/ocr/__init__.py +0 -0
  28. magic_pdf/model/sub_modules/ocr/paddleocr/__init__.py +0 -0
  29. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +285 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +176 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +213 -0
  32. magic_pdf/model/sub_modules/reading_oreder/__init__.py +0 -0
  33. magic_pdf/model/sub_modules/reading_oreder/layoutreader/__init__.py +0 -0
  34. magic_pdf/model/sub_modules/reading_oreder/layoutreader/xycut.py +242 -0
  35. magic_pdf/model/sub_modules/table/__init__.py +0 -0
  36. magic_pdf/model/sub_modules/table/rapidtable/__init__.py +0 -0
  37. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +16 -0
  38. magic_pdf/model/sub_modules/table/structeqtable/__init__.py +0 -0
  39. magic_pdf/model/{pek_sub_modules/structeqtable/StructTableModel.py → sub_modules/table/structeqtable/struct_eqtable.py} +3 -11
  40. magic_pdf/model/sub_modules/table/table_utils.py +11 -0
  41. magic_pdf/model/sub_modules/table/tablemaster/__init__.py +0 -0
  42. magic_pdf/model/{ppTableModel.py → sub_modules/table/tablemaster/tablemaster_paddle.py} +31 -29
  43. magic_pdf/para/para_split.py +411 -248
  44. magic_pdf/para/para_split_v2.py +352 -182
  45. magic_pdf/para/para_split_v3.py +121 -66
  46. magic_pdf/pdf_parse_by_ocr.py +2 -0
  47. magic_pdf/pdf_parse_by_txt.py +2 -0
  48. magic_pdf/pdf_parse_union_core.py +174 -100
  49. magic_pdf/pdf_parse_union_core_v2.py +253 -50
  50. magic_pdf/pipe/AbsPipe.py +28 -44
  51. magic_pdf/pipe/OCRPipe.py +5 -5
  52. magic_pdf/pipe/TXTPipe.py +5 -6
  53. magic_pdf/pipe/UNIPipe.py +24 -25
  54. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  55. magic_pdf/pre_proc/cut_image.py +9 -11
  56. magic_pdf/pre_proc/equations_replace.py +203 -212
  57. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  58. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  59. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  60. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  61. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  62. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  63. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  64. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  65. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  66. magic_pdf/resources/model_config/model_configs.yaml +2 -1
  67. magic_pdf/spark/spark_api.py +15 -17
  68. magic_pdf/tools/cli.py +3 -4
  69. magic_pdf/tools/cli_dev.py +6 -9
  70. magic_pdf/tools/common.py +70 -36
  71. magic_pdf/user_api.py +29 -38
  72. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +18 -13
  73. magic_pdf-0.10.0.dist-info/RECORD +198 -0
  74. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +1 -1
  75. magic_pdf/libs/Constants.py +0 -53
  76. magic_pdf/libs/MakeContentConfig.py +0 -11
  77. magic_pdf/libs/drop_reason.py +0 -27
  78. magic_pdf/libs/drop_tag.py +0 -19
  79. magic_pdf/model/pek_sub_modules/post_process.py +0 -36
  80. magic_pdf/model/pek_sub_modules/self_modify.py +0 -388
  81. magic_pdf/para/para_pipeline.py +0 -297
  82. magic_pdf-0.9.2.dist-info/RECORD +0 -178
  83. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  84. /magic_pdf/model/{pek_sub_modules → sub_modules}/__init__.py +0 -0
  85. /magic_pdf/model/{pek_sub_modules/layoutlmv3 → sub_modules/layout}/__init__.py +0 -0
  86. /magic_pdf/model/{pek_sub_modules/structeqtable → sub_modules/layout/doclayout_yolo}/__init__.py +0 -0
  87. /magic_pdf/model/{v3 → sub_modules/layout/layoutlmv3}/__init__.py +0 -0
  88. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/backbone.py +0 -0
  89. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/beit.py +0 -0
  90. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/deit.py +0 -0
  91. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/__init__.py +0 -0
  92. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/__init__.py +0 -0
  93. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/cord.py +0 -0
  94. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/data_collator.py +0 -0
  95. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/funsd.py +0 -0
  96. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/image_utils.py +0 -0
  97. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/data/xfund.py +0 -0
  98. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/__init__.py +0 -0
  99. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +0 -0
  100. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +0 -0
  101. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +0 -0
  102. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +0 -0
  103. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +0 -0
  104. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/model_init.py +0 -0
  105. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/rcnn_vl.py +0 -0
  106. /magic_pdf/model/{pek_sub_modules → sub_modules/layout}/layoutlmv3/visualizer.py +0 -0
  107. /magic_pdf/model/{v3 → sub_modules/reading_oreder/layoutreader}/helpers.py +0 -0
  108. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
  109. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
  110. {magic_pdf-0.9.2.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,11 @@
1
+ import re
2
+
3
+
4
+ def minify_html(html):
5
+ # 移除多余的空白字符
6
+ html = re.sub(r'\s+', ' ', html)
7
+ # 移除行尾的空白字符
8
+ html = re.sub(r'\s*>\s*', '>', html)
9
+ # 移除标签前的空白字符
10
+ html = re.sub(r'\s*<\s*', '<', html)
11
+ return html.strip()
@@ -1,23 +1,25 @@
1
+ import os
2
+
1
3
  import cv2
4
+ import numpy as np
2
5
  from paddleocr.ppstructure.table.predict_table import TableSystem
3
6
  from paddleocr.ppstructure.utility import init_args
4
- from magic_pdf.libs.Constants import *
5
- import os
6
7
  from PIL import Image
7
- import numpy as np
8
8
 
9
+ from magic_pdf.config.constants import * # noqa: F403
9
10
 
10
- class ppTableModel(object):
11
- """
12
- This class is responsible for converting image of table into HTML format using a pre-trained model.
13
11
 
14
- Attributes:
15
- - table_sys: An instance of TableSystem initialized with parsed arguments.
12
+ class TableMasterPaddleModel(object):
13
+ """This class is responsible for converting image of table into HTML format
14
+ using a pre-trained model.
15
+
16
+ Attributes:
17
+ - table_sys: An instance of TableSystem initialized with parsed arguments.
16
18
 
17
- Methods:
18
- - __init__(config): Initializes the model with configuration parameters.
19
- - img2html(image): Converts a PIL Image or NumPy array to HTML string.
20
- - parse_args(**kwargs): Parses configuration arguments.
19
+ Methods:
20
+ - __init__(config): Initializes the model with configuration parameters.
21
+ - img2html(image): Converts a PIL Image or NumPy array to HTML string.
22
+ - parse_args(**kwargs): Parses configuration arguments.
21
23
  """
22
24
 
23
25
  def __init__(self, config):
@@ -40,30 +42,30 @@ class ppTableModel(object):
40
42
  image = np.asarray(image)
41
43
  image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
42
44
  pred_res, _ = self.table_sys(image)
43
- pred_html = pred_res["html"]
45
+ pred_html = pred_res['html']
44
46
  # res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
45
47
  # "</table></body></html>","") + "</table></td>\n"
46
48
  return pred_html
47
49
 
48
50
  def parse_args(self, **kwargs):
49
51
  parser = init_args()
50
- model_dir = kwargs.get("model_dir")
51
- table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
52
- table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
53
- det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
54
- rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
55
- rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
56
- device = kwargs.get("device", "cpu")
57
- use_gpu = True if device.startswith("cuda") else False
52
+ model_dir = kwargs.get('model_dir')
53
+ table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR) # noqa: F405
54
+ table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT) # noqa: F405
55
+ det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR) # noqa: F405
56
+ rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) # noqa: F405
57
+ rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) # noqa: F405
58
+ device = kwargs.get('device', 'cpu')
59
+ use_gpu = True if device.startswith('cuda') else False
58
60
  config = {
59
- "use_gpu": use_gpu,
60
- "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
61
- "table_algorithm": "TableMaster",
62
- "table_model_dir": table_model_dir,
63
- "table_char_dict_path": table_char_dict_path,
64
- "det_model_dir": det_model_dir,
65
- "rec_model_dir": rec_model_dir,
66
- "rec_char_dict_path": rec_char_dict_path,
61
+ 'use_gpu': use_gpu,
62
+ 'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN), # noqa: F405
63
+ 'table_algorithm': 'TableMaster',
64
+ 'table_model_dir': table_model_dir,
65
+ 'table_char_dict_path': table_char_dict_path,
66
+ 'det_model_dir': det_model_dir,
67
+ 'rec_model_dir': rec_model_dir,
68
+ 'rec_char_dict_path': rec_char_dict_path,
67
69
  }
68
70
  parser.set_defaults(**config)
69
71
  return parser.parse_args([])