magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. magic_pdf/config/constants.py +53 -0
  2. magic_pdf/config/drop_reason.py +35 -0
  3. magic_pdf/config/drop_tag.py +19 -0
  4. magic_pdf/config/make_content_config.py +11 -0
  5. magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
  6. magic_pdf/data/data_reader_writer/filebase.py +3 -0
  7. magic_pdf/data/read_api.py +1 -1
  8. magic_pdf/dict2md/mkcontent.py +226 -185
  9. magic_pdf/dict2md/ocr_mkcontent.py +11 -11
  10. magic_pdf/filter/pdf_meta_scan.py +101 -79
  11. magic_pdf/integrations/rag/utils.py +4 -5
  12. magic_pdf/libs/config_reader.py +5 -5
  13. magic_pdf/libs/draw_bbox.py +3 -2
  14. magic_pdf/libs/pdf_image_tools.py +36 -12
  15. magic_pdf/libs/version.py +1 -1
  16. magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
  17. magic_pdf/model/magic_model.py +13 -13
  18. magic_pdf/model/pdf_extract_kit.py +122 -76
  19. magic_pdf/model/sub_modules/model_init.py +40 -35
  20. magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
  21. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
  22. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
  23. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
  24. magic_pdf/para/para_split.py +411 -248
  25. magic_pdf/para/para_split_v2.py +352 -182
  26. magic_pdf/para/para_split_v3.py +110 -53
  27. magic_pdf/pdf_parse_by_ocr.py +2 -0
  28. magic_pdf/pdf_parse_by_txt.py +2 -0
  29. magic_pdf/pdf_parse_union_core.py +174 -100
  30. magic_pdf/pdf_parse_union_core_v2.py +202 -36
  31. magic_pdf/pipe/AbsPipe.py +28 -44
  32. magic_pdf/pipe/OCRPipe.py +5 -5
  33. magic_pdf/pipe/TXTPipe.py +5 -6
  34. magic_pdf/pipe/UNIPipe.py +24 -25
  35. magic_pdf/post_proc/pdf_post_filter.py +7 -14
  36. magic_pdf/pre_proc/cut_image.py +9 -11
  37. magic_pdf/pre_proc/equations_replace.py +203 -212
  38. magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
  39. magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
  40. magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
  41. magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
  42. magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
  43. magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
  44. magic_pdf/pre_proc/remove_footer_header.py +2 -5
  45. magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
  46. magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
  47. magic_pdf/spark/spark_api.py +15 -17
  48. magic_pdf/tools/cli.py +3 -4
  49. magic_pdf/tools/cli_dev.py +6 -9
  50. magic_pdf/tools/common.py +26 -36
  51. magic_pdf/user_api.py +29 -38
  52. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
  53. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
  54. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
  55. magic_pdf/libs/Constants.py +0 -55
  56. magic_pdf/libs/MakeContentConfig.py +0 -11
  57. magic_pdf/libs/drop_reason.py +0 -27
  58. magic_pdf/libs/drop_tag.py +0 -19
  59. magic_pdf/para/para_pipeline.py +0 -297
  60. /magic_pdf/{libs → config}/ocr_content_type.py +0 -0
  61. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
  62. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
  63. {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,25 @@
1
+ import os
2
+
1
3
  import cv2
4
+ import numpy as np
2
5
  from paddleocr.ppstructure.table.predict_table import TableSystem
3
6
  from paddleocr.ppstructure.utility import init_args
4
- from magic_pdf.libs.Constants import *
5
- import os
6
7
  from PIL import Image
7
- import numpy as np
8
+
9
+ from magic_pdf.config.constants import * # noqa: F403
8
10
 
9
11
 
10
12
  class TableMasterPaddleModel(object):
11
- """
12
- This class is responsible for converting image of table into HTML format using a pre-trained model.
13
+ """This class is responsible for converting image of table into HTML format
14
+ using a pre-trained model.
13
15
 
14
- Attributes:
15
- - table_sys: An instance of TableSystem initialized with parsed arguments.
16
+ Attributes:
17
+ - table_sys: An instance of TableSystem initialized with parsed arguments.
16
18
 
17
- Methods:
18
- - __init__(config): Initializes the model with configuration parameters.
19
- - img2html(image): Converts a PIL Image or NumPy array to HTML string.
20
- - parse_args(**kwargs): Parses configuration arguments.
19
+ Methods:
20
+ - __init__(config): Initializes the model with configuration parameters.
21
+ - img2html(image): Converts a PIL Image or NumPy array to HTML string.
22
+ - parse_args(**kwargs): Parses configuration arguments.
21
23
  """
22
24
 
23
25
  def __init__(self, config):
@@ -40,30 +42,30 @@ class TableMasterPaddleModel(object):
40
42
  image = np.asarray(image)
41
43
  image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
42
44
  pred_res, _ = self.table_sys(image)
43
- pred_html = pred_res["html"]
45
+ pred_html = pred_res['html']
44
46
  # res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
45
47
  # "</table></body></html>","") + "</table></td>\n"
46
48
  return pred_html
47
49
 
48
50
  def parse_args(self, **kwargs):
49
51
  parser = init_args()
50
- model_dir = kwargs.get("model_dir")
51
- table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
52
- table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
53
- det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
54
- rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
55
- rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
56
- device = kwargs.get("device", "cpu")
57
- use_gpu = True if device.startswith("cuda") else False
52
+ model_dir = kwargs.get('model_dir')
53
+ table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR) # noqa: F405
54
+ table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT) # noqa: F405
55
+ det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR) # noqa: F405
56
+ rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) # noqa: F405
57
+ rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) # noqa: F405
58
+ device = kwargs.get('device', 'cpu')
59
+ use_gpu = True if device.startswith('cuda') else False
58
60
  config = {
59
- "use_gpu": use_gpu,
60
- "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
61
- "table_algorithm": "TableMaster",
62
- "table_model_dir": table_model_dir,
63
- "table_char_dict_path": table_char_dict_path,
64
- "det_model_dir": det_model_dir,
65
- "rec_model_dir": rec_model_dir,
66
- "rec_char_dict_path": rec_char_dict_path,
61
+ 'use_gpu': use_gpu,
62
+ 'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN), # noqa: F405
63
+ 'table_algorithm': 'TableMaster',
64
+ 'table_model_dir': table_model_dir,
65
+ 'table_char_dict_path': table_char_dict_path,
66
+ 'det_model_dir': det_model_dir,
67
+ 'rec_model_dir': rec_model_dir,
68
+ 'rec_char_dict_path': rec_char_dict_path,
67
69
  }
68
70
  parser.set_defaults(**config)
69
71
  return parser.parse_args([])