magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/config/constants.py +53 -0
- magic_pdf/config/drop_reason.py +35 -0
- magic_pdf/config/drop_tag.py +19 -0
- magic_pdf/config/make_content_config.py +11 -0
- magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
- magic_pdf/data/data_reader_writer/filebase.py +3 -0
- magic_pdf/data/read_api.py +1 -1
- magic_pdf/dict2md/mkcontent.py +226 -185
- magic_pdf/dict2md/ocr_mkcontent.py +11 -11
- magic_pdf/filter/pdf_meta_scan.py +101 -79
- magic_pdf/integrations/rag/utils.py +4 -5
- magic_pdf/libs/config_reader.py +5 -5
- magic_pdf/libs/draw_bbox.py +3 -2
- magic_pdf/libs/pdf_image_tools.py +36 -12
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
- magic_pdf/model/magic_model.py +13 -13
- magic_pdf/model/pdf_extract_kit.py +122 -76
- magic_pdf/model/sub_modules/model_init.py +40 -35
- magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
- magic_pdf/para/para_split.py +411 -248
- magic_pdf/para/para_split_v2.py +352 -182
- magic_pdf/para/para_split_v3.py +110 -53
- magic_pdf/pdf_parse_by_ocr.py +2 -0
- magic_pdf/pdf_parse_by_txt.py +2 -0
- magic_pdf/pdf_parse_union_core.py +174 -100
- magic_pdf/pdf_parse_union_core_v2.py +202 -36
- magic_pdf/pipe/AbsPipe.py +28 -44
- magic_pdf/pipe/OCRPipe.py +5 -5
- magic_pdf/pipe/TXTPipe.py +5 -6
- magic_pdf/pipe/UNIPipe.py +24 -25
- magic_pdf/post_proc/pdf_post_filter.py +7 -14
- magic_pdf/pre_proc/cut_image.py +9 -11
- magic_pdf/pre_proc/equations_replace.py +203 -212
- magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
- magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
- magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
- magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
- magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
- magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
- magic_pdf/pre_proc/remove_footer_header.py +2 -5
- magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
- magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
- magic_pdf/spark/spark_api.py +15 -17
- magic_pdf/tools/cli.py +3 -4
- magic_pdf/tools/cli_dev.py +6 -9
- magic_pdf/tools/common.py +26 -36
- magic_pdf/user_api.py +29 -38
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
- magic_pdf/libs/Constants.py +0 -55
- magic_pdf/libs/MakeContentConfig.py +0 -11
- magic_pdf/libs/drop_reason.py +0 -27
- magic_pdf/libs/drop_tag.py +0 -19
- magic_pdf/para/para_pipeline.py +0 -297
- magic_pdf/{libs → config}/ocr_content_type.py +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,23 +1,25 @@
|
|
1
|
+
import os
|
2
|
+
|
1
3
|
import cv2
|
4
|
+
import numpy as np
|
2
5
|
from paddleocr.ppstructure.table.predict_table import TableSystem
|
3
6
|
from paddleocr.ppstructure.utility import init_args
|
4
|
-
from magic_pdf.libs.Constants import *
|
5
|
-
import os
|
6
7
|
from PIL import Image
|
7
|
-
|
8
|
+
|
9
|
+
from magic_pdf.config.constants import * # noqa: F403
|
8
10
|
|
9
11
|
|
10
12
|
class TableMasterPaddleModel(object):
|
11
|
-
"""
|
12
|
-
|
13
|
+
"""This class is responsible for converting image of table into HTML format
|
14
|
+
using a pre-trained model.
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
+
Attributes:
|
17
|
+
- table_sys: An instance of TableSystem initialized with parsed arguments.
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
19
|
+
Methods:
|
20
|
+
- __init__(config): Initializes the model with configuration parameters.
|
21
|
+
- img2html(image): Converts a PIL Image or NumPy array to HTML string.
|
22
|
+
- parse_args(**kwargs): Parses configuration arguments.
|
21
23
|
"""
|
22
24
|
|
23
25
|
def __init__(self, config):
|
@@ -40,30 +42,30 @@ class TableMasterPaddleModel(object):
|
|
40
42
|
image = np.asarray(image)
|
41
43
|
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
|
42
44
|
pred_res, _ = self.table_sys(image)
|
43
|
-
pred_html = pred_res[
|
45
|
+
pred_html = pred_res['html']
|
44
46
|
# res = '<td><table border="1">' + pred_html.replace("<html><body><table>", "").replace(
|
45
47
|
# "</table></body></html>","") + "</table></td>\n"
|
46
48
|
return pred_html
|
47
49
|
|
48
50
|
def parse_args(self, **kwargs):
|
49
51
|
parser = init_args()
|
50
|
-
model_dir = kwargs.get(
|
51
|
-
table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
|
52
|
-
table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
|
53
|
-
det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
|
54
|
-
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
|
55
|
-
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
|
56
|
-
device = kwargs.get(
|
57
|
-
use_gpu = True if device.startswith(
|
52
|
+
model_dir = kwargs.get('model_dir')
|
53
|
+
table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR) # noqa: F405
|
54
|
+
table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT) # noqa: F405
|
55
|
+
det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR) # noqa: F405
|
56
|
+
rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR) # noqa: F405
|
57
|
+
rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT) # noqa: F405
|
58
|
+
device = kwargs.get('device', 'cpu')
|
59
|
+
use_gpu = True if device.startswith('cuda') else False
|
58
60
|
config = {
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
61
|
+
'use_gpu': use_gpu,
|
62
|
+
'table_max_len': kwargs.get('table_max_len', TABLE_MAX_LEN), # noqa: F405
|
63
|
+
'table_algorithm': 'TableMaster',
|
64
|
+
'table_model_dir': table_model_dir,
|
65
|
+
'table_char_dict_path': table_char_dict_path,
|
66
|
+
'det_model_dir': det_model_dir,
|
67
|
+
'rec_model_dir': rec_model_dir,
|
68
|
+
'rec_char_dict_path': rec_char_dict_path,
|
67
69
|
}
|
68
70
|
parser.set_defaults(**config)
|
69
71
|
return parser.parse_args([])
|