PyPI - magic-pdf - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl - Mend

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

magic_pdf/dict2md/ocr_mkcontent.py +20 -7
magic_pdf/libs/config_reader.py +28 -10
magic_pdf/libs/language.py +12 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/__init__.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
magic_pdf/model/magic_model.py +49 -41
magic_pdf/model/pdf_extract_kit.py +155 -60
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
magic_pdf/model/pp_structure_v2.py +1 -1
magic_pdf/pdf_parse_union_core.py +4 -2
magic_pdf/pre_proc/citationmarker_remove.py +5 -1
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
magic_pdf/resources/model_config/model_configs.yaml +4 -0
magic_pdf/rw/AbsReaderWriter.py +1 -18
magic_pdf/rw/DiskReaderWriter.py +32 -24
magic_pdf/rw/S3ReaderWriter.py +83 -48
magic_pdf/tools/cli.py +79 -0
magic_pdf/tools/cli_dev.py +156 -0
magic_pdf/tools/common.py +119 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
magic_pdf/cli/magicpdf.py +0 -337
magic_pdf/pdf_parse_for_train.py +0 -685
magic_pdf/train_utils/convert_to_train_format.py +0 -65
magic_pdf/train_utils/extract_caption.py +0 -59
magic_pdf/train_utils/remove_footer_header.py +0 -159
magic_pdf/train_utils/vis_utils.py +0 -327
magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
/magic_pdf/libs/{math.py → local_math.py} +0 -0
/magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
/magic_pdf/{train_utils → tools}/__init__.py +0 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -120,15 +120,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
+                table_caption = ''
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block)
+                        table_caption = merge_para_with_text(block)
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:
                             for span in line['spans']:
                                 if span['type'] == ContentType.Table:
-                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                                    # if processed by table model
+                                    if span.get('latex', ''):
+                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    else:
+                                        para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
                         para_text += merge_para_with_text(block)
@@ -163,7 +168,7 @@ def merge_para_with_text(para_block):
                 else:
                     content = ocr_escape_special_markdown_char(content)
             elif span_type == ContentType.InlineEquation:
-                content = f"${span['content']}$"
+                content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
@@ -210,28 +215,32 @@ def para_to_standard_format(para, img_buket_path):
     return para_content
-def para_to_standard_format_v2(para_block, img_buket_path):
+def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
     para_type = para_block['type']
     if para_type == BlockType.Text:
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
+            'page_idx': page_idx
         }
     elif para_type == BlockType.Title:
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'text_level': 1
+            'text_level': 1,
+            'page_idx': page_idx
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
             'type': 'equation',
             'text': merge_para_with_text(para_block),
-            'text_format': "latex"
+            'text_format': "latex",
+            'page_idx': page_idx
         }
     elif para_type == BlockType.Image:
         para_content = {
             'type': 'image',
+            'page_idx': page_idx
         }
         for block in para_block['blocks']:
             if block['type'] == BlockType.ImageBody:
@@ -241,9 +250,12 @@ def para_to_standard_format_v2(para_block, img_buket_path):
     elif para_type == BlockType.Table:
         para_content = {
             'type': 'table',
+            'page_idx': page_idx
         }
         for block in para_block['blocks']:
             if block['type'] == BlockType.TableBody:
+                if block["lines"][0]["spans"][0].get('latex', ''):
+                    para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)
@@ -345,6 +357,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
                 raise Exception(f"drop_mode can not be null")
         paras_of_layout = page_info.get("para_blocks")
+        page_idx = page_info.get("page_idx")
         if not paras_of_layout:
             continue
         if make_mode == MakeMode.MM_MD:
@@ -355,7 +368,7 @@ def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_pa
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.STANDARD_FORMAT:
             for para_block in paras_of_layout:
-                para_content = para_to_standard_format_v2(para_block, img_buket_path)
+                para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
                 output_content.append(para_content)
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
         return '\n\n'.join(output_content)

magic_pdf/libs/config_reader.py CHANGED Viewed

@@ -10,16 +10,19 @@ from loguru import logger
 from magic_pdf.libs.commons import parse_bucket_key
+# 定义配置文件名常量
+CONFIG_FILE_NAME = "magic-pdf.json"
 def read_config():
     home_dir = os.path.expanduser("~")
-    config_file = os.path.join(home_dir, "magic-pdf.json")
+    config_file = os.path.join(home_dir, CONFIG_FILE_NAME)
     if not os.path.exists(config_file):
-        raise Exception(f"{config_file} not found")
+        raise FileNotFoundError(f"{config_file} not found")
-    with open(config_file, "r") as f:
+    with open(config_file, "r", encoding="utf-8") as f:
         config = json.load(f)
     return config
@@ -37,7 +40,7 @@ def get_s3_config(bucket_name: str):
         access_key, secret_key, storage_endpoint = bucket_info[bucket_name]
     if access_key is None or secret_key is None or storage_endpoint is None:
-        raise Exception("ak, sk or endpoint not found in magic-pdf.json")
+        raise Exception(f"ak, sk or endpoint not found in {CONFIG_FILE_NAME}")
     # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}")
@@ -54,19 +57,34 @@ def get_bucket_name(path):
     return bucket
-def get_local_dir():
+def get_local_models_dir():
     config = read_config()
-    return config.get("temp-output-dir", "/tmp")
+    models_dir = config.get("models-dir")
+    if models_dir is None:
+        logger.warning(f"'models-dir' not found in {CONFIG_FILE_NAME}, use '/tmp/models' as default")
+        return "/tmp/models"
+    else:
+        return models_dir
-def get_local_models_dir():
+def get_device():
     config = read_config()
-    return config.get("models-dir", "/tmp/models")
+    device = config.get("device-mode")
+    if device is None:
+        logger.warning(f"'device-mode' not found in {CONFIG_FILE_NAME}, use 'cpu' as default")
+        return "cpu"
+    else:
+        return device
-def get_device():
+def get_table_recog_config():
     config = read_config()
-    return config.get("device-mode", "cpu")
+    table_config = config.get("table-config")
+    if table_config is None:
+        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
+        return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
+    else:
+        return table_config
 if __name__ == "__main__":

magic_pdf/libs/language.py CHANGED Viewed

@@ -1,8 +1,19 @@
+import os
 import unicodedata
+if not os.getenv("FTLANG_CACHE"):
+    current_file_path = os.path.abspath(__file__)
+    current_dir = os.path.dirname(current_file_path)
+    root_dir = os.path.dirname(current_dir)
+    ftlang_cache_dir = os.path.join(root_dir, 'resources', 'fasttext-langdetect')
+    os.environ["FTLANG_CACHE"] = str(ftlang_cache_dir)
+    # print(os.getenv("FTLANG_CACHE"))
 from fast_langdetect import detect_language
 def detect_lang(text: str) -> str:
     if len(text) == 0:
         return ""
     try:
@@ -18,6 +29,7 @@ def detect_lang(text: str) -> str:
 if __name__ == '__main__':
+    print(os.getenv("FTLANG_CACHE"))
     print(detect_lang("This is a test."))
     print(detect_lang("<html>This is a test</html>"))
     print(detect_lang("这个是中文测试。"))

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.6.1"
1	+ __version__ = "0.7.0a1"

magic_pdf/model/__init__.py CHANGED Viewed

@@ -1,2 +1,2 @@
-__use_inside_model__ = False
+__use_inside_model__ = True
 __model_mode__ = "full"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import fitz
 import numpy as np
 from loguru import logger
-from magic_pdf.libs.config_reader import get_local_models_dir, get_device
+from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
 from magic_pdf.model.model_list import MODEL
 import magic_pdf.model as model_config
@@ -48,10 +48,28 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
     return images
-def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
+class ModelSingleton:
+    _instance = None
+    _models = {}
+    def __new__(cls, *args, **kwargs):
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+    def get_model(self, ocr: bool, show_log: bool):
+        key = (ocr, show_log)
+        if key not in self._models:
+            self._models[key] = custom_model_init(ocr=ocr, show_log=show_log)
+        return self._models[key]
+def custom_model_init(ocr: bool = False, show_log: bool = False):
     model = None
     if model_config.__model_mode__ == "lite":
+        logger.warning("The Lite mode is provided for developers to conduct testing only, and the output quality is "
+                       "not guaranteed to be reliable.")
         model = MODEL.Paddle
     elif model_config.__model_mode__ == "full":
         model = MODEL.PEK
@@ -66,7 +84,13 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
             # 从配置文件读取model-dir和device
             local_models_dir = get_local_models_dir()
             device = get_device()
-            custom_model = CustomPEKModel(ocr=ocr, show_log=show_log, models_dir=local_models_dir, device=device)
+            table_config = get_table_recog_config()
+            model_input = {"ocr": ocr,
+                           "show_log": show_log,
+                           "models_dir": local_models_dir,
+                           "device": device,
+                           "table_config": table_config}
+            custom_model = CustomPEKModel(**model_input)
         else:
             logger.error("Not allow model_name!")
             exit(1)
@@ -76,6 +100,14 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
         logger.error("use_inside_model is False, not allow to use inside model")
         exit(1)
+    return custom_model
+def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
+    model_manager = ModelSingleton()
+    custom_model = model_manager.get_model(ocr, show_log)
     images = load_images_from_pdf(pdf_bytes)
     model_json = []

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -9,13 +9,14 @@ from magic_pdf.libs.coordinate_transform import get_scale_ratio
 from magic_pdf.libs.ocr_content_type import ContentType
 from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.libs.math import float_gt
+from magic_pdf.libs.local_math import float_gt
 from magic_pdf.libs.boxbase import (
     _is_in,
     bbox_relative_pos,
     bbox_distance,
     _is_part_overlap,
-    calculate_overlap_area_in_bbox1_area_ratio, calculate_iou,
+    calculate_overlap_area_in_bbox1_area_ratio,
+    calculate_iou,
 )
 from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
@@ -78,9 +79,23 @@ class MagicModel:
                 for layout_det2 in layout_dets:
                     if layout_det1 == layout_det2:
                         continue
-                    if layout_det1["category_id"] in [0,1,2,3,4,5,6,7,8,9] and layout_det2["category_id"] in [0,1,2,3,4,5,6,7,8,9]:
-                        if calculate_iou(layout_det1['bbox'], layout_det2['bbox']) > 0.9:
-                            if layout_det1['score'] < layout_det2['score']:
+                    if layout_det1["category_id"] in [
+                        0,
+                        1,
+                        2,
+                        3,
+                        4,
+                        5,
+                        6,
+                        7,
+                        8,
+                        9,
+                    ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
+                        if (
+                            calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
+                            > 0.9
+                        ):
+                            if layout_det1["score"] < layout_det2["score"]:
                                 layout_det_need_remove = layout_det1
                             else:
                                 layout_det_need_remove = layout_det2
@@ -97,11 +112,11 @@ class MagicModel:
     def __init__(self, model_list: list, docs: fitz.Document):
         self.__model_list = model_list
         self.__docs = docs
-        '''为所有模型数据添加bbox信息(缩放，poly->bbox)'''
+        """为所有模型数据添加bbox信息(缩放，poly->bbox)"""
         self.__fix_axis()
-        '''删除置信度特别低的模型数据(<0.05),提高质量'''
+        """删除置信度特别低的模型数据(<0.05),提高质量"""
         self.__fix_by_remove_low_confidence()
-        '''删除高iou(>0.9)数据中置信度较低的那个'''
+        """删除高iou(>0.9)数据中置信度较低的那个"""
         self.__fix_by_remove_high_iou_and_low_confidence()
     def __reduct_overlap(self, bboxes):
@@ -125,16 +140,6 @@ class MagicModel:
         ret = []
         MAX_DIS_OF_POINT = 10**9 + 7
-        def expand_bbox(bbox1, bbox2):
-            x0 = min(bbox1[0], bbox2[0])
-            y0 = min(bbox1[1], bbox2[1])
-            x1 = max(bbox1[2], bbox2[2])
-            y1 = max(bbox1[3], bbox2[3])
-            return [x0, y0, x1, y1]
-        def get_bbox_area(bbox):
-            return abs(bbox[2] - bbox[0]) * abs(bbox[3] - bbox[1])
         # subject 和 object 的 bbox 会合并成一个大的 bbox （named: merged bbox）。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
         # 再求出筛选出的 subjects 和 object 的最短距离！
         def may_find_other_nearest_bbox(subject_idx, object_idx):
@@ -177,6 +182,13 @@ class MagicModel:
             return ret
+        def expand_bbbox(idxes):
+            x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
+            y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
+            x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
+            y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
+            return min(x0s), min(y0s), max(x1s), max(y1s)
         subjects = self.__reduct_overlap(
             list(
                 map(
@@ -268,7 +280,9 @@ class MagicModel:
                     or dis[i][j] == MAX_DIS_OF_POINT
                 ):
                     continue
-                left, right, _, _ = bbox_relative_pos(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]) # 由  pos_flag_count 相关逻辑保证本段逻辑准确性
+                left, right, _, _ = bbox_relative_pos(
+                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                )  # 由  pos_flag_count 相关逻辑保证本段逻辑准确性
                 if left or right:
                     one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
                 else:
@@ -322,6 +336,10 @@ class MagicModel:
                             break
                     if is_nearest:
+                        nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
+                        n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
+                        if float_gt(dis[i][j], n_dis):
+                            continue
                         tmp.append(k)
                         seen.add(k)
@@ -331,20 +349,7 @@ class MagicModel:
             # 已经获取到某个 figure 下所有的最靠近的 captions，以及最靠近这些 captions 的 captions 。
             # 先扩一下 bbox，
-            x0s = [all_bboxes[idx]["bbox"][0] for idx in seen] + [
-                all_bboxes[i]["bbox"][0]
-            ]
-            y0s = [all_bboxes[idx]["bbox"][1] for idx in seen] + [
-                all_bboxes[i]["bbox"][1]
-            ]
-            x1s = [all_bboxes[idx]["bbox"][2] for idx in seen] + [
-                all_bboxes[i]["bbox"][2]
-            ]
-            y1s = [all_bboxes[idx]["bbox"][3] for idx in seen] + [
-                all_bboxes[i]["bbox"][3]
-            ]
-            ox0, oy0, ox1, oy1 = min(x0s), min(y0s), max(x1s), max(y1s)
+            ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
             ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
             # 分成了 4 个截取空间，需要计算落在每个截取空间下 objects 合并后占据的矩形面积
@@ -455,8 +460,10 @@ class MagicModel:
                 with_caption_subject.add(j)
         return ret, total_subject_object_dis
-    def get_imgs(self, page_no: int):  # @许瑞
-        records, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
+    def get_imgs(self, page_no: int):
+        figure_captions, _ = self.__tie_up_category_by_distance(
+            page_no, 3, 4
+        )
         return [
             {
                 "bbox": record["all"],
@@ -464,7 +471,7 @@ class MagicModel:
                 "img_caption_bbox": record.get("object_body", None),
                 "score": record["score"],
             }
-            for record in records
+            for record in figure_captions
         ]
     def get_tables(
@@ -535,6 +542,7 @@ class MagicModel:
                 if not any(span == existing_span for existing_span in new_spans):
                     new_spans.append(span)
             return new_spans
         all_spans = []
         model_page_info = self.__model_list[page_no]
         layout_dets = model_page_info["layout_dets"]
@@ -548,13 +556,14 @@ class MagicModel:
         for layout_det in layout_dets:
             category_id = layout_det["category_id"]
             if category_id in allow_category_id_list:
-                span = {
-                    "bbox": layout_det["bbox"],
-                    "score": layout_det["score"]
-                }
+                span = {"bbox": layout_det["bbox"], "score": layout_det["score"]}
                 if category_id == 3:
                     span["type"] = ContentType.Image
                 elif category_id == 5:
+                    # 获取table模型结果
+                    latex = layout_det.get("latex", None)
+                    if latex:
+                        span["latex"] = latex
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]
@@ -604,7 +613,6 @@ class MagicModel:
         return self.__model_list[page_no]
 if __name__ == "__main__":
     drw = DiskReaderWriter(r"D:/project/20231108code-clean")
     if 0:

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl