PyPI - magic-pdf - Versions diffs - 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl - Mend

magic-pdf 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -132,6 +132,8 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                                     # if processed by table model
                                     if span.get('latex', ''):
                                         para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get('html', ''):
+                                        para_text += f"\n\n{span['html']}\n\n"
                                     else:
                                         para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
@@ -256,6 +258,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
             if block['type'] == BlockType.TableBody:
                 if block["lines"][0]["spans"][0].get('latex', ''):
                     para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
+                elif block["lines"][0]["spans"][0].get('html', ''):
+                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)

magic_pdf/libs/Constants.py CHANGED Viewed

@@ -10,5 +10,31 @@ block维度自定义字段
 # block中lines是否被删除
 LINES_DELETED = "lines_deleted"
+# struct eqtable
+STRUCT_EQTABLE = "struct_eqtable"
 # table recognition max time default value
-TABLE_MAX_TIME_VALUE = 400
+TABLE_MAX_TIME_VALUE = 400
+# pp_table_result_max_length
+TABLE_MAX_LEN = 480
+# pp table structure algorithm
+TABLE_MASTER = "TableMaster"
+# table master structure dict
+TABLE_MASTER_DICT = "table_master_structure_dict.txt"
+# table master dir
+TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
+# pp detect model dir
+DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
+# pp rec model dir
+REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
+# pp rec char dict path
+REC_CHAR_DICT = "ppocr_keys_v1.txt"

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.7.0b1"
1	+ __version__ = "0.7.1"

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -562,8 +562,11 @@ class MagicModel:
                 elif category_id == 5:
                     # 获取table模型结果
                     latex = layout_det.get("latex", None)
+                    html = layout_det.get("html", None)
                     if latex:
                         span["latex"] = latex
+                    elif html:
+                        span["html"] = html
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -2,7 +2,7 @@ from loguru import logger
 import os
 import time
-from magic_pdf.libs.Constants import TABLE_MAX_TIME_VALUE
+from magic_pdf.libs.Constants import *
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
@@ -34,10 +34,18 @@ from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Pre
 from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
 from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
-def table_model_init(model_path, max_time, _device_='cpu'):
-    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+from magic_pdf.model.ppTableModel import ppTableModel
+def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
+    if table_model_type == STRUCT_EQTABLE:
+        table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    else:
+        config = {
+            "model_dir": model_path,
+            "device": _device_
+        }
+        table_model = ppTableModel(config)
     return table_model
@@ -104,9 +112,11 @@ class CustomPEKModel:
         # 初始化解析配置
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        # table config
         self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
         self.apply_table = self.table_config.get("is_table_recog_enable", False)
         self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
+        self.table_model_type = self.table_config.get("model", TABLE_MASTER)
         self.apply_ocr = ocr
         logger.info(
             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
@@ -141,10 +151,11 @@ class CustomPEKModel:
         if self.apply_ocr:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
-        # init structeqtable
+        # init table model
         if self.apply_table:
-            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
-                                                max_time = self.table_max_time, _device_=self.device)
+            table_model_dir = self.configs["weights"][self.table_model_type]
+            self.table_model = table_model_init(self.table_model_type, str(os.path.join(models_dir, table_model_dir)),
+                                                max_time=self.table_max_time, _device_=self.device)
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
@@ -278,16 +289,28 @@ class CustomPEKModel:
                 new_image, _ = crop_img(res, pil_img)
                 single_table_start_time = time.time()
                 logger.info("------------------table recognition processing begins-----------------")
+                latex_code = None
+                html_code = None
                 with torch.no_grad():
-                    latex_code = self.table_model.image2latex(new_image)[0]
+                    if self.table_model_type == STRUCT_EQTABLE:
+                        latex_code = self.table_model.image2latex(new_image)[0]
+                    else:
+                        html_code = self.table_model.img2html(new_image)
                 run_time = time.time() - single_table_start_time
                 logger.info(f"------------table recognition processing ends within {run_time}s-----")
                 if run_time > self.table_max_time:
                     logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
                 # 判断是否返回正常
-                expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
-                if latex_code and expected_ending:
-                    res["latex"] = latex_code
+                if latex_code:
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
+                        'end{table}')
+                    if expected_ending:
+                        res["latex"] = latex_code
+                    else:
+                        logger.warning(f"------------table recognition processing fails----------")
+                elif html_code:
+                    res["html"] = html_code
                 else:
                     logger.warning(f"------------table recognition processing fails----------")
             table_cost = round(time.time() - table_start, 2)

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py CHANGED Viewed

@@ -12,7 +12,6 @@ class StructTableModel:
             self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
     def image2latex(self, image) -> str:
-        #
         table_latex = self.model.forward(image)
         return table_latex

magic_pdf/model/ppTableModel.py ADDED Viewed

@@ -0,0 +1,67 @@
+from paddleocr.ppstructure.table.predict_table import TableSystem
+from paddleocr.ppstructure.utility import init_args
+from magic_pdf.libs.Constants import *
+import os
+from PIL import Image
+import numpy as np
+class ppTableModel(object):
+    """
+        This class is responsible for converting image of table into HTML format using a pre-trained model.
+        Attributes:
+        - table_sys: An instance of TableSystem initialized with parsed arguments.
+        Methods:
+        - __init__(config): Initializes the model with configuration parameters.
+        - img2html(image): Converts a PIL Image or NumPy array to HTML string.
+        - parse_args(**kwargs): Parses configuration arguments.
+    """
+    def __init__(self, config):
+        """
+        Parameters:
+        - config (dict): Configuration dictionary containing model_dir and device.
+        """
+        args = self.parse_args(**config)
+        self.table_sys = TableSystem(args)
+    def img2html(self, image):
+        """
+        Parameters:
+        - image (PIL.Image or np.ndarray): The image of the table to be converted.
+        Return:
+        - HTML (str): A string representing the HTML structure with content of the table.
+        """
+        if isinstance(image, Image.Image):
+            image = np.array(image)
+        pred_res, _ = self.table_sys(image)
+        pred_html = pred_res["html"]
+        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
+                                                                                               "") + "</table></td>\n"
+        return res
+    def parse_args(self, **kwargs):
+        parser = init_args()
+        model_dir = kwargs.get("model_dir")
+        table_model_dir = os.path.join(model_dir, TABLE_MASTER_DIR)
+        table_char_dict_path = os.path.join(model_dir, TABLE_MASTER_DICT)
+        det_model_dir = os.path.join(model_dir, DETECT_MODEL_DIR)
+        rec_model_dir = os.path.join(model_dir, REC_MODEL_DIR)
+        rec_char_dict_path = os.path.join(model_dir, REC_CHAR_DICT)
+        device = kwargs.get("device", "cpu")
+        use_gpu = True if device == "cuda" else False
+        config = {
+            "use_gpu": use_gpu,
+            "table_max_len": kwargs.get("table_max_len", TABLE_MAX_LEN),
+            "table_algorithm": TABLE_MASTER,
+            "table_model_dir": table_model_dir,
+            "table_char_dict_path": table_char_dict_path,
+            "det_model_dir": det_model_dir,
+            "rec_model_dir": rec_model_dir,
+            "rec_char_dict_path": rec_char_dict_path,
+        }
+        parser.set_defaults(**config)
+        return parser.parse_args([])

magic_pdf/para/para_split_v2.py CHANGED Viewed

@@ -100,59 +100,62 @@ def __detect_list_lines(lines, new_layout_bboxes, lang):
     if lang != 'en':
         return lines, None
-    else:
-        total_lines = len(lines)
-        line_fea_encode = []
-        """
-        对每一行进行特征编码，编码规则如下：
-        1. 如果行顶格，且大写字母开头或者数字开头，编码为1
-        2. 如果顶格，其他非大写开头编码为4
-        3. 如果非顶格，首字符大写，编码为2
-        4. 如果非顶格，首字符非大写编码为3
-        """
-        if len(lines) > 0:
-            x_map_tag_dict, min_x_tag = cluster_line_x(lines)
-        for l in lines:
-            span_text = __get_span_text(l['spans'][0])
-            first_char = span_text[0]
-            layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
-            if not layout:
-                line_fea_encode.append(0)
+    total_lines = len(lines)
+    line_fea_encode = []
+    """
+    对每一行进行特征编码，编码规则如下：
+    1. 如果行顶格，且大写字母开头或者数字开头，编码为1
+    2. 如果顶格，其他非大写开头编码为4
+    3. 如果非顶格，首字符大写，编码为2
+    4. 如果非顶格，首字符非大写编码为3
+    """
+    if len(lines) > 0:
+        x_map_tag_dict, min_x_tag = cluster_line_x(lines)
+    for l in lines:
+        span_text = __get_span_text(l['spans'][0])
+        if not span_text:
+            line_fea_encode.append(0)
+            continue
+        first_char = span_text[0]
+        layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
+        if not layout:
+            line_fea_encode.append(0)
+        else:
+            #
+            if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
+                # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
+                if not first_char.isalnum() or if_match_reference_list(span_text):
+                    line_fea_encode.append(1)
+                else:
+                    line_fea_encode.append(4)
             else:
-                #
-                if x_map_tag_dict[round(l['bbox'][0])] == min_x_tag:
-                    # if first_char.isupper() or first_char.isdigit() or not first_char.isalnum():
-                    if not first_char.isalnum() or if_match_reference_list(span_text):
-                        line_fea_encode.append(1)
-                    else:
-                        line_fea_encode.append(4)
+                if first_char.isupper():
+                    line_fea_encode.append(2)
                 else:
-                    if first_char.isupper():
-                        line_fea_encode.append(2)
-                    else:
-                        line_fea_encode.append(3)
+                    line_fea_encode.append(3)
-        # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
+    # 然后根据编码进行分段, 选出来 1,2,3连续出现至少2次的行，认为是列表。
-        list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
-        if len(list_indice) > 0:
+    list_indice, list_start_idx = find_repeating_patterns2(line_fea_encode)
+    if len(list_indice) > 0:
+        if debug_able:
+            logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
+    # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
+    segments = []
+    for start, end in list_indice:
+        for i in range(start, end + 1):
+            if i > 0:
+                if line_fea_encode[i] == 4:
+                    if debug_able:
+                        logger.info(f"列表行的第{i}行不是顶格的")
+                    break
+        else:
             if debug_able:
-                logger.info(f"发现了列表，列表行数：{list_indice}， {list_start_idx}")
-        # TODO check一下这个特列表里缩进的行左侧是不是对齐的。
-        segments = []
-        for start, end in list_indice:
-            for i in range(start, end + 1):
-                if i > 0:
-                    if line_fea_encode[i] == 4:
-                        if debug_able:
-                            logger.info(f"列表行的第{i}行不是顶格的")
-                        break
-            else:
-                if debug_able:
-                    logger.info(f"列表行的第{start}到第{end}行是列表")
+                logger.info(f"列表行的第{start}到第{end}行是列表")
-        return split_indices(total_lines, list_indice), list_start_idx
+    return split_indices(total_lines, list_indice), list_start_idx
 def cluster_line_x(lines: list) -> dict:

magic_pdf/resources/model_config/model_configs.yaml CHANGED Viewed

@@ -3,6 +3,7 @@ config:
   layout: True
   formula: True
   table_config:
+    model: TableMaster
     is_table_recog_enable: False
     max_time: 400
@@ -10,4 +11,5 @@ weights:
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
   mfr: MFR/UniMERNet
-  table: TabRec/StructEqTable
+  struct_eqtable: TabRec/StructEqTable
+  TableMaster: TabRec/TableMaster

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.7.0b1
+Version: 0.7.1
 Summary: A practical tool for converting PDF to Markdown
 Home-page: https://github.com/opendatalab/MinerU
 Requires-Python: >=3.9
@@ -64,6 +64,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </div>
 # Changelog
+- 2024/08/30: Version 0.7.1 released, add paddle tablemaster table recognition option
 - 2024/08/09: Version 0.7.0b1 released, simplified installation process, added table recognition functionality
 - 2024/08/01: Version 0.6.2b1 released, optimized dependency conflict issues and installation documentation
 - 2024/07/05: Initial open-source release
@@ -205,7 +206,7 @@ In non-mainline environments, due to the diversity of hardware and software conf
 ```bash
 conda create -n MinerU python=3.10
 conda activate MinerU
-pip install magic-pdf[full]==0.7.0b1 --extra-index-url https://wheels.myhloli.com
+pip install -U magic-pdf[full] --extra-index-url https://wheels.myhloli.com
 ```
 #### 2. Download model weight files
@@ -234,6 +235,7 @@ Find the `magic-pdf.json` file in your user directory and configure the "models-
   // other config
   "models-dir": "D:/models",
   "table-config": {
+        "model": "TableMaster", // Another option of this value is 'struct_eqtable'
         "is_table_recog_enable": false, // Table recognition is disabled by default, modify this value to enable it
         "max_time": 400
     }
@@ -345,13 +347,7 @@ TODO
 - Comic books, art books, elementary school textbooks, and exercise books are not well-parsed yet
 - Enabling OCR may produce better results in PDFs with a high density of formulas
 - If you are processing PDFs with a large number of formulas, it is strongly recommended to enable the OCR function. When using PyMuPDF to extract text, overlapping text lines can occur, leading to inaccurate formula insertion positions.
-- **Table Recognition** is currently in the testing phase; recognition speed is slow, and accuracy needs improvement. Below are some performance test results in an Ubuntu 22.04 LTS + Intel(R) Xeon(R) Platinum 8352V CPU @ 2.10GHz + NVIDIA GeForce RTX 4090 environment for reference.
-| Table Size     | Parsing Time        |
-|---------------|----------------------------|
-| 6\*5 55kb     | 37s                   |
-| 16\*12 284kb  | 3m18s                 |
-| 44\*7 559kb   | 4m12s                 |
 # FAQ
 [FAQ in Chinese](docs/FAQ_zh_cn.md)

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/RECORD RENAMED Viewed

@@ -5,7 +5,7 @@ magic_pdf/pdf_parse_union_core.py,sha256=jNly6l9pGcCf7wr6s6PgQhITJZ1m9PaI32Q26zx
 magic_pdf/user_api.py,sha256=CVQH-VSiZpz0bSkyMT4czk1epZriIPSJsLsPbluPa9Q,3054
 magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
-magic_pdf/dict2md/ocr_mkcontent.py,sha256=mq6tACGkL383bdUla7xOkRXRTBBydRtdbgIHwkk_daM,16169
+magic_pdf/dict2md/ocr_mkcontent.py,sha256=jg_v2Bj62xBObg0LDayvqUVX_O9DrIBli5Z9_i7Qduw,16479
 magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
 magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -15,7 +15,7 @@ magic_pdf/layout/layout_det_utils.py,sha256=NCYBTvsrULE3Cue53aMD1MfXTmOL9Xy0nivl
 magic_pdf/layout/layout_sort.py,sha256=ovqRX1xcRA7E7s8VvsI7ZNbaNSElJe07bApCh5hxwIE,33533
 magic_pdf/layout/layout_spiler_recog.py,sha256=QjBSgB-a7J2yjUR1eaCs9ZD7URtiRnV6W934hpAeuC4,3067
 magic_pdf/layout/mcol_sort.py,sha256=ADnLisBJBHXDKYChcf2lzTb_TC_vZ4q89_CSN8mwEJc,11331
-magic_pdf/libs/Constants.py,sha256=aKdTHeK75qkVvxvE_2EA5LYis6Z6HLmiuk9o8ESOnNg,260
+magic_pdf/libs/Constants.py,sha256=rdJVadmgN0UlIB-xcMQ9j7Qk9q1Qahxt3KEY-vL7hSU,774
 magic_pdf/libs/MakeContentConfig.py,sha256=UDZPpsv8q4DqTy8h0vRtrT2kHqWiVI205VnVhlUEQc0,206
 magic_pdf/libs/ModelBlockTypeEnum.py,sha256=kalXPbo5ya6hKhhBHPGlHl1yjWOURoXZWQM3rVUyPsY,164
 magic_pdf/libs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -41,13 +41,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
 magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
 magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
 magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
-magic_pdf/libs/version.py,sha256=95eHzU5LYX2l3ASu7OvUb95xo-2kfuwh1uUYnY54K90,24
+magic_pdf/libs/version.py,sha256=2KJZDSMOG7KS82AxYOrZ4ZihYxX0wjfUjDsIZh3L024,22
 magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
 magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
 magic_pdf/model/doc_analyze_by_custom_model.py,sha256=SoT21VHy6ICXoRfC9V3XS6BMiX8EZI6zaqSNgoE17oo,4347
-magic_pdf/model/magic_model.py,sha256=xwKV9BrdjOJecJSzbErT54N6qeJu0tvFuJg2S1z_2kU,25413
+magic_pdf/model/magic_model.py,sha256=3eAfmglKFkmIVPoz3TG8xAzkNK2g_VLI5rRMQAb_cK4,25544
 magic_pdf/model/model_list.py,sha256=AqxAtKGLDn7VVXWYwk0l9LnACxDLyU2jwOJ7vjPZj04,72
-magic_pdf/model/pdf_extract_kit.py,sha256=21vBy8p6pI5a0b6V45ul52yE8zD1R0xrjv4Tx8r9gaw,13620
+magic_pdf/model/pdf_extract_kit.py,sha256=WO54IoxX8XYXLGrjPts--84qRO1FQZm9f_yVyfpPi0s,14539
+magic_pdf/model/ppTableModel.py,sha256=wWiui9VOjkKYlNX-viPqsWpzgkNJ-9_S2Se-j4oyLqU,2687
 magic_pdf/model/pp_structure_v2.py,sha256=1sn8IJK0d5ZmqJ2XFt9FdaSdI0RQf-iwNAWBrVrIeuc,2872
 magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -72,7 +73,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
-magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=BdrBZ_2B2jgF0vzn_ted8bE9Te-DC1Ea2UijqULNKjg,928
+magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=C9WluPhwaqsFg154WsNxN2HlhFXVkAAw0prR7t8r5J4,918
 magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/para/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/para/block_continuation_processor.py,sha256=IkReB5hirjm0OAirNzQQpnWe2f2bdP3Hug3Ef8qTRDA,22749
@@ -84,7 +85,7 @@ magic_pdf/para/exceptions.py,sha256=kpjGxrSZ-drNmoKlmuQ0asTjI8cKKKWsdDDBoDHQP9M,
 magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG_mhmWd8nLwI,1554
 magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
 magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
-magic_pdf/para/para_split_v2.py,sha256=jGOhsubdh_CEgSv9WMNmp1loq1YNlpcAj3yh3g0gPhw,37027
+magic_pdf/para/para_split_v2.py,sha256=jJnn8numhxVgojGwKGCqBNIIYn2AYsucO-q-eQgsPb4,36911
 magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
 magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
 magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -127,7 +128,7 @@ magic_pdf/pre_proc/resolve_bbox_conflict.py,sha256=bJiegofPUeDyi--oZjfipQ5Q5RLm6
 magic_pdf/pre_proc/solve_line_alien.py,sha256=aNoQptPcC38Sm1I2ABhgw8jeH_5kjsRHx3VYlFFtm1g,853
 magic_pdf/pre_proc/statistics.py,sha256=_9jGlXq0iXd03UMxB92ZqCiu7cjNkG5vHvFlTF_9ytA,220
 magic_pdf/resources/fasttext-langdetect/lid.176.ftz,sha256=jzRyz-hzintgmejpmcPL-uDc0VaWqsfXc4qAOdtgPoM,938013
-magic_pdf/resources/model_config/model_configs.yaml,sha256=2MxCOJ5yNUupQqvrAvEuJKlygjxxV_o1qE64K_4NWKA,235
+magic_pdf/resources/model_config/model_configs.yaml,sha256=_gOSxK9jxe1bFwtH_uwovsyZnRi1sEVNYb1OAexDmF4,301
 magic_pdf/resources/model_config/UniMERNet/demo.yaml,sha256=al9_--m3n2j9zEn9OjlmmpfQbqVBAYFakXc_hY4vDXo,807
 magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml,sha256=9aNAEYgpHTAWpcUrDvuPG2y4V-Qw8QdcJefi96y8yDU,6109
 magic_pdf/rw/AbsReaderWriter.py,sha256=2H5SDJfAAOX9kPfel06a8VRCHxD1Y8aPbWEkQDdn9JM,452
@@ -140,9 +141,9 @@ magic_pdf/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/tools/cli.py,sha256=aVmurGAEyWT-MOv0MOaCRrfef1-jkRTpeVVWUsEVyeY,2157
 magic_pdf/tools/cli_dev.py,sha256=w-J4OixDzHjknnUuRW44PXsUlUqyiD4nPbBSSk9WkXM,4160
 magic_pdf/tools/common.py,sha256=XoSs19DD-4ubbjrDFQer83T9O6O_MmgEO61NbjlP_2M,3939
-magic_pdf-0.7.0b1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-magic_pdf-0.7.0b1.dist-info/METADATA,sha256=47QGAd2iGc0i1osA_jbBS1QT_Jrfmofoyetsrh9KRy8,18571
-magic_pdf-0.7.0b1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-magic_pdf-0.7.0b1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
-magic_pdf-0.7.0b1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
-magic_pdf-0.7.0b1.dist-info/RECORD,,
+magic_pdf-0.7.1.dist-info/LICENSE.md,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+magic_pdf-0.7.1.dist-info/METADATA,sha256=SD5oVg3vUEuFg7IyAbwncQ_mtgXljhKiJCOwRCTSOVo,18232
+magic_pdf-0.7.1.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+magic_pdf-0.7.1.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
+magic_pdf-0.7.1.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
+magic_pdf-0.7.1.dist-info/RECORD,,

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/WHEEL RENAMED Viewed

File without changes

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.7.1.dist-info}/top_level.txt RENAMED Viewed

File without changes

magic-pdf 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl

magic-pdf 0.7.0b1__py3-none-any.whl → 0.7.1__py3-none-any.whl