PyPI - magic-pdf - Versions diffs - 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl - Mend

magic-pdf: 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

magic_pdf/data/data_reader_writer/multi_bucket_s3.py +25 -19
magic_pdf/data/data_reader_writer/s3.py +6 -2
magic_pdf/data/io/__init__.py +6 -0
magic_pdf/data/io/base.py +1 -1
magic_pdf/data/schemas.py +4 -0
magic_pdf/dict2md/ocr_mkcontent.py +31 -9
magic_pdf/libs/version.py +1 -1
magic_pdf/model/pdf_extract_kit.py +12 -22
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +37 -20
magic_pdf/model/ppTableModel.py +6 -4
magic_pdf/para/para_split_v3.py +32 -6
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/METADATA +32 -27
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/RECORD +17 -17
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/WHEEL +0 -0
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/top_level.txt +0 -0

magic_pdf/data/data_reader_writer/multi_bucket_s3.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import os
 from magic_pdf.config.exceptions import InvalidConfig, InvalidParams
 from magic_pdf.data.data_reader_writer.base import DataReader, DataWriter
 from magic_pdf.data.io.s3 import S3Reader, S3Writer
@@ -7,30 +8,34 @@ from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
 class MultiS3Mixin:
-    def __init__(self, default_bucket: str, s3_configs: list[S3Config]):
+    def __init__(self, default_prefix: str, s3_configs: list[S3Config]):
         """Initialized with multiple s3 configs.
         Args:
-            default_bucket (str): the default bucket name of the relative path
+            default_prefix (str): the default prefix of the relative path. for example, {some_bucket}/{some_prefix} or {some_bucket}
             s3_configs (list[S3Config]): list of s3 configs, the bucket_name must be unique in the list.
         Raises:
-            InvalidConfig: default bucket config not in s3_configs
-            InvalidConfig: bucket name not unique in s3_configs
-            InvalidConfig: default bucket must be provided
+            InvalidConfig: default bucket config not in s3_configs.
+            InvalidConfig: bucket name not unique in s3_configs.
+            InvalidConfig: default bucket must be provided.
         """
-        if len(default_bucket) == 0:
-            raise InvalidConfig('default_bucket must be provided')
+        if len(default_prefix) == 0:
+            raise InvalidConfig('default_prefix must be provided')
+        arr = default_prefix.strip("/").split("/")
+        self.default_bucket = arr[0]
+        self.default_prefix = "/".join(arr[1:])
         found_default_bucket_config = False
         for conf in s3_configs:
-            if conf.bucket_name == default_bucket:
+            if conf.bucket_name == self.default_bucket:
                 found_default_bucket_config = True
                 break
         if not found_default_bucket_config:
             raise InvalidConfig(
-                f'default_bucket: {default_bucket} config must be provided in s3_configs: {s3_configs}'
+                f'default_bucket: {self.default_bucket} config must be provided in s3_configs: {s3_configs}'
             )
         uniq_bucket = set([conf.bucket_name for conf in s3_configs])
@@ -39,7 +44,6 @@ class MultiS3Mixin:
                 f'the bucket_name in s3_configs: {s3_configs} must be unique'
             )
-        self.default_bucket = default_bucket
         self.s3_configs = s3_configs
         self._s3_clients_h: dict = {}
@@ -47,14 +51,14 @@ class MultiS3Mixin:
 class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read(self, path: str) -> bytes:
         """Read the path from s3, select diffect bucket client for each request
-        based on the path, also support range read.
+        based on the bucket, also support range read.
         Args:
-            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit
-            for example: s3://bucket_name/path?0,100
+            path (str): the s3 path of file, the path must be in the format of s3://bucket_name/path?offset,limit.
+            for example: s3://bucket_name/path?0,100.
         Returns:
-            bytes: the content of s3 file
+            bytes: the content of s3 file.
         """
         may_range_params = parse_s3_range_params(path)
         if may_range_params is None or 2 != len(may_range_params):
@@ -84,21 +88,22 @@ class MultiBucketS3DataReader(DataReader, MultiS3Mixin):
     def read_at(self, path: str, offset: int = 0, limit: int = -1) -> bytes:
         """Read the file with offset and limit, select diffect bucket client
-        for each request based on the path.
+        for each request based on the bucket.
         Args:
-            path (str): the file path
+            path (str): the file path.
             offset (int, optional): the number of bytes skipped. Defaults to 0.
             limit (int, optional): the number of bytes want to read. Defaults to -1 which means infinite.
         Returns:
-            bytes: the file content
+            bytes: the file content.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_reader = self.__get_s3_client(bucket_name)
         else:
             s3_reader = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_reader.read_at(path, offset, limit)
@@ -123,15 +128,16 @@ class MultiBucketS3DataWriter(DataWriter, MultiS3Mixin):
     def write(self, path: str, data: bytes) -> None:
         """Write file with data, also select diffect bucket client for each
-        request based on the path.
+        request based on the bucket.
         Args:
             path (str): the path of file, if the path is relative path, it will be joined with parent_dir.
-            data (bytes): the data want to write
+            data (bytes): the data want to write.
         """
         if path.startswith('s3://'):
             bucket_name, path = parse_s3path(path)
             s3_writer = self.__get_s3_client(bucket_name)
         else:
             s3_writer = self.__get_s3_client(self.default_bucket)
+            path = os.path.join(self.default_prefix, path)
         return s3_writer.write(path, data)

magic_pdf/data/data_reader_writer/s3.py CHANGED Viewed

@@ -6,6 +6,7 @@ from magic_pdf.data.schemas import S3Config
 class S3DataReader(MultiBucketS3DataReader):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -15,6 +16,7 @@ class S3DataReader(MultiBucketS3DataReader):
         """s3 reader client.
         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -23,7 +25,7 @@ class S3DataReader(MultiBucketS3DataReader):
             refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,
@@ -39,6 +41,7 @@ class S3DataReader(MultiBucketS3DataReader):
 class S3DataWriter(MultiBucketS3DataWriter):
     def __init__(
         self,
+        default_prefix_without_bucket: str,
         bucket: str,
         ak: str,
         sk: str,
@@ -48,6 +51,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
         """s3 writer client.
         Args:
+            default_prefix_without_bucket: prefix that not contains bucket
             bucket (str): bucket name
             ak (str): access key
             sk (str): secret key
@@ -56,7 +60,7 @@ class S3DataWriter(MultiBucketS3DataWriter):
             refer to https://boto3.amazonaws.com/v1/documentation/api/1.9.42/guide/s3.html
         """
         super().__init__(
-            bucket,
+            f'{bucket}/{default_prefix_without_bucket}',
             [
                 S3Config(
                     bucket_name=bucket,

magic_pdf/data/io/__init__.py CHANGED Viewed

@@ -0,0 +1,6 @@
+from magic_pdf.data.io.base import IOReader, IOWriter  # noqa: F401
+from magic_pdf.data.io.http import HttpReader, HttpWriter  # noqa: F401
+from magic_pdf.data.io.s3 import S3Reader, S3Writer  # noqa: F401
+__all__ = ['IOReader', 'IOWriter', 'HttpReader', 'HttpWriter', 'S3Reader', 'S3Writer']

magic_pdf/data/io/base.py CHANGED Viewed

@@ -29,7 +29,7 @@ class IOReader(ABC):
         pass
-class IOWriter:
+class IOWriter(ABC):
     @abstractmethod
     def write(self, path: str, data: bytes) -> None:

magic_pdf/data/schemas.py CHANGED Viewed

@@ -3,6 +3,8 @@ from pydantic import BaseModel, Field
 class S3Config(BaseModel):
+    """S3 config
+    """
     bucket_name: str = Field(description='s3 bucket name', min_length=1)
     access_key: str = Field(description='s3 access key', min_length=1)
     secret_key: str = Field(description='s3 secret key', min_length=1)
@@ -11,5 +13,7 @@ class S3Config(BaseModel):
 class PageInfo(BaseModel):
+    """The width and height of page
+    """
     w: float = Field(description='the width of page')
     h: float = Field(description='the height of page')

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -119,6 +119,16 @@ def detect_language(text):
         return 'empty'
+# 连写字符拆分
+def __replace_ligatures(text: str):
+    text = re.sub(r'ﬁ', 'fi', text)  # 替换 fi 连写符
+    text = re.sub(r'ﬂ', 'fl', text)  # 替换 fl 连写符
+    text = re.sub(r'ﬀ', 'ff', text)  # 替换 ff 连写符
+    text = re.sub(r'ﬃ', 'ffi', text)  # 替换 ffi 连写符
+    text = re.sub(r'ﬄ', 'ffl', text)  # 替换 ffl 连写符
+    return text
 def merge_para_with_text(para_block):
     para_text = ''
     for i, line in enumerate(para_block['lines']):
@@ -141,22 +151,34 @@ def merge_para_with_text(para_block):
             if span_type == ContentType.Text:
                 content = ocr_escape_special_markdown_char(span['content'])
             elif span_type == ContentType.InlineEquation:
-                content = f" ${span['content']}$ "
+                content = f"${span['content']}$"
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
+            content = content.strip()
             if content != '':
                 langs = ['zh', 'ja', 'ko']
                 if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
-                elif line_lang == 'en':
-                    # 如果是前一行带有-连字符，那么末尾不应该加空格
-                    if __is_hyphen_at_line_end(content):
-                        para_text += content[:-1]
-                    else:
-                        para_text += content + ' '
+                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
+                        para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
+                    elif span_type == ContentType.InlineEquation:
+                        para_text += f" {content} "
                 else:
-                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                        # 如果是前一行带有-连字符，那么末尾不应该加空格
+                        if __is_hyphen_at_line_end(content):
+                            para_text += content[:-1]
+                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
+                            para_text += content
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f"{content} "
+                    elif span_type == ContentType.InterlineEquation:
+                        para_text += content
+            else:
+                continue
+    # 连写字符拆分
+    para_text = __replace_ligatures(para_text)
     return para_text

magic_pdf/libs/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "0.9.0"
+__version__ = "0.9.2"

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -38,15 +38,13 @@ except ImportError as e:
 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.pek_sub_modules.post_process import latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
-# from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
+from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
 from magic_pdf.model.ppTableModel import ppTableModel
 def table_model_init(table_model_type, model_path, max_time, _device_='cpu'):
     if table_model_type == MODEL_NAME.STRUCT_EQTABLE:
-        # table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
-        logger.error("StructEqTable is under upgrade, the current version does not support it.")
-        exit(1)
+        table_model = StructTableModel(model_path, max_time=max_time)
     elif table_model_type == MODEL_NAME.TABLE_MASTER:
         config = {
             "model_dir": model_path,
@@ -284,8 +282,6 @@ class CustomPEKModel:
             )
         # 初始化ocr
         if self.apply_ocr:
-            # self.ocr_model = ModifiedPaddleOCR(show_log=show_log, det_db_box_thresh=0.3)
             self.ocr_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.OCR,
                 ocr_show_log=show_log,
@@ -303,17 +299,6 @@ class CustomPEKModel:
                 device=self.device
             )
-            home_directory = Path.home()
-            det_source = os.path.join(models_dir, table_model_dir, DETECT_MODEL_DIR)
-            rec_source = os.path.join(models_dir, table_model_dir, REC_MODEL_DIR)
-            det_dest_dir = os.path.join(home_directory, PP_DET_DIRECTORY)
-            rec_dest_dir = os.path.join(home_directory, PP_REC_DIRECTORY)
-            if not os.path.exists(det_dest_dir):
-                shutil.copytree(det_source, det_dest_dir)
-            if not os.path.exists(rec_dest_dir):
-                shutil.copytree(rec_source, rec_dest_dir)
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
@@ -393,7 +378,7 @@ class CustomPEKModel:
             elif int(res['category_id']) in [5]:
                 table_res_list.append(res)
-        if torch.cuda.is_available():
+        if torch.cuda.is_available() and self.device != 'cpu':
             properties = torch.cuda.get_device_properties(self.device)
             total_memory = properties.total_memory / (1024 ** 3)  # 将字节转换为 GB
             if total_memory <= 10:
@@ -463,7 +448,9 @@ class CustomPEKModel:
                 html_code = None
                 if self.table_model_name == MODEL_NAME.STRUCT_EQTABLE:
                     with torch.no_grad():
-                        latex_code = self.table_model.image2latex(new_image)[0]
+                        table_result = self.table_model.predict(new_image, "html")
+                        if len(table_result) > 0:
+                            html_code = table_result[0]
                 else:
                     html_code = self.table_model.img2html(new_image)
@@ -474,14 +461,17 @@ class CustomPEKModel:
                 # 判断是否返回正常
                 if latex_code:
-                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith(
-                        'end{table}')
+                    expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
                     if expected_ending:
                         res["latex"] = latex_code
                     else:
                         logger.warning(f"table recognition processing fails, not found expected LaTeX table end")
                 elif html_code:
-                    res["html"] = html_code
+                    expected_ending = html_code.strip().endswith('</html>') or html_code.strip().endswith('</table>')
+                    if expected_ending:
+                        res["html"] = html_code
+                    else:
+                        logger.warning(f"table recognition processing fails, not found expected HTML table end")
                 else:
                     logger.warning(f"table recognition processing fails, not get latex or html return")
             logger.info(f"table time: {round(time.time() - table_start, 2)}")

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py CHANGED Viewed

@@ -1,28 +1,45 @@
-from loguru import logger
+import re
-try:
-    from struct_eqtable.model import StructTable
-except ImportError:
-    logger.error("StructEqTable is under upgrade, the current version does not support it.")
-from pypandoc import convert_text
+import torch
+from struct_eqtable import build_model
 class StructTableModel:
-    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
+    def __init__(self, model_path, max_new_tokens=1024, max_time=60):
         # init
-        self.model_path = model_path
-        self.max_new_tokens = max_new_tokens # maximum output tokens length
-        self.max_time = max_time # timeout for processing in seconds
-        if device == 'cuda':
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
+        assert torch.cuda.is_available(), "CUDA must be available for StructEqTable model."
+        self.model = build_model(
+            model_ckpt=model_path,
+            max_new_tokens=max_new_tokens,
+            max_time=max_time,
+            lmdeploy=False,
+            flash_attn=False,
+            batch_size=1,
+        ).cuda()
+        self.default_format = "html"
+    def predict(self, images, output_format=None, **kwargs):
+        if output_format is None:
+            output_format = self.default_format
         else:
-            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
+            if output_format not in ['latex', 'markdown', 'html']:
+                raise ValueError(f"Output format {output_format} is not supported.")
+        results = self.model(
+            images, output_format=output_format
+        )
+        if output_format == "html":
+            results = [self.minify_html(html) for html in results]
-    def image2latex(self, image) -> str:
-        table_latex = self.model.forward(image)
-        return table_latex
+        return results
-    def image2html(self, image) -> str:
-        table_latex = self.image2latex(image)
-        table_html = convert_text(table_latex, 'html', format='latex')
-        return table_html
+    def minify_html(self, html):
+        # 移除多余的空白字符
+        html = re.sub(r'\s+', ' ', html)
+        # 移除行尾的空白字符
+        html = re.sub(r'\s*>\s*', '>', html)
+        # 移除标签前的空白字符
+        html = re.sub(r'\s*<\s*', '<', html)
+        return html.strip()

magic_pdf/model/ppTableModel.py CHANGED Viewed

@@ -1,3 +1,4 @@
+import cv2
 from paddleocr.ppstructure.table.predict_table import TableSystem
 from paddleocr.ppstructure.utility import init_args
 from magic_pdf.libs.Constants import *
@@ -36,12 +37,13 @@ class ppTableModel(object):
         - HTML (str): A string representing the HTML structure with content of the table.
         """
         if isinstance(image, Image.Image):
-            image = np.array(image)
+            image = np.asarray(image)
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
         pred_res, _ = self.table_sys(image)
         pred_html = pred_res["html"]
-        res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace("</table></body></html>",
-                                                                                               "") + "</table></td>\n"
-        return res
+        # res = '<td><table  border="1">' + pred_html.replace("<html><body><table>", "").replace(
+        # "</table></body></html>","") + "</table></td>\n"
+        return pred_html
     def parse_args(self, **kwargs):
         parser = init_args()

magic_pdf/para/para_split_v3.py CHANGED Viewed

@@ -63,15 +63,18 @@ def __is_list_or_index_block(block):
         first_line = block['lines'][0]
         line_height = first_line['bbox'][3] - first_line['bbox'][1]
         block_weight = block['bbox_fs'][2] - block['bbox_fs'][0]
+        block_height = block['bbox_fs'][3] - block['bbox_fs'][1]
         left_close_num = 0
         left_not_close_num = 0
         right_not_close_num = 0
         right_close_num = 0
         lines_text_list = []
+        center_close_num = 0
+        external_sides_not_close_num = 0
         multiple_para_flag = False
         last_line = block['lines'][-1]
         # 如果首行左边不顶格而右边顶格,末行左边顶格而右边不顶格 （第一行可能可以右边不顶格）
         if (first_line['bbox'][0] - block['bbox_fs'][0] > line_height / 2 and
                 # block['bbox_fs'][2] - first_line['bbox'][2] < line_height and
@@ -82,6 +85,16 @@ def __is_list_or_index_block(block):
         for line in block['lines']:
+            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
+            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
+            if (
+                    line['bbox'][0] - block['bbox_fs'][0] > 0.8 * line_height and
+                    block['bbox_fs'][2] - line['bbox'][2] > 0.8 * line_height
+            ):
+                external_sides_not_close_num += 1
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
+                center_close_num += 1
             line_text = ""
             for span in line['spans']:
@@ -103,7 +116,7 @@ def __is_list_or_index_block(block):
                 right_close_num += 1
             else:
                 # 右侧不顶格情况下是否有一段距离，拍脑袋用0.3block宽度做阈值
-                closed_area = 0.3 * block_weight
+                closed_area = 0.26 * block_weight
                 # closed_area = 5 * line_height
                 if block['bbox_fs'][2] - line['bbox'][2] > closed_area:
                     right_not_close_num += 1
@@ -132,17 +145,29 @@ def __is_list_or_index_block(block):
                 line_num_flag = True
         # 有的目录右侧不贴边, 目前认为左边或者右边有一边全贴边，且符合数字规则极为index
-        if ((left_close_num/len(block['lines']) >= 0.8 or right_close_num/len(block['lines']) >= 0.8)
+        if ((left_close_num / len(block['lines']) >= 0.8 or right_close_num / len(block['lines']) >= 0.8)
                 and line_num_flag
         ):
             for line in block['lines']:
                 line[ListLineTag.IS_LIST_START_LINE] = True
             return BlockType.Index
+        # 全部line都居中的特殊list识别，每行都需要换行，特征是多行，且大多数行都前后not_close,每line中点x坐标接近
+        # 补充条件block的长宽比有要求
+        elif (
+                external_sides_not_close_num >= 2 and
+                center_close_num == len(block['lines']) and
+                external_sides_not_close_num / len(block['lines']) >= 0.5 and
+                block_height / block_weight > 0.4
+        ):
+            for line in block['lines']:
+                line[ListLineTag.IS_LIST_START_LINE] = True
+            return BlockType.List
         elif left_close_num >= 2 and (
                 right_not_close_num >= 2 or line_end_flag or left_not_close_num >= 2) and not multiple_para_flag:
             # 处理一种特殊的没有缩进的list，所有行都贴左边，通过右边的空隙判断是否是item尾
-            if left_close_num / len(block['lines']) > 0.9:
+            if left_close_num / len(block['lines']) > 0.8:
                 # 这种是每个item只有一行，且左边都贴边的短item list
                 if flag_end_count == 0 and right_close_num / len(block['lines']) < 0.5:
                     for line in block['lines']:
@@ -154,7 +179,7 @@ def __is_list_or_index_block(block):
                         if lines_text_list[i][-1] in LIST_END_FLAG:
                             line[ListLineTag.IS_LIST_END_LINE] = True
                             if i + 1 < len(block['lines']):
-                                block['lines'][i+1][ListLineTag.IS_LIST_START_LINE] = True
+                                block['lines'][i + 1][ListLineTag.IS_LIST_START_LINE] = True
                 # line item基本没有结束标识符，而且也没有缩进，按右侧空隙判断哪些是item end
                 else:
                     line_start_flag = False
@@ -162,7 +187,8 @@ def __is_list_or_index_block(block):
                         if line_start_flag:
                             line[ListLineTag.IS_LIST_START_LINE] = True
                             line_start_flag = False
-                        elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        # elif abs(block['bbox_fs'][2] - line['bbox'][2]) > line_height:
+                        if abs(block['bbox_fs'][2] - line['bbox'][2]) > 0.1 * block_weight:
                             line[ListLineTag.IS_LIST_END_LINE] = True
                             line_start_flag = True
             # 一种有缩进的特殊有序list,start line 左侧不贴边且以数字开头，end line 以 IS_LIST_END_LINE 结尾且数量和start line 一致

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: magic-pdf
-Version: 0.9.0
+Version: 0.9.2
 Summary: A practical tool for converting PDF to Markdown
 Home-page: https://github.com/opendatalab/MinerU
 Requires-Python: >=3.9
@@ -22,8 +22,9 @@ Provides-Extra: full
 Requires-Dist: unimernet==0.2.1; extra == "full"
 Requires-Dist: ultralytics; extra == "full"
 Requires-Dist: paddleocr==2.7.3; extra == "full"
-Requires-Dist: pypandoc; extra == "full"
-Requires-Dist: struct-eqtable==0.1.0; extra == "full"
+Requires-Dist: struct-eqtable==0.3.2; extra == "full"
+Requires-Dist: einops; extra == "full"
+Requires-Dist: accelerate; extra == "full"
 Requires-Dist: doclayout-yolo==0.0.2; extra == "full"
 Requires-Dist: detectron2; extra == "full"
 Requires-Dist: paddlepaddle==3.0.0b1; platform_system == "Linux" and extra == "full"
@@ -54,8 +55,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 [![OpenDataLab](https://img.shields.io/badge/Demo_on_OpenDataLab-blue?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMzAiIGhlaWdodD0iMzAiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgZmlsbD0ibm9uZSI+CiA8ZGVmcz4KICA8bGluZWFyR3JhZGllbnQgeTI9IjAuNTMzNjciIHgyPSIxLjAwMDQiIHkxPSIwLjI5MjE5IiB4MT0iLTAuMTEyNjgiIGlkPSJhIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogIDxsaW5lYXJHcmFkaWVudCB5Mj0iMC41OTc1NyIgeDI9IjEuMDExMzciIHkxPSIwLjExMDIzIiB4MT0iLTAuMDg0NzQiIGlkPSJiIj4KICAgPHN0b3Agc3RvcC1jb2xvcj0iIzE1NDNGRSIvPgogICA8c3RvcCBzdG9wLWNvbG9yPSIjOEM0NkZGIiBvZmZzZXQ9IjEiLz4KICA8L2xpbmVhckdyYWRpZW50PgogPC9kZWZzPgogPGc+CiAgPHRpdGxlPkxheWVyIDE8L3RpdGxlPgogIDxwYXRoIGlkPSJzdmdfMSIgZmlsbD0idXJsKCNhKSIgZD0ibTEuNjIzLDEyLjA2N2EwLjQ4NCwwLjQ4NCAwIDAgMSAwLjA3LC0wLjM4NGw1LjMxLC03Ljg5NWMwLjA2OCwtMC4xIDAuMTcsLTAuMTcyIDAuMjg4LC0wLjJsMTQuMzc3LC0zLjQ3NGEwLjQ4NCwwLjQ4NCAwIDAgMSAwLjU4NCwwLjM1N2wzLjY2MiwxNS4xNTJjMS40NzcsNi4xMTQgLTIuMjgxLDEyLjI2NyAtOC4zOTQsMTMuNzQ1Yy02LjExNCwxLjQ3NyAtMTIuMjY3LC0yLjI4MSAtMTMuNzQ1LC04LjM5NWwtMi4xNTIsLTguOTA2eiIgb3BhY2l0eT0iMC40Ii8+CiAgPHBhdGggaWQ9InN2Z18yIiBmaWxsPSJ1cmwoI2IpIiBkPSJtNS44MjYsOC42NzNjMCwtMC4xMzYgMC4wNTcsLTAuMjY2IDAuMTU3LC0wLjM1OGw3LjAxNywtNi40MjVhMC40ODQsMC40ODQgMCAwIDEgMC4zMjcsLTAuMTI3bDE0Ljc5LDBjMC4yNjgsMCAwLjQ4NSwwLjIxNiAwLjQ4NSwwLjQ4NGwwLDE1LjU4OWMwLDYuMjkgLTUuMDk5LDExLjM4OCAtMTEuMzg4LDExLjM4OGMtNi4yOSwwIC0xMS4zODgsLTUuMDk5IC0xMS4zODgsLTExLjM4OGwwLC05LjE2M3oiLz4KICA8cGF0aCBpZD0ic3ZnXzMiIGZpbGw9IiM1RDc2RkYiIGQ9Im0xMi4zMzEsOC43NTNsLTYuMzgzLC0wLjM5OGw3LjEyMiwtNi41MmwwLjI5OSw1Ljg5MmEwLjk3OCwwLjk3OCAwIDAgMSAtMS4wMzgsMS4wMjZ6Ii8+CiAgPHBhdGggaWQ9InN2Z180IiBmaWxsPSIjMDAyOEZEIiBkPSJtMjAuNDE2LDE1LjAyMmwwLDEuNzExYTIuNDA0LDIuNDA0IDAgMCAxIC00LjgwOCwwbDAsLTQuMjc4bC0yLjgxLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDEgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEyLDB6IiBjbGlwLXJ1bGU9ImV2ZW5vZGQiIGZpbGwtcnVsZT0iZXZlbm9kZCIvPgogIDxwYXRoIGlkPSJzdmdfNSIgZmlsbD0iIzAwMjhGRCIgZD0ibTIzLjIyOCwxMy44ODFsMS4xNC
wwbDAsMS4xNDFsLTEuMTQsMGwwLC0xLjE0bDAsLTAuMDAxem0tMi44MTIsLTAuNjkybDEuODM0LDBsMCwxLjgzM2wtMS44MzQsMGwwLC0xLjgzMmwwLC0wLjAwMXptMS44MzQsLTAuOTc5bDAuOTc4LDBsMCwwLjk3OWwtMC45NzgsMGwwLC0wLjk3OGwwLC0wLjAwMXptMS41NDgsLTEuNjI5bDAuNjExLDBsMCwwLjYxMWwtMC42MTEsMGwwLC0wLjYxMXoiLz4KICA8cGF0aCBpZD0ic3ZnXzYiIGZpbGw9IiNmZmYiIGQ9Im0yMC4wODYsMTQuOTEybDAsMS43MTFhMi40MDQsMi40MDQgMCAxIDEgLTQuODA3LDBsMCwtNC4yNzhsLTIuODEyLDBsMCw0LjY4NmE1LjIxNSw1LjIxNSAwIDAgMCAxMC40MywwbDAsLTQuNjg2bDAsMi41NjdsLTIuODEsMGwtMC4wMDEsMHoiIGNsaXAtcnVsZT0iZXZlbm9kZCIgZmlsbC1ydWxlPSJldmVub2RkIi8+CiAgPHBhdGggaWQ9InN2Z183IiBmaWxsPSIjZmZmIiBkPSJtMjIuODk4LDEzLjc3MWwxLjE0LDBsMCwxLjE0MWwtMS4xNCwwbDAsLTEuMTRsMCwtMC4wMDF6bS0yLjgxMiwtMC42OTJsMS44MzQsMGwwLDEuODMzbC0xLjgzNCwwbDAsLTEuODMybDAsLTAuMDAxem0xLjgzNCwtMC45NzlsMC45NzgsMGwwLDAuOTc5bC0wLjk3OCwwbDAsLTAuOTc5em0xLjU0OCwtMS42MjlsMC42MTEsMGwwLDAuNjExbC0wLjYxLDBsMCwtMC42MWwtMC4wMDEsLTAuMDAxeiIvPgogPC9nPgo8L3N2Zz4=&labelColor=white)](https://opendatalab.com/OpenSourceTools/Extractor/PDF)
 [![HuggingFace](https://img.shields.io/badge/Demo_on_HuggingFace-yellow.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAF8AAABYCAMAAACkl9t/AAAAk1BMVEVHcEz/nQv/nQv/nQr/nQv/nQr/nQv/nQv/nQr/wRf/txT/pg7/yRr/rBD/zRz/ngv/oAz/zhz/nwv/txT/ngv/0B3+zBz/nQv/0h7/wxn/vRb/thXkuiT/rxH/pxD/ogzcqyf/nQvTlSz/czCxky7/SjifdjT/Mj3+Mj3wMj15aTnDNz+DSD9RTUBsP0FRO0Q6O0WyIxEIAAAAGHRSTlMADB8zSWF3krDDw8TJ1NbX5efv8ff9/fxKDJ9uAAAGKklEQVR42u2Z63qjOAyGC4RwCOfB2JAGqrSb2WnTw/1f3UaWcSGYNKTdf/P+mOkTrE+yJBulvfvLT2A5ruenaVHyIks33npl/6C4s/ZLAM45SOi/1FtZPyFur1OYofBX3w7d54Bxm+E8db+nDr12ttmESZ4zludJEG5S7TO72YPlKZFyE+YCYUJTBZsMiNS5Sd7NlDmKM2Eg2JQg8awbglfqgbhArjxkS7dgp2RH6hc9AMLdZYUtZN5DJr4molC8BfKrEkPKEnEVjLbgW1fLy77ZVOJagoIcLIl+IxaQZGjiX597HopF5CkaXVMDO9Pyix3AFV3kw4lQLCbHuMovz8FallbcQIJ5Ta0vks9RnolbCK84BtjKRS5uA43hYoZcOBGIG2Epbv6CvFVQ8m8loh66WNySsnN7htL58LNp+NXT8/PhXiBXPMjLSxtwp8W9f/1AngRierBkA+kk/IpUSOeKByzn8y3kAAAfh//0oXgV4roHm/kz4E2z//zRc3/lgwBzbM2mJxQEa5pqgX7d1L0htrhx7LKxOZlKbwcAWyEOWqYSI8YPtgDQVjpB5nvaHaSnBaQSD6hweDi8PosxD6/PT09YY3xQA7LTCTKfYX+QHpA0GCcqmEHvr/cyfKQTEuwgbs2kPxJEB0iNjfJcCTPyocx+A0griHSmADiC91oNGVwJ69RudYe65vJmoqfpul0lrqXadW0jFKH5BKwAeCq+Den7s+3zfRJzA61/Uj/9H/VzLKTx9jFPPdXeeP+L7WEvDLAKAIoF8bPTKT0+TM7W8ePj3Rz/Yn3kOAp2f1Kf0Weony7pn/cPydvhQYV+eFOfmOu7VB/ViPe34/EN3RFHY/yRuT8ddCtMPH/McBAT5s+vRde/gf2c/sPsjLK+m5IBQF5tO+h2tTlBGnP6693JdsvofjOPnnEHkh2TnV/X1fBl9S5zrwuwF8NFrAVJVwCAPTe8gaJlomqlp0pv4Pjn98tJ/t/fL++6unpR1YGC2n/KCoa0tTLoKiEeUPDl94nj+5/Tv3/eT5vBQ60X1S0oZr+IWRR8Ldhu7AlLjPISlJcO9vrFotky9SpzDequlwEir5beYAc0R7D9KS1DXva0jhYRDXoExPdc6yw5GShkZXe9QdO/uOvHofxjrV/TNS6iMJS+4TcSTgk9n5agJdBQbB//IfF/HpvPt3Tbi7b6I6K0R72p6ajryEJrENW2bbeVUGjfgoals4L443c7BEE4mJO2SpbRngxQrAKRudRzGQ8jVOL2qDVjjI8K1gc3TIJ5KiFZ1q+gdsARPB4NQS4AjwVSt72DSoXNyOWUrU5mQ9nRYyjp89Xo7oRI6Bga9QNT1mQ/ptaJq5T/7WcgAZywR/XlPGAUDdet3LE+qS0TI+g+aJU8MIqjo0Kx8Ly+maxLjJmjQ18rA0YCkxLQbUZP1WqdmyQGJLUm7VnQFqodmXSqmRrdVpqdzk5LvmvgtEcW8PMGdaS23EOWyDVbACZzUJPaqMbjDxpA3Qrgl0AikimGDbqmyT8P8NOYiqrldF8rX+YN7TopX4UoHuSCYY7cgX4gHwclQKl1zhx0THf+tCAUValz
jI7Wg9EhptrkIcfIJjA94evOn8B2eHaVzvBrnl2ig0So6hvPaz0IGcOvTHvUIlE2+prqAxLSQxZlU2stql1NqCCLdIiIN/i1DBEHUoElM9dBravbiAnKqgpi4IBkw+utSPIoBijDXJipSVV7MpOEJUAc5Qmm3BnUN+w3hteEieYKfRZSIUcXKMVf0u5wD4EwsUNVvZOtUT7A2GkffHjByWpHqvRBYrTV72a6j8zZ6W0DTE86Hn04bmyWX3Ri9WH7ZU6Q7h+ZHo0nHUAcsQvVhXRDZHChwiyi/hnPuOsSEF6Exk3o6Y9DT1eZ+6cASXk2Y9k+6EOQMDGm6WBK10wOQJCBwren86cPPWUcRAnTVjGcU1LBgs9FURiX/e6479yZcLwCBmTxiawEwrOcleuu12t3tbLv/N4RLYIBhYexm7Fcn4OJcn0+zc+s8/VfPeddZHAGN6TT8eGczHdR/Gts1/MzDkThr23zqrVfAMFT33Nx1RJsx1k5zuWILLnG/vsH+Fv5D4NTVcp1Gzo8AAAAAElFTkSuQmCC&labelColor=white)](https://huggingface.co/spaces/opendatalab/MinerU)
 [![ModelScope](https://img.shields.io/badge/Demo_on_ModelScope-purple?logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjIzIiBoZWlnaHQ9IjIwMCIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KCiA8Zz4KICA8dGl0bGU+TGF5ZXIgMTwvdGl0bGU+CiAgPHBhdGggaWQ9InN2Z18xNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTAsODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTUiIGZpbGw9IiM2MjRhZmYiIGQ9Im05OS4xNCwxMTUuNDlsMjUuNjUsMGwwLDI1LjY1bC0yNS42NSwwbDAsLTI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTYiIGZpbGw9IiM2MjRhZmYiIGQ9Im0xNzYuMDksMTQxLjE0bC0yNS42NDk5OSwwbDAsMjIuMTlsNDcuODQsMGwwLC00Ny44NGwtMjIuMTksMGwwLDI1LjY1eiIvPgogIDxwYXRoIGlkPSJzdmdfMTciIGZpbGw9IiMzNmNmZDEiIGQ9Im0xMjQuNzksODkuODRsMjUuNjUsMGwwLDI1LjY0OTk5bC0yNS42NSwwbDAsLTI1LjY0OTk5eiIvPgogIDxwYXRoIGlkPSJzdmdfMTgiIGZpbGw9IiMzNmNmZDEiIGQ9Im0wLDY0LjE5bDI1LjY1LDBsMCwyNS42NWwtMjUuNjUsMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzE5IiBmaWxsPSIjNjI0YWZmIiBkPSJtMTk4LjI4LDg5Ljg0bDI1LjY0OTk5LDBsMCwyNS42NDk5OWwtMjUuNjQ5OTksMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIwIiBmaWxsPSIjMzZjZmQxIiBkPSJtMTk4LjI4LDY0LjE5bDI1LjY0OTk5LDBsMCwyNS42NWwtMjUuNjQ5OTksMGwwLC0yNS42NXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIxIiBmaWxsPSIjNjI0YWZmIiBkPSJtMTUwLjQ0LDQybDAsMjIuMTlsMjUuNjQ5OTksMGwwLDI1LjY1bDIyLjE5LDBsMCwtNDcuODRsLTQ3Ljg0LDB6Ii8+CiAgPHBhdGggaWQ9InN2Z18yMiIgZmlsbD0iIzM2Y2ZkMSIgZD0ibTczLjQ5LDg5Ljg0bDI1LjY1LDBsMCwyNS42NDk5OWwtMjUuNjUsMGwwLC0yNS42NDk5OXoiLz4KICA8cGF0aCBpZD0ic3ZnXzIzIiBmaWxsPSIjNjI0YWZmIiBkPSJtNDcuODQsNjQuMTlsMjUuNjUsMGwwLC0yMi4xOWwtNDcuODQsMGwwLDQ3Ljg0bDIyLjE5LDBsMCwtMjUuNjV6Ii8+CiAgPHBhdGggaWQ9InN2Z18yNCIgZmlsbD0iIzYyNGFmZiIgZD0ibTQ3Ljg0LDExNS40OWwtMjIuMTksMGwwLDQ3Ljg0bDQ3Ljg0LDBsMCwtMjIuMTlsLTI1LjY1LDBsMCwtMjUuNjV6Ii8+CiA8L2c+Cjwvc3ZnPg==&labelColor=white)](https://www.modelscope.cn/studios/OpenDataLab/MinerU)
-[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/papayalove/b5f4913389e7ff9883c6b687de156e78/mineru_demo.ipynb)
+[![Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/myhloli/3b3a00a4a0a61577b6c30f989092d20d/mineru_demo.ipynb)
 [![Paper](https://img.shields.io/badge/Paper-arXiv-green)](https://arxiv.org/abs/2409.18839)
@@ -80,6 +80,7 @@ Requires-Dist: paddlepaddle==2.6.1; (platform_system == "Windows" or platform_sy
 </div>
 # Changelog
+- 2024/11/06 0.9.2 released. Integrated the [StructTable-InternVL2-1B](https://huggingface.co/U4R/StructTable-InternVL2-1B) model for table recognition functionality.
 - 2024/10/31 0.9.0 released. This is a major new version with extensive code refactoring, addressing numerous issues, improving performance, reducing hardware requirements, and enhancing usability:
   - Refactored the sorting module code to use [layoutreader](https://github.com/ppaanngggg/layoutreader) for reading order sorting, ensuring high accuracy in various layouts.
   - Refactored the paragraph concatenation module to achieve good results in cross-column, cross-page, cross-figure, and cross-table scenarios.
@@ -175,13 +176,14 @@ There are three different ways to experience MinerU:
 - [Quick CPU Demo (Windows, Linux, Mac)](#quick-cpu-demo)
 - [Linux/Windows + CUDA](#Using-GPU)
-**⚠️ Pre-installation Notice—Hardware and Software Environment Support**
-To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
-By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
-In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
+> [!WARNING]
+> **Pre-installation Notice—Hardware and Software Environment Support**
+>
+> To ensure the stability and reliability of the project, we only optimize and test for specific hardware and software environments during development. This ensures that users deploying and running the project on recommended system configurations will get the best performance with the fewest compatibility issues.
+>
+> By focusing resources on the mainline environment, our team can more efficiently resolve potential bugs and develop new features.
+>
+> In non-mainline environments, due to the diversity of hardware and software configurations, as well as third-party dependency compatibility issues, we cannot guarantee 100% project availability. Therefore, for users who wish to use this project in non-recommended environments, we suggest carefully reading the documentation and FAQ first. Most issues already have corresponding solutions in the FAQ. We also encourage community feedback to help us gradually expand support.
 <table>
     <tr>
@@ -261,11 +263,13 @@ Refer to [How to Download Model Files](docs/how_to_download_models_en.md) for de
 After completing the [2. Download model weight files](#2-download-model-weight-files) step, the script will automatically generate a `magic-pdf.json` file in the user directory and configure the default model path.
 You can find the `magic-pdf.json` file in your user directory.
+> [!TIP]
 > The user directory for Windows is "C:\\Users\\username", for Linux it is "/home/username", and for macOS it is "/Users/username".
 You can modify certain configurations in this file to enable or disable features, such as table recognition:
+> [!NOTE]
 > If the following items are not present in the JSON, please manually add the required items and remove the comment content (standard JSON does not support comments).
 ```json
@@ -294,13 +298,14 @@ If your device supports CUDA and meets the GPU requirements of the mainline envi
 - [Ubuntu 22.04 LTS + GPU](docs/README_Ubuntu_CUDA_Acceleration_en_US.md)
 - [Windows 10/11 + GPU](docs/README_Windows_CUDA_Acceleration_en_US.md)
 - Quick Deployment with Docker
-    > Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
-    >
-    > Before running this Docker, you can use the following command to check if your device supports CUDA acceleration on Docker.
-    >
-    > ```bash
-    > docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
-    > ```
+> [!IMPORTANT]
+> Docker requires a GPU with at least 16GB of VRAM, and all acceleration features are enabled by default.
+>
+> Before running this Docker image, you can use the following command to check if your device supports CUDA acceleration on Docker.
+>
+> ```bash
+> docker run --rm --gpus=all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi
+> ```
   ```bash
   wget https://github.com/opendatalab/MinerU/raw/master/Dockerfile
   docker build -t mineru:latest .
@@ -362,8 +367,8 @@ The results will be saved in the `{some_output_dir}` directory. The output file
 ├── some_pdf_spans.pdf                   # smallest granularity bbox position information diagram
 └── some_pdf_content_list.json           # Rich text JSON arranged in reading order
 ```
-For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
+> [!TIP]
+> For more information about the output files, please refer to the [Output File Description](docs/output_file_en_us.md).
 ### API
@@ -414,12 +419,12 @@ TODO
 # TODO
-- 🗹 Reading order based on the model
-- 🗹 Recognition of `index` and `list` in the main text
-- 🗹 Table recognition
-- ☐ Code block recognition in the main text
-- ☐ [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
-- ☐ Geometric shape recognition
+- [x] Reading order based on the model
+- [x] Recognition of `index` and `list` in the main text
+- [x] Table recognition
+- [ ] Code block recognition in the main text
+- [ ] [Chemical formula recognition](docs/chemical_knowledge_introduction/introduction.pdf)
+- [ ] Geometric shape recognition
 # Known Issues

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/RECORD RENAMED Viewed

@@ -10,20 +10,20 @@ magic_pdf/config/exceptions.py,sha256=87UX7gyUpj4HqjPcz2hLqdnYeImtDQAxOxj8oXZ_zk
 magic_pdf/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/data/dataset.py,sha256=n8rGw1-wizABR8giSk_XWPCXzx3478u5DK2Z0wOCOeI,5089
 magic_pdf/data/read_api.py,sha256=3fKLsEYAow5RwAmGFMMgvcCh0-_WEEHem2uewukjXOA,3570
-magic_pdf/data/schemas.py,sha256=XSFNxyYbIWgU_Z4U0695elpGQP3J5dpq4Rlyr3S0O_s,595
+magic_pdf/data/schemas.py,sha256=oIUTBzK8Wq8Wuy8A_uilWAbVhucRvOs9_f3lSKYgcmQ,664
 magic_pdf/data/utils.py,sha256=dJZiqygwNier0UG5tbt5jAPjgwcnfsAN6-m-G1kVPLQ,917
 magic_pdf/data/data_reader_writer/__init__.py,sha256=QtevUaeSivv9dQKi3Tomfn4Z0E4To0cB8qXTnglxaHc,705
 magic_pdf/data/data_reader_writer/base.py,sha256=gUrHCMTHYBrWpqgHdIc-hN7HHwUC2ApK_VXrDUrnfdg,1320
 magic_pdf/data/data_reader_writer/filebase.py,sha256=21RYy4m9MqJGqwd2HWICQJHM-PZXp7UYETCQQK390Kk,1988
-magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=BY3faRfZTg27kfkaI4iXPjgFW_RecN0am9r9z2RuYgY,5582
-magic_pdf/data/data_reader_writer/s3.py,sha256=4tT_hcb5I1m-qojNP2CAUKGOoWBH2ripKQmBa9_dAfg,2096
-magic_pdf/data/io/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-magic_pdf/data/io/base.py,sha256=So3G_Kndunfs0f9nn3l9dRJG_7N09CX0JbFqYEvyaRI,1113
+magic_pdf/data/data_reader_writer/multi_bucket_s3.py,sha256=_HA8NJO1Be7KwozlwOJ90o8Ik2vfjlvlDPXppESeIfk,5885
+magic_pdf/data/data_reader_writer/s3.py,sha256=9Oy1cNuXMwG1e8PgZ7AR-pn_MqHAhkgAGnyEZCYoYAA,2408
+magic_pdf/data/io/__init__.py,sha256=WKaIlu8i5AWYxFCGNJcorAfMnlUQDOF8CX07Ycfnu2c,294
+magic_pdf/data/io/base.py,sha256=SqNQqe30ZvoVvg7GVv-hLMCjN6yBgDyQQWeLgGsTfhQ,1118
 magic_pdf/data/io/http.py,sha256=XlKB0DNf4a_uUnfgcclvaaOtmE7lmddx0DnK8A-emAM,958
 magic_pdf/data/io/s3.py,sha256=hyA7sbNriQy64xd_uyJ7acN_oneQ1Pdmoc7_xcvkue8,3606
 magic_pdf/dict2md/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/dict2md/mkcontent.py,sha256=rWUY-2opd0jeowEUEVOV_uWcKum1Q7ng4nOoT6-ka_s,17459
-magic_pdf/dict2md/ocr_mkcontent.py,sha256=eMd3qOIh21cZgTl-LMLGh42uxDMpHU2nwE6iA6b_qrA,11915
+magic_pdf/dict2md/ocr_mkcontent.py,sha256=ClxKUwrK7wlXKCcDfuTryztKl5e8pzcnh5x_fODFm2U,12928
 magic_pdf/filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/filter/pdf_classify_by_type.py,sha256=spmDO-f2ihAl1d6-EP-j271Yi50oyu6mw4X2kRd_m0s,42320
 magic_pdf/filter/pdf_meta_scan.py,sha256=5R2XDiBZw0xd4ugbDxuyk6fztGlT5jFsGN85hLvo-hQ,17390
@@ -65,14 +65,14 @@ magic_pdf/libs/pdf_check.py,sha256=MAe8wzwT0qvPf_I72wEZG7k1g4haNHS7oUtLqkB5rlE,2
 magic_pdf/libs/pdf_image_tools.py,sha256=CAd01giTKr_UJz1_QtDOARG9G9z69GFpzRZwcWSfLtE,1282
 magic_pdf/libs/safe_filename.py,sha256=ckwcM_eqoysTb5id8czp-tXq2G9da0-l3pshZDCHQtE,236
 magic_pdf/libs/textbase.py,sha256=SC1Frhz3Fb7V7n2SFRBsl7Bmg0JZdlvZskq0lfW1vIk,732
-magic_pdf/libs/version.py,sha256=H9NWRZb7NbeRRPLP_V1fARmLNXranorVM-OOY-8_2ug,22
+magic_pdf/libs/version.py,sha256=gqT-BGoeEItda9fICQDvLbxEjWRIBhFJxPxxKvmHLUo,22
 magic_pdf/libs/vis_utils.py,sha256=hTOTEakKV0pGMbk0tbRkVI_tku7A3dGc96ynObZ4kwI,10207
 magic_pdf/model/__init__.py,sha256=1QcfMKET0xQhSaZMjNQHi_TjzSSDR6PI5mjkmaXHPe8,52
 magic_pdf/model/doc_analyze_by_custom_model.py,sha256=eYrtOIlFqw8O95ShoCTaAhLBHk7TXc5DGif93VikW4s,6977
 magic_pdf/model/magic_model.py,sha256=RKJOruUGAV1lHcGqSlCDbkJn5kutb3fphDreOHASPQg,43505
 magic_pdf/model/model_list.py,sha256=tJ9jtMB93HGx8Rmt8wmQSDFXZBUIPQrwaaYsep4luTM,183
-magic_pdf/model/pdf_extract_kit.py,sha256=qlZANx8DErfSyaPHlOYNYW_Qp50dAEX_4jG8N3coDmM,21317
-magic_pdf/model/ppTableModel.py,sha256=Qm5vy6v5aw2wwO5aZTyVr-r1sr3Pi9ManG86WZvfvEo,2697
+magic_pdf/model/pdf_extract_kit.py,sha256=9pdtcQgwn-XMvyQ7yMfzqKgjPfxEuNXR7juCPx-OM-M,20929
+magic_pdf/model/ppTableModel.py,sha256=fqMuMahN2BW4sKGCgFLsi1X1OFaIG8Dab_eHUhKPcH4,2692
 magic_pdf/model/pp_structure_v2.py,sha256=BKPN7W4BjG0eWPAPjPEac1RMnb5eIzmAz4E4Rq-9b1U,3019
 magic_pdf/model/pek_sub_modules/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/pek_sub_modules/post_process.py,sha256=HzRxV2sVR3Qo8XKYEHhT6tae-bYTb6dnAfGP6gfVNaM,1135
@@ -97,7 +97,7 @@ magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configur
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py,sha256=mdo8tO-DrJcv0Lbk9Pp98n3NQXYOnFFyXQWjU7t35kA,54633
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py,sha256=diKlrfxYjKAmYrUgjYdx-FXLh-swShC3tl-EBX1b3oI,1197
 magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py,sha256=0lxiG69_fGpSSBYA9CBLnDa_qqa1rInZ0pJpqBwZ0Yw,1372
-magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qtAkShYlXBrrkRWHvgAy3y9SEBtMRYVIvI3CASTuLHU,1069
+magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py,sha256=qQthlYDvDPah1mzzrnKXU4fYqlJdXOPBnJ8tYf-o_0k,1384
 magic_pdf/model/pek_sub_modules/structeqtable/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/v3/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/model/v3/helpers.py,sha256=IVUFcNMDF3-kio-BIxjppHnWS3eHPqvvNihIw2fbIFM,4372
@@ -112,7 +112,7 @@ magic_pdf/para/layout_match_processor.py,sha256=yr4FEO7GJ502udShqGRqIJQ_FQxoa0aG
 magic_pdf/para/para_pipeline.py,sha256=zLaCHI9jLi1UPzh0lHP44mUjpKVTHS0gE_5YrkjVqEY,11796
 magic_pdf/para/para_split.py,sha256=-UJM2jREW_2h3ZlJAU7dRD8bK3CMGKuhJrfgqv3Auvk,31310
 magic_pdf/para/para_split_v2.py,sha256=ZIiLzpvVL364x1zcEG9IbT6ARJ-6JnWLIVrsDmf4w1M,36878
-magic_pdf/para/para_split_v3.py,sha256=vHHswSAcTpXqnaEAbGEbt2g96YLh9eh839HdRNilDT8,13378
+magic_pdf/para/para_split_v3.py,sha256=k02I9Rdc8jfYr3bMT_Gm38b5ginkl-ZIU5C_XcfAcs8,14704
 magic_pdf/para/raw_processor.py,sha256=mHxD9FrdOSXH7NqM41s55URyCyuyACvm9kKtowkIb3k,6317
 magic_pdf/para/stats.py,sha256=-6Pf9Y8jkP1uJOYWiHUjw9Lb-Fb9GY7MHr_ok7x2GX0,9731
 magic_pdf/para/title_processor.py,sha256=pYZv9vEkIjAtCz8jIUtl9AVUy_ib5SdAZmMVoZtsMRI,38593
@@ -170,9 +170,9 @@ magic_pdf/tools/cli_dev.py,sha256=3e5eyCQEt_EujXZu5fUAWr_W-YQQVqS9pB0Qgw7t1D8,41
 magic_pdf/tools/common.py,sha256=2S8N60pcA6bFqAmdchoEmn22l9ntQxEfyaKpxfCKJ-Y,5465
 magic_pdf/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 magic_pdf/utils/annotations.py,sha256=82ou3uELNbQWa9hOFFkVt0gsIskAKf5msCv5J2IJ5V0,211
-magic_pdf-0.9.0.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
-magic_pdf-0.9.0.dist-info/METADATA,sha256=DPanG2IP5v1TNR6Qyto-UqZ53IOA09lNCQpMyjguJ_k,39420
-magic_pdf-0.9.0.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
-magic_pdf-0.9.0.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
-magic_pdf-0.9.0.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
-magic_pdf-0.9.0.dist-info/RECORD,,
+magic_pdf-0.9.2.dist-info/LICENSE.md,sha256=jVa0BUaKrRH4erV2P5AeJ24I2WRv9chIGxditreJ6e0,34524
+magic_pdf-0.9.2.dist-info/METADATA,sha256=CxyxzxwoOTK3GfaQCGAR8lcjQR3fK4teYf0pXLVDiNQ,39654
+magic_pdf-0.9.2.dist-info/WHEEL,sha256=eOLhNAGa2EW3wWl_TU484h7q1UNgy0JXjjoqKoxAAQc,92
+magic_pdf-0.9.2.dist-info/entry_points.txt,sha256=wXwYke3j8fqDQTocUspL-CqDUEv3Tfcwp09fM8dZAhA,98
+magic_pdf-0.9.2.dist-info/top_level.txt,sha256=J9I0AzmHWGkp9c6DL8Oe4mEx3yYphLzkRn4H25Lg1rE,10
+magic_pdf-0.9.2.dist-info/RECORD,,

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/LICENSE.md RENAMED Viewed

File without changes

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{magic_pdf-0.9.0.dist-info → magic_pdf-0.9.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

magic-pdf 0.9.0__py3-none-any.whl → 0.9.2__py3-none-any.whl

magic-pdf 0.9.0py3-none-any.whl → 0.9.2py3-none-any.whl