PyPI - magic-pdf - Versions diffs - 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (110) hide show

magic_pdf/pre_proc/remove_rotate_bbox.py CHANGED Viewed

@@ -1,19 +1,21 @@
 import math
+import re
+from magic_pdf.config.drop_tag import (EMPTY_SIDE_BLOCK, ROTATE_TEXT,
+                                       VERTICAL_TEXT)
 from magic_pdf.libs.boxbase import is_vbox_on_side
-from magic_pdf.libs.drop_tag import EMPTY_SIDE_BLOCK, ROTATE_TEXT, VERTICAL_TEXT
 def detect_non_horizontal_texts(result_dict):
-    """
-    This function detects watermarks and vertical margin notes in the document.
+    """This function detects watermarks and vertical margin notes in the
+    document.
     Watermarks are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
     If these conditions are met, the blocks are highly likely to be watermarks, as opposed to headers or footers, which can change from page to page.
     If the direction of these blocks is not horizontal, they are definitely considered to be watermarks.
     Vertical margin notes are identified by finding blocks with the same coordinates and frequently occurring identical texts across multiple pages.
-    If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page.
+    If these conditions are met, the blocks are highly likely to be vertical margin notes, which typically appear on the left and right sides of the page. # noqa: E501
     If the direction of these blocks is vertical, they are definitely considered to be vertical margin notes.
@@ -32,13 +34,16 @@ def detect_non_horizontal_texts(result_dict):
     potential_margin_notes = {}
     for page_id, page_content in result_dict.items():
-        if page_id.startswith("page_"):
+        if page_id.startswith('page_'):
             for block_id, block_data in page_content.items():
-                if block_id.startswith("block_"):
-                    if "dir" in block_data:
-                        coordinates_text = (block_data["bbox"], block_data["text"])  # Tuple of coordinates and text
-                        angle = math.atan2(block_data["dir"][1], block_data["dir"][0])
+                if block_id.startswith('block_'):
+                    if 'dir' in block_data:
+                        coordinates_text = (
+                            block_data['bbox'],
+                            block_data['text'],
+                        )  # Tuple of coordinates and text
+                        angle = math.atan2(block_data['dir'][1], block_data['dir'][0])
                         angle = abs(math.degrees(angle))
                         if angle > 5 and angle < 85:  # Check if direction is watermarks
@@ -49,32 +54,40 @@ def detect_non_horizontal_texts(result_dict):
                         if angle > 85 and angle < 105:  # Check if direction is vertical
                             if coordinates_text in potential_margin_notes:
-                                potential_margin_notes[coordinates_text] += 1  # Increment count
+                                potential_margin_notes[coordinates_text] += (
+                                    1  # Increment count
+                                )
                             else:
-                                potential_margin_notes[coordinates_text] = 1  # Initialize count
+                                potential_margin_notes[coordinates_text] = (
+                                    1  # Initialize count
+                                )
     # Identify watermarks by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
     watermark_threshold = len(result_dict) // 2
-    watermarks = {k: v for k, v in potential_watermarks.items() if v > watermark_threshold}
+    watermarks = {
+        k: v for k, v in potential_watermarks.items() if v > watermark_threshold
+    }
     # Identify margin notes by finding entries with counts higher than a threshold (e.g., appearing on more than half of the pages)
     margin_note_threshold = len(result_dict) // 2
-    margin_notes = {k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold}
+    margin_notes = {
+        k: v for k, v in potential_margin_notes.items() if v > margin_note_threshold
+    }
     # Add watermark information to the result dictionary
     for page_id, blocks in result_dict.items():
-        if page_id.startswith("page_"):
+        if page_id.startswith('page_'):
             for block_id, block_data in blocks.items():
-                coordinates_text = (block_data["bbox"], block_data["text"])
+                coordinates_text = (block_data['bbox'], block_data['text'])
                 if coordinates_text in watermarks:
-                    block_data["is_watermark"] = 1
+                    block_data['is_watermark'] = 1
                 else:
-                    block_data["is_watermark"] = 0
+                    block_data['is_watermark'] = 0
                 if coordinates_text in margin_notes:
-                    block_data["is_vertical_margin_note"] = 1
+                    block_data['is_vertical_margin_note'] = 1
                 else:
-                    block_data["is_vertical_margin_note"] = 0
+                    block_data['is_vertical_margin_note'] = 0
     return result_dict
@@ -83,21 +96,21 @@ def detect_non_horizontal_texts(result_dict):
 1. 当一个block里全部文字都不是dir=(1,0)，这个block整体去掉
 2. 当一个block里全部文字都是dir=(1,0)，但是每行只有一个字，这个block整体去掉。这个block必须出现在页面的四周，否则不去掉
 """
-import re
 def __is_a_word(sentence):
     # 如果输入是中文并且长度为1，则返回True
     if re.fullmatch(r'[\u4e00-\u9fa5]', sentence):
         return True
     # 判断是否为单个英文单词或字符（包括ASCII标点）
-    elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <=2:
+    elif re.fullmatch(r'[a-zA-Z0-9]+', sentence) and len(sentence) <= 2:
         return True
     else:
         return False
 def __get_text_color(num):
-    """获取字体的颜色RGB值"""
+    """获取字体的颜色RGB值."""
     blue = num & 255
     green = (num >> 8) & 255
     red = (num >> 16) & 255
@@ -105,84 +118,119 @@ def __get_text_color(num):
 def __is_empty_side_box(text_block):
-    """
-    是否是边缘上的空白没有任何内容的block
-    """
+    """是否是边缘上的空白没有任何内容的block."""
     for line in text_block['lines']:
         for span in line['spans']:
             font_color = span['color']
-            r,g,b = __get_text_color(font_color)
-            if len(span['text'].strip())>0 and (r,g,b)!=(255,255,255):
+            r, g, b = __get_text_color(font_color)
+            if len(span['text'].strip()) > 0 and (r, g, b) != (255, 255, 255):
                 return False
     return True
 def remove_rotate_side_textblock(pymu_text_block, page_width, page_height):
-    """
-    返回删除了垂直，水印，旋转的textblock
-    删除的内容打上tag返回
-    """
+    """返回删除了垂直，水印，旋转的textblock 删除的内容打上tag返回."""
     removed_text_block = []
-    for i, block in enumerate(pymu_text_block): # 格式参考test/assets/papre/pymu_textblocks.json
+    for i, block in enumerate(
+        pymu_text_block
+    ):  # 格式参考test/assets/papre/pymu_textblocks.json
         lines = block['lines']
         block_bbox = block['bbox']
-        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # 保证这些box必须在页面的两边
-           continue
-        if all([__is_a_word(line['spans'][0]["text"]) for line in lines if len(line['spans'])>0]) and len(lines)>1 and all([len(line['spans'])==1 for line in lines]):
-            is_box_valign = (len(set([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0]))==1) and (len([int(line['spans'][0]['bbox'][0] ) for line in lines if len(line['spans'])>0])>1)  # 测试bbox在垂直方向是不是x0都相等，也就是在垂直方向排列.同时必须大于等于2个字
+        if not is_vbox_on_side(
+            block_bbox, page_width, page_height, 0.2
+        ):  # 保证这些box必须在页面的两边
+            continue
+        if (
+            all(
+                [
+                    __is_a_word(line['spans'][0]['text'])
+                    for line in lines
+                    if len(line['spans']) > 0
+                ]
+            )
+            and len(lines) > 1
+            and all([len(line['spans']) == 1 for line in lines])
+        ):
+            is_box_valign = (
+                (
+                    len(
+                        set(
+                            [
+                                int(line['spans'][0]['bbox'][0])
+                                for line in lines
+                                if len(line['spans']) > 0
+                            ]
+                        )
+                    )
+                    == 1
+                )
+                and (
+                    len(
+                        [
+                            int(line['spans'][0]['bbox'][0])
+                            for line in lines
+                            if len(line['spans']) > 0
+                        ]
+                    )
+                    > 1
+                )
+            )  # 测试bbox在垂直方向是不是x0都相等，也就是在垂直方向排列.同时必须大于等于2个字
             if is_box_valign:
                 block['tag'] = VERTICAL_TEXT
                 removed_text_block.append(block)
                 continue
         for line in lines:
-            if line['dir']!=(1,0):
+            if line['dir'] != (1, 0):
                 block['tag'] = ROTATE_TEXT
-                removed_text_block.append(block) # 只要有一个line不是dir=(1,0)，就把整个block都删掉
+                removed_text_block.append(
+                    block
+                )  # 只要有一个line不是dir=(1,0)，就把整个block都删掉
                 break
     for block in removed_text_block:
         pymu_text_block.remove(block)
     return pymu_text_block, removed_text_block
 def get_side_boundry(rotate_bbox, page_width, page_height):
-    """
-    根据rotate_bbox，返回页面的左右正文边界
-    """
+    """根据rotate_bbox，返回页面的左右正文边界."""
     left_x = 0
     right_x = page_width
     for x in rotate_bbox:
         box = x['bbox']
-        if box[2]<page_width/2:
+        if box[2] < page_width / 2:
             left_x = max(left_x, box[2])
         else:
             right_x = min(right_x, box[0])
-    return left_x+1, right_x-1
+    return left_x + 1, right_x - 1
 def remove_side_blank_block(pymu_text_block, page_width, page_height):
-    """
-    删除页面两侧的空白block
-    """
+    """删除页面两侧的空白block."""
     removed_text_block = []
-    for i, block in enumerate(pymu_text_block): # 格式参考test/assets/papre/pymu_textblocks.json
+    for i, block in enumerate(
+        pymu_text_block
+    ):  # 格式参考test/assets/papre/pymu_textblocks.json
         block_bbox = block['bbox']
-        if not is_vbox_on_side(block_bbox, page_width, page_height, 0.2): # 保证这些box必须在页面的两边
-           continue
+        if not is_vbox_on_side(
+            block_bbox, page_width, page_height, 0.2
+        ):  # 保证这些box必须在页面的两边
+            continue
         if __is_empty_side_box(block):
             block['tag'] = EMPTY_SIDE_BLOCK
             removed_text_block.append(block)
             continue
     for block in removed_text_block:
         pymu_text_block.remove(block)
-    return pymu_text_block, removed_text_block
+    return pymu_text_block, removed_text_block

magic_pdf/pre_proc/resolve_bbox_conflict.py CHANGED Viewed

@@ -4,8 +4,9 @@
 2. 然后去掉出现在文字blcok上的图片bbox
 """
-from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, _is_left_overlap
-from magic_pdf.libs.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
+from magic_pdf.config.drop_tag import ON_IMAGE_TEXT, ON_TABLE_TEXT
+from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
+                                    _is_left_overlap)
 def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equations: list, inline_equations: list,
@@ -26,14 +27,14 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
     # 去掉位于图片上的文字block
     for image_box in images:
         for text_block in text_raw_blocks:
-            text_bbox = text_block["bbox"]
+            text_bbox = text_block['bbox']
             if _is_in(text_bbox, image_box):
                 text_block['tag'] = ON_IMAGE_TEXT
                 text_block_removed.append(text_block)
     # 去掉table上的文字block
     for table_box in tables:
         for text_block in text_raw_blocks:
-            text_bbox = text_block["bbox"]
+            text_bbox = text_block['bbox']
             if _is_in(text_bbox, table_box):
                 text_block['tag'] = ON_TABLE_TEXT
                 text_block_removed.append(text_block)
@@ -77,7 +78,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
     # 图片和文字重叠，丢掉图片
     for image_box in images:
         for text_block in text_raw_blocks:
-            text_bbox = text_block["bbox"]
+            text_bbox = text_block['bbox']
             if _is_in_or_part_overlap(image_box, text_bbox):
                 images_backup.append(image_box)
                 break
@@ -122,11 +123,7 @@ def resolve_bbox_overlap_conflict(images: list, tables: list, interline_equation
 def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bool:
-    """
-    检查文本block之间的水平重叠情况，这种情况如果发生，那么这个pdf就不再继续处理了。
-    因为这种情况大概率发生了公式没有被检测出来。
-    """
+    """检查文本block之间的水平重叠情况，这种情况如果发生，那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
     if len(text_blocks) == 0:
         return False
@@ -148,7 +145,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
     txt_bboxes = []
     for text_block in text_blocks:
-        bbox = text_block["bbox"]
+        bbox = text_block['bbox']
         if bbox[1] >= clip_y0 and bbox[3] <= clip_y1:
             txt_bboxes.append(bbox)
@@ -161,11 +158,7 @@ def check_text_block_horizontal_overlap(text_blocks: list, header, footer) -> bo
 def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
-    """
-    检查文本block之间的水平重叠情况，这种情况如果发生，那么这个pdf就不再继续处理了。
-    因为这种情况大概率发生了公式没有被检测出来。
-    """
+    """检查文本block之间的水平重叠情况，这种情况如果发生，那么这个pdf就不再继续处理了。 因为这种情况大概率发生了公式没有被检测出来。"""
     if len(useful_blocks) == 0:
         return False
@@ -174,7 +167,7 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:
     useful_bboxes = []
     for text_block in useful_blocks:
-        bbox = text_block["bbox"]
+        bbox = text_block['bbox']
         if bbox[1] >= page_min_y and bbox[3] <= page_max_y:
             useful_bboxes.append(bbox)

magic_pdf/resources/model_config/model_configs.yaml CHANGED Viewed

@@ -4,4 +4,5 @@ weights:
   yolo_v8_mfd: MFD/YOLO/yolo_v8_ft.pt
   unimernet_small: MFR/unimernet_small
   struct_eqtable: TabRec/StructEqTable
-  tablemaster: TabRec/TableMaster
+  tablemaster: TabRec/TableMaster
+  rapid_table: TabRec/RapidTable

magic_pdf/spark/spark_api.py CHANGED Viewed

@@ -1,51 +1,49 @@
 from loguru import logger
-from magic_pdf.libs.drop_reason import DropReason
+from magic_pdf.config.drop_reason import DropReason
 def get_data_source(jso: dict):
-    data_source = jso.get("data_source")
+    data_source = jso.get('data_source')
     if data_source is None:
-        data_source = jso.get("file_source")
+        data_source = jso.get('file_source')
     return data_source
 def get_data_type(jso: dict):
-    data_type = jso.get("data_type")
+    data_type = jso.get('data_type')
     if data_type is None:
-        data_type = jso.get("file_type")
+        data_type = jso.get('file_type')
     return data_type
 def get_bookid(jso: dict):
-    book_id = jso.get("bookid")
+    book_id = jso.get('bookid')
     if book_id is None:
-        book_id = jso.get("original_file_id")
+        book_id = jso.get('original_file_id')
     return book_id
 def exception_handler(jso: dict, e):
     logger.exception(e)
-    jso["_need_drop"] = True
-    jso["_drop_reason"] = DropReason.Exception
-    jso["_exception"] = f"ERROR: {e}"
+    jso['_need_drop'] = True
+    jso['_drop_reason'] = DropReason.Exception
+    jso['_exception'] = f'ERROR: {e}'
     return jso
 def get_bookname(jso: dict):
     data_source = get_data_source(jso)
-    file_id = jso.get("file_id")
-    book_name = f"{data_source}/{file_id}"
+    file_id = jso.get('file_id')
+    book_name = f'{data_source}/{file_id}'
     return book_name
 def spark_json_extractor(jso: dict) -> dict:
-    """
-    从json中提取数据，返回一个dict
-    """
+    """从json中提取数据，返回一个dict."""
     return {
-        "_pdf_type": jso["_pdf_type"],
-        "model_list": jso["doc_layout_result"],
+        '_pdf_type': jso['_pdf_type'],
+        'model_list': jso['doc_layout_result'],
     }

magic_pdf/tools/cli.py CHANGED Viewed

@@ -5,9 +5,8 @@ import click
 from loguru import logger
 import magic_pdf.model as model_config
+from magic_pdf.data.data_reader_writer import FileBasedDataReader
 from magic_pdf.libs.version import __version__
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
@@ -86,8 +85,8 @@ def cli(path, output_dir, method, lang, debug_able, start_page_id, end_page_id):
     os.makedirs(output_dir, exist_ok=True)
     def read_fn(path):
-        disk_rw = DiskReaderWriter(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
+        disk_rw = FileBasedDataReader(os.path.dirname(path))
+        return disk_rw.read(os.path.basename(path))
     def parse_doc(doc_path: str):
         try:

magic_pdf/tools/cli_dev.py CHANGED Viewed

@@ -5,13 +5,11 @@ from pathlib import Path
 import click
 import magic_pdf.model as model_config
+from magic_pdf.data.data_reader_writer import FileBasedDataReader, S3DataReader
 from magic_pdf.libs.config_reader import get_s3_config
 from magic_pdf.libs.path_utils import (parse_s3_range_params, parse_s3path,
                                        remove_non_official_s3_args)
 from magic_pdf.libs.version import __version__
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
 from magic_pdf.tools.common import do_parse, parse_pdf_methods
@@ -19,15 +17,14 @@ def read_s3_path(s3path):
     bucket, key = parse_s3path(s3path)
     s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
-    s3_rw = S3ReaderWriter(s3_ak, s3_sk, s3_endpoint, 'auto',
-                           remove_non_official_s3_args(s3path))
+    s3_rw = S3DataReader('', bucket, s3_ak, s3_sk, s3_endpoint, 'auto')
     may_range_params = parse_s3_range_params(s3path)
     if may_range_params is None or 2 != len(may_range_params):
-        byte_start, byte_end = 0, None
+        byte_start, byte_end = 0, -1
     else:
         byte_start, byte_end = int(may_range_params[0]), int(
             may_range_params[1])
-    return s3_rw.read_offset(
+    return s3_rw.read_at(
         remove_non_official_s3_args(s3path),
         byte_start,
         byte_end,
@@ -129,8 +126,8 @@ def pdf(pdf, json_data, output_dir, method):
     os.makedirs(output_dir, exist_ok=True)
     def read_fn(path):
-        disk_rw = DiskReaderWriter(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
+        disk_rw = FileBasedDataReader(os.path.dirname(path))
+        return disk_rw.read(os.path.basename(path))
     model_json_list = json_parse.loads(read_fn(json_data).decode('utf-8'))

magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl

magic-pdf 0.9.2__py3-none-any.whl → 0.10.0__py3-none-any.whl