PyPI - magic-pdf - Versions diffs - 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl - Mend

magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (63)

magic_pdf/config/constants.py +53 -0
magic_pdf/config/drop_reason.py +35 -0
magic_pdf/config/drop_tag.py +19 -0
magic_pdf/config/make_content_config.py +11 -0
magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/data/read_api.py +1 -1
magic_pdf/dict2md/mkcontent.py +226 -185
magic_pdf/dict2md/ocr_mkcontent.py +11 -11
magic_pdf/filter/pdf_meta_scan.py +101 -79
magic_pdf/integrations/rag/utils.py +4 -5
magic_pdf/libs/config_reader.py +5 -5
magic_pdf/libs/draw_bbox.py +3 -2
magic_pdf/libs/pdf_image_tools.py +36 -12
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
magic_pdf/model/magic_model.py +13 -13
magic_pdf/model/pdf_extract_kit.py +122 -76
magic_pdf/model/sub_modules/model_init.py +40 -35
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
magic_pdf/para/para_split.py +411 -248
magic_pdf/para/para_split_v2.py +352 -182
magic_pdf/para/para_split_v3.py +110 -53
magic_pdf/pdf_parse_by_ocr.py +2 -0
magic_pdf/pdf_parse_by_txt.py +2 -0
magic_pdf/pdf_parse_union_core.py +174 -100
magic_pdf/pdf_parse_union_core_v2.py +202 -36
magic_pdf/pipe/AbsPipe.py +28 -44
magic_pdf/pipe/OCRPipe.py +5 -5
magic_pdf/pipe/TXTPipe.py +5 -6
magic_pdf/pipe/UNIPipe.py +24 -25
magic_pdf/post_proc/pdf_post_filter.py +7 -14
magic_pdf/pre_proc/cut_image.py +9 -11
magic_pdf/pre_proc/equations_replace.py +203 -212
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
magic_pdf/pre_proc/remove_footer_header.py +2 -5
magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
magic_pdf/spark/spark_api.py +15 -17
magic_pdf/tools/cli.py +3 -4
magic_pdf/tools/cli_dev.py +6 -9
magic_pdf/tools/common.py +26 -36
magic_pdf/user_api.py +29 -38
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/METADATA +11 -12
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/RECORD +58 -59
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/WHEEL +1 -1
magic_pdf/libs/Constants.py +0 -55
magic_pdf/libs/MakeContentConfig.py +0 -11
magic_pdf/libs/drop_reason.py +0 -27
magic_pdf/libs/drop_tag.py +0 -19
magic_pdf/para/para_pipeline.py +0 -297
magic_pdf/{libs → config}/ocr_content_type.py +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.1.dist-info}/top_level.txt +0 -0

magic_pdf/pre_proc/ocr_span_list_modify.py CHANGED Viewed

@@ -1,9 +1,10 @@
-from loguru import logger
-from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio, get_minbox_if_overlap_by_ratio, \
-    __is_overlaps_y_exceeds_threshold, calculate_iou
-from magic_pdf.libs.drop_tag import DropTag
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.config.drop_tag import DropTag
+from magic_pdf.config.ocr_content_type import BlockType, ContentType
+from magic_pdf.libs.boxbase import (__is_overlaps_y_exceeds_threshold,
+                                    calculate_iou,
+                                    calculate_overlap_area_in_bbox1_area_ratio,
+                                    get_minbox_if_overlap_by_ratio)
 def remove_overlaps_low_confidence_spans(spans):
@@ -21,7 +22,10 @@ def remove_overlaps_low_confidence_spans(spans):
                             span_need_remove = span1
                         else:
                             span_need_remove = span2
-                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                        if (
+                            span_need_remove is not None
+                            and span_need_remove not in dropped_spans
+                        ):
                             dropped_spans.append(span_need_remove)
     if len(dropped_spans) > 0:
@@ -38,12 +42,15 @@ def remove_overlaps_min_spans(spans):
     for span1 in spans:
         for span2 in spans:
             if span1 != span2:
-                overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
-                if overlap_box is not None:
-                    span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
-                    if span_need_remove is not None and span_need_remove not in dropped_spans:
-                        dropped_spans.append(span_need_remove)
+                # span1 或 span2 任何一个都不应该在 dropped_spans 中
+                if span1 in dropped_spans or span2 in dropped_spans:
+                    continue
+                else:
+                    overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
+                    if overlap_box is not None:
+                        span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
+                        if span_need_remove is not None and span_need_remove not in dropped_spans:
+                            dropped_spans.append(span_need_remove)
     if len(dropped_spans) > 0:
         for span_need_remove in dropped_spans:
             spans.remove(span_need_remove)
@@ -58,7 +65,10 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
     need_remove_spans = []
     for span in spans:
         for removed_bbox in need_remove_spans_bboxes:
-            if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
+            if (
+                calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox)
+                > 0.5
+            ):
                 if span not in need_remove_spans:
                     need_remove_spans.append(span)
                     break
@@ -78,12 +88,22 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
         for span in spans:
             # 通过判断span的bbox是否在removed_bboxes中, 判断是否需要删除该span
             for removed_bbox in removed_bboxes:
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
+                if (
+                    calculate_overlap_area_in_bbox1_area_ratio(
+                        span['bbox'], removed_bbox
+                    )
+                    > 0.5
+                ):
                     need_remove_spans.append(span)
                     break
                 # 当drop_tag为DropTag.FOOTNOTE时, 判断span是否在removed_bboxes中任意一个的下方，如果是,则删除该span
-                elif drop_tag == DropTag.FOOTNOTE and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3] and \
-                        removed_bbox[0] < (span['bbox'][0] + span['bbox'][2]) / 2 < removed_bbox[2]:
+                elif (
+                    drop_tag == DropTag.FOOTNOTE
+                    and (span['bbox'][1] + span['bbox'][3]) / 2 > removed_bbox[3]
+                    and removed_bbox[0]
+                    < (span['bbox'][0] + span['bbox'][2]) / 2
+                    < removed_bbox[2]
+                ):
                     need_remove_spans.append(span)
                     break
@@ -98,11 +118,18 @@ def remove_spans_by_bboxes_dict(spans, need_remove_spans_bboxes_dict):
 def adjust_bbox_for_standalone_block(spans):
     # 对tpye=["interline_equation", "image", "table"]进行额外处理,如果左边有字的话,将该span的bbox中y0调整至不高于文字的y0
     for sb_span in spans:
-        if sb_span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+        if sb_span['type'] in [
+            ContentType.InterlineEquation,
+            ContentType.Image,
+            ContentType.Table,
+        ]:
             for text_span in spans:
                 if text_span['type'] in [ContentType.Text, ContentType.InlineEquation]:
                     # 判断span2的纵向高度是否被span所覆盖
-                    if sb_span['bbox'][1] < text_span['bbox'][1] and sb_span['bbox'][3] > text_span['bbox'][3]:
+                    if (
+                        sb_span['bbox'][1] < text_span['bbox'][1]
+                        and sb_span['bbox'][3] > text_span['bbox'][3]
+                    ):
                         # 判断span2是否在span左边
                         if text_span['bbox'][0] < sb_span['bbox'][0]:
                             # 调整span的y0和span2的y0一致
@@ -120,11 +147,15 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
         lines = []
         current_line = [spans[0]]
-        if spans[0]["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+        if spans[0]['type'] in [
+            ContentType.InterlineEquation,
+            ContentType.Image,
+            ContentType.Table,
+        ]:
             displayed_list.append(spans[0])
-        line_first_y0 = spans[0]["bbox"][1]
-        line_first_y = spans[0]["bbox"][3]
+        line_first_y0 = spans[0]['bbox'][1]
+        line_first_y = spans[0]['bbox'][3]
         # 用于给行间公式搜索
         # text_inline_lines = []
         for span in spans[1:]:
@@ -132,26 +163,43 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
             #     print("debug")
             # 如果当前的span类型为"interline_equation" 或者 当前行中已经有"interline_equation"
             # image和table类型，同上
-            if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] or any(
-                    s['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table] for s in
-                    current_line):
+            if span['type'] in [
+                ContentType.InterlineEquation,
+                ContentType.Image,
+                ContentType.Table,
+            ] or any(
+                s['type']
+                in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]
+                for s in current_line
+            ):
                 # 传入
-                if span["type"] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+                if span['type'] in [
+                    ContentType.InterlineEquation,
+                    ContentType.Image,
+                    ContentType.Table,
+                ]:
                     displayed_list.append(span)
                 # 则开始新行
                 lines.append(current_line)
-                if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
-                    text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
+                if len(current_line) > 1 or current_line[0]['type'] in [
+                    ContentType.Text,
+                    ContentType.InlineEquation,
+                ]:
+                    text_inline_lines.append(
+                        (current_line, (line_first_y0, line_first_y))
+                    )
                 current_line = [span]
-                line_first_y0 = span["bbox"][1]
-                line_first_y = span["bbox"][3]
+                line_first_y0 = span['bbox'][1]
+                line_first_y = span['bbox'][3]
                 continue
             # 如果当前的span与当前行的最后一个span在y轴上重叠，则添加到当前行
-            if __is_overlaps_y_exceeds_threshold(span['bbox'], current_line[-1]['bbox']):
-                if span["type"] == "text":
-                    line_first_y0 = span["bbox"][1]
-                    line_first_y = span["bbox"][3]
+            if __is_overlaps_y_exceeds_threshold(
+                span['bbox'], current_line[-1]['bbox']
+            ):
+                if span['type'] == 'text':
+                    line_first_y0 = span['bbox'][1]
+                    line_first_y = span['bbox'][3]
                 current_line.append(span)
             else:
@@ -159,13 +207,16 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
                 lines.append(current_line)
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
                 current_line = [span]
-                line_first_y0 = span["bbox"][1]
-                line_first_y = span["bbox"][3]
+                line_first_y0 = span['bbox'][1]
+                line_first_y = span['bbox'][3]
             # 添加最后一行
         if current_line:
             lines.append(current_line)
-            if len(current_line) > 1 or current_line[0]["type"] in [ContentType.Text, ContentType.InlineEquation]:
+            if len(current_line) > 1 or current_line[0]['type'] in [
+                ContentType.Text,
+                ContentType.InlineEquation,
+            ]:
                 text_inline_lines.append((current_line, (line_first_y0, line_first_y)))
         for line in text_inline_lines:
             # 按照x0坐标排序
@@ -176,8 +227,8 @@ def modify_y_axis(spans: list, displayed_list: list, text_inline_lines: list):
         for line in text_inline_lines:
             current_line, (line_first_y0, line_first_y) = line
             for span in current_line:
-                span["bbox"][1] = line_first_y0
-                span["bbox"][3] = line_first_y
+                span['bbox'][1] = line_first_y0
+                span['bbox'][3] = line_first_y
         # return spans, displayed_list, text_inline_lines
@@ -189,34 +240,42 @@ def modify_inline_equation(spans: list, displayed_list: list, text_inline_lines:
         # if i == 8:
         #     print("debug")
         span = displayed_list[i]
-        span_y0, span_y = span["bbox"][1], span["bbox"][3]
+        span_y0, span_y = span['bbox'][1], span['bbox'][3]
         while j < len(text_inline_lines):
             text_line = text_inline_lines[j]
             y0, y1 = text_line[1]
             if (
-                    span_y0 < y0 < span_y or span_y0 < y1 < span_y or span_y0 < y0 and span_y > y1
-            ) and __is_overlaps_y_exceeds_threshold(
-                span['bbox'], (0, y0, 0, y1)
-            ):
+                span_y0 < y0 < span_y
+                or span_y0 < y1 < span_y
+                or span_y0 < y0
+                and span_y > y1
+            ) and __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1)):
                 # 调整公式类型
-                if span["type"] == ContentType.InterlineEquation:
+                if span['type'] == ContentType.InterlineEquation:
                     # 最后一行是行间公式
                     if j + 1 >= len(text_inline_lines):
-                        span["type"] = ContentType.InlineEquation
-                        span["bbox"][1] = y0
-                        span["bbox"][3] = y1
+                        span['type'] = ContentType.InlineEquation
+                        span['bbox'][1] = y0
+                        span['bbox'][3] = y1
                     else:
                         # 行间公式旁边有多行文字或者行间公式比文字高3倍则不转换
                         y0_next, y1_next = text_inline_lines[j + 1][1]
-                        if not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0_next, 0, y1_next)) and 3 * (
-                                y1 - y0) > span_y - span_y0:
-                            span["type"] = ContentType.InlineEquation
-                            span["bbox"][1] = y0
-                            span["bbox"][3] = y1
+                        if (
+                            not __is_overlaps_y_exceeds_threshold(
+                                span['bbox'], (0, y0_next, 0, y1_next)
+                            )
+                            and 3 * (y1 - y0) > span_y - span_y0
+                        ):
+                            span['type'] = ContentType.InlineEquation
+                            span['bbox'][1] = y0
+                            span['bbox'][3] = y1
                 break
-            elif span_y < y0 or span_y0 < y0 < span_y and not __is_overlaps_y_exceeds_threshold(span['bbox'],
-                                                                                                (0, y0, 0, y1)):
+            elif (
+                span_y < y0
+                or span_y0 < y0 < span_y
+                and not __is_overlaps_y_exceeds_threshold(span['bbox'], (0, y0, 0, y1))
+            ):
                 break
             else:
                 j += 1
@@ -232,15 +291,15 @@ def get_qa_need_list(blocks):
     inline_equations = []
     for block in blocks:
-        for line in block["lines"]:
-            for span in line["spans"]:
-                if span["type"] == ContentType.Image:
+        for line in block['lines']:
+            for span in line['spans']:
+                if span['type'] == ContentType.Image:
                     images.append(span)
-                elif span["type"] == ContentType.Table:
+                elif span['type'] == ContentType.Table:
                     tables.append(span)
-                elif span["type"] == ContentType.InlineEquation:
+                elif span['type'] == ContentType.InlineEquation:
                     inline_equations.append(span)
-                elif span["type"] == ContentType.InterlineEquation:
+                elif span['type'] == ContentType.InterlineEquation:
                     interline_equations.append(span)
                 else:
                     continue
@@ -254,10 +313,10 @@ def get_qa_need_list_v2(blocks):
     interline_equations = []
     for block in blocks:
-        if block["type"] == BlockType.Image:
+        if block['type'] == BlockType.Image:
             images.append(block)
-        elif block["type"] == BlockType.Table:
+        elif block['type'] == BlockType.Table:
             tables.append(block)
-        elif block["type"] == BlockType.InterlineEquation:
+        elif block['type'] == BlockType.InterlineEquation:
             interline_equations.append(block)
     return images, tables, interline_equations

magic_pdf/pre_proc/pdf_pre_filter.py CHANGED Viewed

@@ -1,58 +1,65 @@
-from magic_pdf.libs.commons import fitz
+from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap
-from magic_pdf.libs.drop_reason import DropReason
+from magic_pdf.libs.commons import fitz
 def __area(box):
     return (box[2] - box[0]) * (box[3] - box[1])
-def __is_contain_color_background_rect(page:fitz.Page, text_blocks, image_bboxes) -> bool:
-    """
-    检查page是包含有颜色背景的矩形
-    """
+def __is_contain_color_background_rect(
+    page: fitz.Page, text_blocks, image_bboxes
+) -> bool:
+    """检查page是包含有颜色背景的矩形."""
     color_bg_rect = []
     p_width, p_height = page.rect.width, page.rect.height
     # 先找到最大的带背景矩形
     blocks = page.get_cdrawings()
     for block in blocks:
-        if 'fill' in block and block['fill']: # 过滤掉透明的
+        if 'fill' in block and block['fill']:  # 过滤掉透明的
             fill = list(block['fill'])
             fill[0], fill[1], fill[2] = int(fill[0]), int(fill[1]), int(fill[2])
-            if fill==(1.0,1.0,1.0):
+            if fill == (1.0, 1.0, 1.0):
                 continue
             rect = block['rect']
             # 过滤掉特别小的矩形
-            if __area(rect) < 10*10:
+            if __area(rect) < 10 * 10:
                 continue
             # 为了防止是svg图片上的色块，这里过滤掉这类
-            if any([_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]):
+            if any(
+                [_is_in_or_part_overlap(rect, img_bbox) for img_bbox in image_bboxes]
+            ):
                 continue
             color_bg_rect.append(rect)
     # 找到最大的背景矩形
     if len(color_bg_rect) > 0:
-        max_rect = max(color_bg_rect, key=lambda x:__area(x))
-        max_rect_int = (int(max_rect[0]), int(max_rect[1]), int(max_rect[2]), int(max_rect[3]))
+        max_rect = max(color_bg_rect, key=lambda x: __area(x))
+        max_rect_int = (
+            int(max_rect[0]),
+            int(max_rect[1]),
+            int(max_rect[2]),
+            int(max_rect[3]),
+        )
         # 判断最大的背景矩形是否包含超过3行文字，或者50个字 TODO
-        if max_rect[2]-max_rect[0] > 0.2*p_width and  max_rect[3]-max_rect[1] > 0.1*p_height:#宽度符合
-            #看是否有文本块落入到这个矩形中
+        if (
+            max_rect[2] - max_rect[0] > 0.2 * p_width
+            and max_rect[3] - max_rect[1] > 0.1 * p_height
+        ):  # 宽度符合
+            # 看是否有文本块落入到这个矩形中
             for text_block in text_blocks:
                 box = text_block['bbox']
                 box_int = (int(box[0]), int(box[1]), int(box[2]), int(box[3]))
                 if _is_in(box_int, max_rect_int):
                     return True
     return False
 def __is_table_overlap_text_block(text_blocks, table_bbox):
-    """
-    检查table_bbox是否覆盖了text_blocks里的文本块
-    TODO
-    """
+    """检查table_bbox是否覆盖了text_blocks里的文本块 TODO."""
     for text_block in text_blocks:
         box = text_block['bbox']
         if _is_in_or_part_overlap(table_bbox, box):
@@ -60,15 +67,12 @@ def __is_table_overlap_text_block(text_blocks, table_bbox):
     return False
-def pdf_filter(page:fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
-    """
-    return:(True|False, err_msg)
-        True, 如果pdf符合要求
-        False, 如果pdf不符合要求
-    """
+def pdf_filter(page: fitz.Page, text_blocks, table_bboxes, image_bboxes) -> tuple:
+    """return:(True|False, err_msg) True, 如果pdf符合要求 False, 如果pdf不符合要求."""
     if __is_contain_color_background_rect(page, text_blocks, image_bboxes):
-        return False, {"_need_drop": True, "_drop_reason": DropReason.COLOR_BACKGROUND_TEXT_BOX}
+        return False, {
+            '_need_drop': True,
+            '_drop_reason': DropReason.COLOR_BACKGROUND_TEXT_BOX,
+        }
-    return True, None
+    return True, None

magic_pdf/pre_proc/remove_bbox_overlap.py CHANGED Viewed

@@ -1,8 +1,9 @@
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_in, _is_part_overlap
-from magic_pdf.libs.drop_reason import DropReason
+from magic_pdf.config.drop_reason import DropReason
+from magic_pdf.libs.boxbase import _is_in, _is_part_overlap
 def _remove_overlap_between_bbox(bbox1, bbox2):
-   if _is_part_overlap(bbox1, bbox2):
+    if _is_part_overlap(bbox1, bbox2):
         ix0, iy0, ix1, iy1 = bbox1
         x0, y0, x1, y1 = bbox2
@@ -22,10 +23,10 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
             if y1 >= iy1:
                 mid = (y0 + iy1) // 2
                 y0 = max(mid + 0.25, y0)
-                iy1 = min(iy1, mid-0.25)
+                iy1 = min(iy1, mid - 0.25)
             else:
                 mid = (iy0 + y1) // 2
-                y1 = min(y1, mid-0.25)
+                y1 = min(y1, mid - 0.25)
                 iy0 = max(mid + 0.25, iy0)
         if ix1 > ix0 and iy1 > iy0 and y1 > y0 and x1 > x0:
@@ -34,8 +35,8 @@ def _remove_overlap_between_bbox(bbox1, bbox2):
             return bbox1, bbox2, None
         else:
             return bbox1, bbox2, DropReason.NEGATIVE_BBOX_AREA
-   else:
-       return bbox1, bbox2, None
+    else:
+        return bbox1, bbox2, None
 def _remove_overlap_between_bboxes(arr):
@@ -47,7 +48,7 @@ def _remove_overlap_between_bboxes(arr):
         for j in range(N):
             if i == j:
                 continue
-            if _is_in(arr[i]["bbox"], arr[j]["bbox"]):
+            if _is_in(arr[i]['bbox'], arr[j]['bbox']):
                 keeps[i] = False
     for idx, v in enumerate(arr):
@@ -56,13 +57,15 @@ def _remove_overlap_between_bboxes(arr):
         for i in range(N):
             if res[i] is None:
                 continue
-            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(v["bbox"], res[i]["bbox"])
+            bbox1, bbox2, drop_reason = _remove_overlap_between_bbox(
+                v['bbox'], res[i]['bbox']
+            )
             if drop_reason is None:
-                v["bbox"] = bbox1
-                res[i]["bbox"] = bbox2
+                v['bbox'] = bbox1
+                res[i]['bbox'] = bbox2
             else:
-                if v["score"] > res[i]["score"]:
+                if v['score'] > res[i]['score']:
                     keeps[i] = False
                     res[i] = None
                 else:
@@ -74,25 +77,24 @@ def _remove_overlap_between_bboxes(arr):
 def remove_overlap_between_bbox_for_span(spans):
-    arr = [{"bbox": span["bbox"], "score": span.get("score", 0.1)} for span in spans ]
+    arr = [{'bbox': span['bbox'], 'score': span.get('score', 0.1)} for span in spans]
     res, drop_reasons = _remove_overlap_between_bboxes(arr)
     ret = []
     for i in range(len(res)):
         if res[i] is None:
             continue
-        spans[i]["bbox"] = res[i]["bbox"]
+        spans[i]['bbox'] = res[i]['bbox']
         ret.append(spans[i])
     return ret, drop_reasons
 def remove_overlap_between_bbox_for_block(all_bboxes):
-    arr = [{"bbox": bbox[:4], "score": bbox[-1]} for bbox in all_bboxes ]
+    arr = [{'bbox': bbox[:4], 'score': bbox[-1]} for bbox in all_bboxes]
     res, drop_reasons = _remove_overlap_between_bboxes(arr)
     ret = []
     for i in range(len(res)):
         if res[i] is None:
             continue
-        all_bboxes[i][:4] = res[i]["bbox"]
+        all_bboxes[i][:4] = res[i]['bbox']
         ret.append(all_bboxes[i])
     return ret, drop_reasons

magic_pdf/pre_proc/remove_colored_strip_bbox.py CHANGED Viewed

@@ -1,7 +1,8 @@
-from magic_pdf.libs.boxbase import _is_in, _is_in_or_part_overlap, calculate_overlap_area_2_minbox_area_ratio
 from loguru import logger
-from magic_pdf.libs.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
+from magic_pdf.config.drop_tag import COLOR_BG_HEADER_TXT_BLOCK
+from magic_pdf.libs.boxbase import (_is_in, _is_in_or_part_overlap,
+                                    calculate_overlap_area_2_minbox_area_ratio)
 def __area(box):
@@ -9,8 +10,7 @@ def __area(box):
 def rectangle_position_determination(rect, p_width):
-    """
-    判断矩形是否在页面中轴线附近。
+    """判断矩形是否在页面中轴线附近。
     Args:
         rect (list): 矩形坐标，格式为[x1, y1, x2, y2]。
@@ -34,9 +34,10 @@ def rectangle_position_determination(rect, p_width):
         else:
             return False
 def remove_colored_strip_textblock(remain_text_blocks, page):
-    """
-    根据页面中特定颜色和大小过滤文本块，将符合条件的文本块从remain_text_blocks中移除，并返回移除的文本块列表colored_strip_textblock。
+    """根据页面中特定颜色和大小过滤文本块，将符合条件的文本块从remain_text_blocks中移除，并返回移除的文本块列表colored_str
+    ip_textblock。
     Args:
         remain_text_blocks (list): 剩余文本块列表。
@@ -51,22 +52,44 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
         blocks = page.get_cdrawings()
         colored_strip_bg_rect = []
         for block in blocks:
-            is_filled = 'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)  # 过滤掉透明的
+            is_filled = (
+                'fill' in block and block['fill'] and block['fill'] != (1.0, 1.0, 1.0)
+            )  # 过滤掉透明的
             rect = block['rect']
             area_is_large_enough = __area(rect) > 100  # 过滤掉特别小的矩形
-            rectangle_position_determination_result = rectangle_position_determination(rect, p_width)
-            in_upper_half_page = rect[3] < p_height * 0.3  # 找到位于页面上半部分的矩形，下边界小于页面高度的30%
-            aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (rect[3] - rect[1]) * 4  # 找到长宽比超过4的矩形
+            rectangle_position_determination_result = rectangle_position_determination(
+                rect, p_width
+            )
+            in_upper_half_page = (
+                rect[3] < p_height * 0.3
+            )  # 找到位于页面上半部分的矩形，下边界小于页面高度的30%
+            aspect_ratio_exceeds_4 = (rect[2] - rect[0]) > (
+                rect[3] - rect[1]
+            ) * 4  # 找到长宽比超过4的矩形
-            if is_filled and area_is_large_enough and rectangle_position_determination_result and in_upper_half_page and aspect_ratio_exceeds_4:
+            if (
+                is_filled
+                and area_is_large_enough
+                and rectangle_position_determination_result
+                and in_upper_half_page
+                and aspect_ratio_exceeds_4
+            ):
                 colored_strip_bg_rect.append(rect)
         if len(colored_strip_bg_rect) > 0:
             for colored_strip_block_bbox in colored_strip_bg_rect:
                 for text_block in remain_text_blocks:
                     text_bbox = text_block['bbox']
-                    if _is_in(text_bbox, colored_strip_block_bbox) or (_is_in_or_part_overlap(text_bbox, colored_strip_block_bbox) and calculate_overlap_area_2_minbox_area_ratio(text_bbox, colored_strip_block_bbox) > 0.6):
-                        logger.info(f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}')
+                    if _is_in(text_bbox, colored_strip_block_bbox) or (
+                        _is_in_or_part_overlap(text_bbox, colored_strip_block_bbox)
+                        and calculate_overlap_area_2_minbox_area_ratio(
+                            text_bbox, colored_strip_block_bbox
+                        )
+                        > 0.6
+                    ):
+                        logger.info(
+                            f'remove_colored_strip_textblock: {text_bbox}, {colored_strip_block_bbox}'
+                        )
                         text_block['tag'] = COLOR_BG_HEADER_TXT_BLOCK
                         colored_strip_textblocks.append(text_block)
@@ -76,4 +99,3 @@ def remove_colored_strip_textblock(remain_text_blocks, page):
                             remain_text_blocks.remove(colored_strip_textblock)
     return remain_text_blocks, colored_strip_textblocks

magic_pdf/pre_proc/remove_footer_header.py CHANGED Viewed

@@ -1,15 +1,12 @@
 import re
+from magic_pdf.config.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
 from magic_pdf.libs.boxbase import _is_in_or_part_overlap
-from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO
 def remove_headder_footer_one_page(text_raw_blocks, image_bboxes, table_bboxes, header_bboxs, footer_bboxs,
                                    page_no_bboxs, page_w, page_h):
-    """
-    删除页眉页脚，页码
-    从line级别进行删除，删除之后观察这个text-block是否是空的，如果是空的，则移动到remove_list中
-    """
+    """删除页眉页脚，页码 从line级别进行删除，删除之后观察这个text-block是否是空的，如果是空的，则移动到remove_list中."""
     header = []
     footer = []
     if len(header) == 0:

magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl

magic-pdf 0.9.3__py3-none-any.whl → 0.10.1__py3-none-any.whl