PyPI - magic-pdf - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/filter/pdf_meta_scan.py +3 -17
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
magic_pdf/para/para_split_v3.py +7 -2
magic_pdf/pdf_parse_union_core_v2.py +97 -124
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0

magic_pdf/pdf_parse_union_core_v2.py CHANGED Viewed

@@ -5,19 +5,18 @@ import time
 from typing import List
 import torch
+import fitz
 from loguru import logger
-from magic_pdf.config.drop_reason import DropReason
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.data.dataset import Dataset, PageableData
 from magic_pdf.libs.boxbase import calculate_overlap_area_in_bbox1_area_ratio
 from magic_pdf.libs.clean_memory import clean_memory
-from magic_pdf.libs.commons import fitz, get_delta_time
 from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
-from magic_pdf.libs.local_math import float_equal
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
 from magic_pdf.model.magic_model import MagicModel
@@ -34,13 +33,11 @@ except ImportError:
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.para.para_split_v3 import para_split
-from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker
 from magic_pdf.pre_proc.construct_page_dict import \
     ocr_construct_page_component_v2
 from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
-from magic_pdf.pre_proc.equations_replace import (
-    combine_chars_to_pymudict, remove_chars_in_text_blocks,
-    replace_equations_in_textblock)
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import \
     ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
@@ -49,26 +46,6 @@ from magic_pdf.pre_proc.ocr_dict_merge import (fill_spans_in_blocks,
 from magic_pdf.pre_proc.ocr_span_list_modify import (
     get_qa_need_list_v2, remove_overlaps_low_confidence_spans,
     remove_overlaps_min_spans)
-from magic_pdf.pre_proc.resolve_bbox_conflict import \
-    check_useful_block_horizontal_overlap
-def remove_horizontal_overlap_block_which_smaller(all_bboxes):
-    useful_blocks = []
-    for bbox in all_bboxes:
-        useful_blocks.append({'bbox': bbox[:4]})
-    is_useful_block_horz_overlap, smaller_bbox, bigger_bbox = (
-        check_useful_block_horizontal_overlap(useful_blocks)
-    )
-    if is_useful_block_horz_overlap:
-        logger.warning(
-            f'skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}, smaller bbox is {smaller_bbox}, bigger bbox is {bigger_bbox}'
-        )  # noqa: E501
-        for bbox in all_bboxes.copy():
-            if smaller_bbox == bbox[:4]:
-                all_bboxes.remove(bbox)
-    return is_useful_block_horz_overlap, all_bboxes
 def __replace_STX_ETX(text_str: str):
@@ -89,29 +66,26 @@ def __replace_STX_ETX(text_str: str):
 def chars_to_content(span):
-        # # 先给chars按char['bbox']的x坐标排序
-        # span['chars'] = sorted(span['chars'], key=lambda x: x['bbox'][0])
+    # 检查span中的char是否为空
+    if len(span['chars']) == 0:
+        span['content'] = ''
+    else:
         # 先给chars按char['bbox']的中心点的x坐标排序
         span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
-        content = ''
         # 求char的平均宽度
-        if len(span['chars']) == 0:
-            span['content'] = content
-            del span['chars']
-            return
-        else:
-            char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
-            char_avg_width = char_width_sum / len(span['chars'])
+        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
+        char_avg_width = char_width_sum / len(span['chars'])
+        content = ''
         for char in span['chars']:
             # 如果下一个char的x0和上一个char的x1距离超过一个字符宽度，则需要在中间插入一个空格
             if char['bbox'][0] - span['chars'][span['chars'].index(char) - 1]['bbox'][2] > char_avg_width:
                 content += ' '
             content += char['c']
         span['content'] = __replace_STX_ETX(content)
-        del span['chars']
+    del span['chars']
 LINE_STOP_FLAG = ('.', '!', '?', '。', '！', '？', ')', '）', '"', '”', ':', '：', ';', '；', ']', '】', '}', '}', '>', '》', '、', ',', '，', '-', '—', '–',)
@@ -128,8 +102,13 @@ def fill_char_in_spans(spans, all_chars):
                 span['chars'].append(char)
                 break
+    empty_spans = []
     for span in spans:
         chars_to_content(span)
+        if len(span['content']) == 0:
+            empty_spans.append(span)
+    return empty_spans
 # 使用鲁棒性更强的中心点坐标判断
@@ -162,48 +141,79 @@ def calculate_char_in_span(char_bbox, span_bbox, char_is_line_stop_flag):
 def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
+    text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    # @todo: 拿到char之后把倾斜角度较大的先删一遍
+    all_pymu_chars = []
+    for block in text_blocks_raw:
+        for line in block['lines']:
+            for span in line['spans']:
+                all_pymu_chars.extend(span['chars'])
+    # 计算所有sapn的高度的中位数
+    span_height_list = []
+    for span in spans:
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            continue
+        span_height = span['bbox'][3] - span['bbox'][1]
+        span['height'] = span_height
+        span_height_list.append(span_height)
+    if len(span_height_list) == 0:
+        return spans
+    else:
+        median_span_height = statistics.median(span_height_list)
     useful_spans = []
     unuseful_spans = []
+    # 纵向span的两个特征：1. 高度超过多个line 2. 高宽比超过某个值
+    vertical_spans = []
     for span in spans:
-        for block in all_bboxes:
+        if span['type'] in [ContentType.InterlineEquation, ContentType.Image, ContentType.Table]:
+            continue
+        for block in all_bboxes + all_discarded_blocks:
             if block[7] in [BlockType.ImageBody, BlockType.TableBody, BlockType.InterlineEquation]:
                 continue
-            else:
-                if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
-                    useful_spans.append(span)
-                    break
-        for block in all_discarded_blocks:
             if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], block[0:4]) > 0.5:
-                unuseful_spans.append(span)
+                if span['height'] > median_span_height * 3 and span['height'] > (span['bbox'][2] - span['bbox'][0]) * 3:
+                    vertical_spans.append(span)
+                elif block in all_bboxes:
+                    useful_spans.append(span)
+                else:
+                    unuseful_spans.append(span)
+                del span['height']
                 break
-    text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+    """垂直的span框直接用pymu的line进行填充"""
+    if len(vertical_spans) > 0:
+        text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
+        all_pymu_lines = []
+        for block in text_blocks:
+            for line in block['lines']:
+                all_pymu_lines.append(line)
-    # @todo: 拿到char之后把倾斜角度较大的先删一遍
-    all_pymu_chars = []
-    for block in text_blocks:
-        for line in block['lines']:
-            for span in line['spans']:
-                all_pymu_chars.extend(span['chars'])
+        for pymu_line in all_pymu_lines:
+            for span in vertical_spans:
+                if calculate_overlap_area_in_bbox1_area_ratio(pymu_line['bbox'], span['bbox']) > 0.5:
+                    for pymu_span in pymu_line['spans']:
+                        span['content'] += pymu_span['text']
+                    break
-    new_spans = []
+        for span in vertical_spans:
+            if len(span['content']) == 0:
+                spans.remove(span)
-    for span in useful_spans:
-        if span['type'] in [ContentType.Text]:
-            span['chars'] = []
-            new_spans.append(span)
+    """水平的span框如果没有char则用ocr进行填充"""
+    new_spans = []
-    for span in unuseful_spans:
+    for span in useful_spans + unuseful_spans:
         if span['type'] in [ContentType.Text]:
             span['chars'] = []
             new_spans.append(span)
-    fill_char_in_spans(new_spans, all_pymu_chars)
+    empty_spans = fill_char_in_spans(new_spans, all_pymu_chars)
-    empty_spans = []
-    for span in new_spans:
-        if len(span['content']) == 0:
-            empty_spans.append(span)
     if len(empty_spans) > 0:
         # 初始化ocr模型
@@ -216,55 +226,21 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
         )
         for span in empty_spans:
-            spans.remove(span)
-            # 对span的bbox截图
+            # 对span的bbox截图再ocr
             span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode="cv2")
             ocr_res = ocr_model.ocr(span_img, det=False)
-            # logger.info(f"ocr_res: {ocr_res}")
-            # logger.info(f"empty_span: {span}")
             if ocr_res and len(ocr_res) > 0:
                 if len(ocr_res[0]) > 0:
                     ocr_text, ocr_score = ocr_res[0][0]
                     if ocr_score > 0.5 and len(ocr_text) > 0:
-                            span['content'] = ocr_text
-                            spans.append(span)
+                        span['content'] = ocr_text
+                        span['score'] = ocr_score
+                    else:
+                        spans.remove(span)
     return spans
-def txt_spans_extract_v1(pdf_page, inline_equations, interline_equations):
-    text_raw_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
-    char_level_text_blocks = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)[
-        'blocks'
-    ]
-    text_blocks = combine_chars_to_pymudict(text_raw_blocks, char_level_text_blocks)
-    text_blocks = replace_equations_in_textblock(
-        text_blocks, inline_equations, interline_equations
-    )
-    text_blocks = remove_citation_marker(text_blocks)
-    text_blocks = remove_chars_in_text_blocks(text_blocks)
-    spans = []
-    for v in text_blocks:
-        for line in v['lines']:
-            for span in line['spans']:
-                bbox = span['bbox']
-                if float_equal(bbox[0], bbox[2]) or float_equal(bbox[1], bbox[3]):
-                    continue
-                if span.get('type') not in (
-                    ContentType.InlineEquation,
-                    ContentType.InterlineEquation,
-                ):
-                    spans.append(
-                        {
-                            'bbox': list(span['bbox']),
-                            'content': __replace_STX_ETX(span['text']),
-                            'type': ContentType.Text,
-                            'score': 1.0,
-                        }
-                    )
-    return spans
 def replace_text_span(pymu_spans, ocr_spans):
     return list(filter(lambda x: x['type'] != ContentType.Text, ocr_spans)) + pymu_spans
@@ -682,6 +658,23 @@ def parse_page_core(
     """顺便删除大水印并保留abandon的span"""
     spans = remove_outside_spans(spans, all_bboxes, all_discarded_blocks)
+    """删除重叠spans中置信度较低的那些"""
+    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
+    """删除重叠spans中较小的那些"""
+    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
+    """根据parse_mode，构造spans，主要是文本类的字符填充"""
+    if parse_mode == SupportedPdfParseMethod.TXT:
+        """使用新版本的混合ocr方案"""
+        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
+    elif parse_mode == SupportedPdfParseMethod.OCR:
+        pass
+    else:
+        raise Exception('parse_mode must be txt or ocr')
     """先处理不需要排版的discarded_blocks"""
     discarded_block_with_spans, spans = fill_spans_in_blocks(
         all_discarded_blocks, spans, 0.4
@@ -706,26 +699,6 @@ def parse_page_core(
             drop_reason,
         )
-    """删除重叠spans中置信度较低的那些"""
-    spans, dropped_spans_by_confidence = remove_overlaps_low_confidence_spans(spans)
-    """删除重叠spans中较小的那些"""
-    spans, dropped_spans_by_span_overlap = remove_overlaps_min_spans(spans)
-    """根据parse_mode，构造spans，主要是文本类的字符填充"""
-    if parse_mode == SupportedPdfParseMethod.TXT:
-        """之前的公式替换方案"""
-        # pymu_spans = txt_spans_extract_v1(page_doc, inline_equations, interline_equations)
-        # spans = replace_text_span(pymu_spans, spans)
-        """ocr 中文本类的 span 用 pymu spans 替换！"""
-        spans = txt_spans_extract_v2(page_doc, spans, all_bboxes, all_discarded_blocks, lang)
-    elif parse_mode == SupportedPdfParseMethod.OCR:
-        pass
-    else:
-        raise Exception('parse_mode must be txt or ocr')
     """对image和table截图"""
     spans = ocr_cut_image_and_table(
         spans, page_doc, page_id, pdf_bytes_md5, imageWriter
@@ -811,7 +784,7 @@ def pdf_parse_union(
         if debug_mode:
             time_now = time.time()
             logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}'
+                f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
             )
             start_time = time_now

magic_pdf/pre_proc/construct_page_dict.py CHANGED Viewed

@@ -1,58 +1,3 @@
-def construct_page_component(page_id, image_info, table_info, text_blocks_preproc, layout_bboxes, inline_eq_info,
-                             interline_eq_info, raw_pymu_blocks,
-                             removed_text_blocks, removed_image_blocks, images_backup, droped_table_block, table_backup,
-                             layout_tree,
-                             page_w, page_h, footnote_bboxes_tmp):
-    """
-    """
-    return_dict = {}
-    return_dict['para_blocks'] = {}
-    return_dict['preproc_blocks'] = text_blocks_preproc
-    return_dict['images'] = image_info
-    return_dict['tables'] = table_info
-    return_dict['interline_equations'] = interline_eq_info
-    return_dict['inline_equations'] = inline_eq_info
-    return_dict['layout_bboxes'] = layout_bboxes
-    return_dict['pymu_raw_blocks'] = raw_pymu_blocks
-    return_dict['global_statistic'] = {}
-    return_dict['droped_text_block'] = removed_text_blocks
-    return_dict['droped_image_block'] = removed_image_blocks
-    return_dict['droped_table_block'] = []
-    return_dict['image_backup'] = images_backup
-    return_dict['table_backup'] = []
-    return_dict['page_idx'] = page_id
-    return_dict['page_size'] = [page_w, page_h]
-    return_dict['_layout_tree'] = layout_tree  # 辅助分析layout作用
-    return_dict['footnote_bboxes_tmp'] = footnote_bboxes_tmp
-    return return_dict
-def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
-                                 images, tables, interline_equations, inline_equations,
-                                 dropped_text_block, dropped_image_block, dropped_table_block, dropped_equation_block,
-                                 need_remove_spans_bboxes_dict):
-    return_dict = {
-        'preproc_blocks': blocks,
-        'layout_bboxes': layout_bboxes,
-        'page_idx': page_id,
-        'page_size': [page_w, page_h],
-        '_layout_tree': layout_tree,
-        'images': images,
-        'tables': tables,
-        'interline_equations': interline_equations,
-        'inline_equations': inline_equations,
-        'droped_text_block': dropped_text_block,
-        'droped_image_block': dropped_image_block,
-        'droped_table_block': dropped_table_block,
-        'dropped_equation_block': dropped_equation_block,
-        'droped_bboxes': need_remove_spans_bboxes_dict,
-    }
-    return return_dict
 def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
                                     images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):

magic_pdf/pre_proc/cut_image.py CHANGED Viewed

@@ -25,43 +25,6 @@ def ocr_cut_image_and_table(spans, page, page_id, pdf_bytes_md5, imageWriter):
     return spans
-def txt_save_images_by_bboxes(page_num: int, page, pdf_bytes_md5: str,
-                              image_bboxes: list, images_overlap_backup: list, table_bboxes: list,
-                              equation_inline_bboxes: list,
-                              equation_interline_bboxes: list, imageWriter) -> dict:
-    """返回一个dict, key为bbox, 值是图片地址."""
-    image_info = []
-    image_backup_info = []
-    table_info = []
-    inline_eq_info = []
-    interline_eq_info = []
-    # 图片的保存路径组成是这样的： {s3_or_local_path}/{book_name}/{images|tables|equations}/{page_num}_{bbox[0]}_{bbox[1]}_{bbox[2]}_{bbox[3]}.jpg
-    def return_path(type):
-        return join_path(pdf_bytes_md5, type)
-    for bbox in image_bboxes:
-        if not check_img_bbox(bbox):
-            continue
-        image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
-        image_info.append({'bbox': bbox, 'image_path': image_path})
-    for bbox in images_overlap_backup:
-        if not check_img_bbox(bbox):
-            continue
-        image_path = cut_image(bbox, page_num, page, return_path('images'), imageWriter)
-        image_backup_info.append({'bbox': bbox, 'image_path': image_path})
-    for bbox in table_bboxes:
-        if not check_img_bbox(bbox):
-            continue
-        image_path = cut_image(bbox, page_num, page, return_path('tables'), imageWriter)
-        table_info.append({'bbox': bbox, 'image_path': image_path})
-    return image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info
 def check_img_bbox(bbox) -> bool:
     if any([bbox[0] >= bbox[2], bbox[1] >= bbox[3]]):
         logger.warning(f'image_bboxes: 错误的box, {bbox}')

magic_pdf/pre_proc/ocr_detect_all_bboxes.py CHANGED Viewed

@@ -1,184 +1,11 @@
 from magic_pdf.config.ocr_content_type import BlockType
 from magic_pdf.libs.boxbase import (
-    calculate_iou, calculate_overlap_area_in_bbox1_area_ratio,
+    calculate_iou,
+    calculate_overlap_area_in_bbox1_area_ratio,
     calculate_vertical_projection_overlap_ratio,
-    get_minbox_if_overlap_by_ratio)
-from magic_pdf.pre_proc.remove_bbox_overlap import \
-    remove_overlap_between_bbox_for_block
-def ocr_prepare_bboxes_for_layout_split(
-    img_blocks,
-    table_blocks,
-    discarded_blocks,
-    text_blocks,
-    title_blocks,
-    interline_equation_blocks,
-    page_w,
-    page_h,
-):
-    all_bboxes = []
-    all_discarded_blocks = []
-    for image in img_blocks:
-        x0, y0, x1, y1 = image['bbox']
-        all_bboxes.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.Image,
-                None,
-                None,
-                None,
-                None,
-                image['score'],
-            ]
-        )
-    for table in table_blocks:
-        x0, y0, x1, y1 = table['bbox']
-        all_bboxes.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.Table,
-                None,
-                None,
-                None,
-                None,
-                table['score'],
-            ]
-        )
-    for text in text_blocks:
-        x0, y0, x1, y1 = text['bbox']
-        all_bboxes.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.Text,
-                None,
-                None,
-                None,
-                None,
-                text['score'],
-            ]
-        )
-    for title in title_blocks:
-        x0, y0, x1, y1 = title['bbox']
-        all_bboxes.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.Title,
-                None,
-                None,
-                None,
-                None,
-                title['score'],
-            ]
-        )
-    for interline_equation in interline_equation_blocks:
-        x0, y0, x1, y1 = interline_equation['bbox']
-        all_bboxes.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.InterlineEquation,
-                None,
-                None,
-                None,
-                None,
-                interline_equation['score'],
-            ]
-        )
-    """block嵌套问题解决"""
-    """文本框与标题框重叠，优先信任文本框"""
-    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
-    """任何框体与舍弃框重叠，优先信任舍弃框"""
-    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
-    # interline_equation 与title或text框冲突的情况，分两种情况处理
-    """interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框"""
-    all_bboxes = fix_interline_equation_overlap_text_blocks_with_hi_iou(all_bboxes)
-    """interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框"""
-    # 通过后续大框套小框逻辑删除
-    """discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）"""
-    for discarded in discarded_blocks:
-        x0, y0, x1, y1 = discarded['bbox']
-        all_discarded_blocks.append(
-            [
-                x0,
-                y0,
-                x1,
-                y1,
-                None,
-                None,
-                None,
-                BlockType.Discarded,
-                None,
-                None,
-                None,
-                None,
-                discarded['score'],
-            ]
-        )
-        # 将footnote加入到all_bboxes中，用来计算layout
-        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
-            all_bboxes.append(
-                [
-                    x0,
-                    y0,
-                    x1,
-                    y1,
-                    None,
-                    None,
-                    None,
-                    BlockType.Footnote,
-                    None,
-                    None,
-                    None,
-                    None,
-                    discarded['score'],
-                ]
-            )
-    """经过以上处理后，还存在大框套小框的情况，则删除小框"""
-    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
-    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
-    """将剩余的bbox做分离处理，防止后面分layout时出错"""
-    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
-    return all_bboxes, all_discarded_blocks, drop_reasons
+    get_minbox_if_overlap_by_ratio
+)
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
 def add_bboxes(blocks, block_type, bboxes):

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl