PyPI - magic-pdf - Versions diffs - 0.5.4__py3-none-any.whl - Mend

magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

magic_pdf/__init__.py +0 -0
magic_pdf/cli/__init__.py +0 -0
magic_pdf/cli/magicpdf.py +294 -0
magic_pdf/dict2md/__init__.py +0 -0
magic_pdf/dict2md/mkcontent.py +397 -0
magic_pdf/dict2md/ocr_mkcontent.py +356 -0
magic_pdf/filter/__init__.py +0 -0
magic_pdf/filter/pdf_classify_by_type.py +381 -0
magic_pdf/filter/pdf_meta_scan.py +368 -0
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +681 -0
magic_pdf/layout/layout_det_utils.py +182 -0
magic_pdf/layout/layout_sort.py +732 -0
magic_pdf/layout/layout_spiler_recog.py +101 -0
magic_pdf/layout/mcol_sort.py +336 -0
magic_pdf/libs/Constants.py +11 -0
magic_pdf/libs/MakeContentConfig.py +10 -0
magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
magic_pdf/libs/__init__.py +0 -0
magic_pdf/libs/boxbase.py +408 -0
magic_pdf/libs/calc_span_stats.py +239 -0
magic_pdf/libs/commons.py +204 -0
magic_pdf/libs/config_reader.py +63 -0
magic_pdf/libs/convert_utils.py +5 -0
magic_pdf/libs/coordinate_transform.py +9 -0
magic_pdf/libs/detect_language_from_model.py +21 -0
magic_pdf/libs/draw_bbox.py +227 -0
magic_pdf/libs/drop_reason.py +27 -0
magic_pdf/libs/drop_tag.py +19 -0
magic_pdf/libs/hash_utils.py +15 -0
magic_pdf/libs/json_compressor.py +27 -0
magic_pdf/libs/language.py +31 -0
magic_pdf/libs/markdown_utils.py +31 -0
magic_pdf/libs/math.py +9 -0
magic_pdf/libs/nlp_utils.py +203 -0
magic_pdf/libs/ocr_content_type.py +21 -0
magic_pdf/libs/path_utils.py +23 -0
magic_pdf/libs/pdf_image_tools.py +33 -0
magic_pdf/libs/safe_filename.py +11 -0
magic_pdf/libs/textbase.py +33 -0
magic_pdf/libs/version.py +1 -0
magic_pdf/libs/vis_utils.py +308 -0
magic_pdf/model/__init__.py +0 -0
magic_pdf/model/doc_analyze_by_360layout.py +8 -0
magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
magic_pdf/model/magic_model.py +632 -0
magic_pdf/para/__init__.py +0 -0
magic_pdf/para/block_continuation_processor.py +562 -0
magic_pdf/para/block_termination_processor.py +480 -0
magic_pdf/para/commons.py +222 -0
magic_pdf/para/denoise.py +246 -0
magic_pdf/para/draw.py +121 -0
magic_pdf/para/exceptions.py +198 -0
magic_pdf/para/layout_match_processor.py +40 -0
magic_pdf/para/para_pipeline.py +297 -0
magic_pdf/para/para_split.py +644 -0
magic_pdf/para/para_split_v2.py +772 -0
magic_pdf/para/raw_processor.py +207 -0
magic_pdf/para/stats.py +268 -0
magic_pdf/para/title_processor.py +1014 -0
magic_pdf/pdf_parse_by_ocr.py +219 -0
magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
magic_pdf/pdf_parse_by_txt.py +410 -0
magic_pdf/pdf_parse_by_txt_v2.py +56 -0
magic_pdf/pdf_parse_for_train.py +685 -0
magic_pdf/pdf_parse_union_core.py +241 -0
magic_pdf/pipe/AbsPipe.py +112 -0
magic_pdf/pipe/OCRPipe.py +28 -0
magic_pdf/pipe/TXTPipe.py +29 -0
magic_pdf/pipe/UNIPipe.py +83 -0
magic_pdf/pipe/__init__.py +0 -0
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +3472 -0
magic_pdf/post_proc/pdf_post_filter.py +67 -0
magic_pdf/post_proc/remove_footnote.py +153 -0
magic_pdf/pre_proc/__init__.py +0 -0
magic_pdf/pre_proc/citationmarker_remove.py +157 -0
magic_pdf/pre_proc/construct_page_dict.py +72 -0
magic_pdf/pre_proc/cut_image.py +71 -0
magic_pdf/pre_proc/detect_equation.py +134 -0
magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
magic_pdf/pre_proc/detect_footnote.py +170 -0
magic_pdf/pre_proc/detect_header.py +64 -0
magic_pdf/pre_proc/detect_images.py +647 -0
magic_pdf/pre_proc/detect_page_number.py +64 -0
magic_pdf/pre_proc/detect_tables.py +62 -0
magic_pdf/pre_proc/equations_replace.py +559 -0
magic_pdf/pre_proc/fix_image.py +244 -0
magic_pdf/pre_proc/fix_table.py +270 -0
magic_pdf/pre_proc/main_text_font.py +23 -0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
magic_pdf/pre_proc/remove_footer_header.py +117 -0
magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
magic_pdf/pre_proc/solve_line_alien.py +29 -0
magic_pdf/pre_proc/statistics.py +12 -0
magic_pdf/rw/AbsReaderWriter.py +34 -0
magic_pdf/rw/DiskReaderWriter.py +66 -0
magic_pdf/rw/S3ReaderWriter.py +107 -0
magic_pdf/rw/__init__.py +0 -0
magic_pdf/spark/__init__.py +0 -0
magic_pdf/spark/spark_api.py +51 -0
magic_pdf/train_utils/__init__.py +0 -0
magic_pdf/train_utils/convert_to_train_format.py +65 -0
magic_pdf/train_utils/extract_caption.py +59 -0
magic_pdf/train_utils/remove_footer_header.py +159 -0
magic_pdf/train_utils/vis_utils.py +327 -0
magic_pdf/user_api.py +136 -0
magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
magic_pdf-0.5.4.dist-info/METADATA +24 -0
magic_pdf-0.5.4.dist-info/RECORD +121 -0
magic_pdf-0.5.4.dist-info/WHEEL +5 -0
magic_pdf-0.5.4.dist-info/top_level.txt +1 -0

magic_pdf/pre_proc/fix_image.py ADDED Viewed

@@ -0,0 +1,244 @@
+import re
+from magic_pdf.libs.boxbase import  _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
+from magic_pdf.libs.textbase import get_text_block_base_info
+def fix_image_vertical(image_bboxes:list, text_blocks:list):
+    """
+    修正图片的位置
+    如果图片与文字block发生一定重叠（也就是图片切到了一部分文字），那么减少图片边缘，让文字和图片不再重叠。
+    只对垂直方向进行。
+    """
+    for image_bbox in image_bboxes:
+        for text_block in text_blocks:
+            text_bbox = text_block["bbox"]
+            if _is_part_overlap(text_bbox, image_bbox) and any([text_bbox[0]>=image_bbox[0] and text_bbox[2]<=image_bbox[2], text_bbox[0]<=image_bbox[0] and text_bbox[2]>=image_bbox[2]]):
+                if text_bbox[1] < image_bbox[1]:#在图片上方
+                    image_bbox[1] = text_bbox[3]+1
+                elif text_bbox[3]>image_bbox[3]:#在图片下方
+                    image_bbox[3] = text_bbox[1]-1
+    return image_bboxes
+def __merge_if_common_edge(bbox1, bbox2):
+    x_min_1, y_min_1, x_max_1, y_max_1 = bbox1
+    x_min_2, y_min_2, x_max_2, y_max_2 = bbox2
+    # 检查是否有公共的水平边
+    if y_min_1 == y_min_2 or y_max_1 == y_max_2:
+        # 确保一个框的x范围在另一个框的x范围内
+        if max(x_min_1, x_min_2) <= min(x_max_1, x_max_2):
+            return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
+    # 检查是否有公共的垂直边
+    if x_min_1 == x_min_2 or x_max_1 == x_max_2:
+        # 确保一个框的y范围在另一个框的y范围内
+        if max(y_min_1, y_min_2) <= min(y_max_1, y_max_2):
+            return [min(x_min_1, x_min_2), min(y_min_1, y_min_2), max(x_max_1, x_max_2), max(y_max_1, y_max_2)]
+    # 如果没有公共边
+    return None
+def fix_seperated_image(image_bboxes:list):
+    """
+    如果2个图片有一个边重叠，那么合并2个图片
+    """
+    new_images = []
+    droped_img_idx = []
+    for i in range(0, len(image_bboxes)):
+        for j in range(i+1, len(image_bboxes)):
+            new_img = __merge_if_common_edge(image_bboxes[i], image_bboxes[j])
+            if new_img is not None:
+                new_images.append(new_img)
+                droped_img_idx.append(i)
+                droped_img_idx.append(j)
+                break
+    for i in range(0, len(image_bboxes)):
+        if i not in droped_img_idx:
+            new_images.append(image_bboxes[i])
+    return new_images
+def __check_img_title_pattern(text):
+    """
+    检查文本段是否是表格的标题
+    """
+    patterns = [r"^(fig|figure).*", r"^(scheme).*"]
+    text = text.strip()
+    for pattern in patterns:
+        match = re.match(pattern, text, re.IGNORECASE)
+        if match:
+            return True
+    return False
+def __get_fig_caption_text(text_block):
+    txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
+    line_cnt = len(text_block['lines'])
+    txt = txt.replace("Ž . ", '')
+    return txt, line_cnt
+def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
+    """
+    继续向下方寻找和图片caption字号，字体，颜色一样的文字框，合并入caption。
+    text_block是已经找到的图片catpion（这个caption可能不全，多行被划分到多个pymu block里了）
+    """
+    combined_image_caption_text_block = list(text_block.copy()['bbox'])
+    base_font_color, base_font_size, base_font_type = get_text_block_base_info(text_block)
+    while True:
+        tb_add = find_bottom_nearest_text_bbox(pymu_blocks, combined_image_caption_text_block)
+        if not tb_add:
+            break
+        tb_font_color, tb_font_size, tb_font_type = get_text_block_base_info(tb_add)
+        if tb_font_color==base_font_color and tb_font_size==base_font_size and tb_font_type==base_font_type:
+            combined_image_caption_text_block[0] = min(combined_image_caption_text_block[0], tb_add['bbox'][0])
+            combined_image_caption_text_block[2] = max(combined_image_caption_text_block[2], tb_add['bbox'][2])
+            combined_image_caption_text_block[3] = tb_add['bbox'][3]
+        else:
+            break
+    image_box[0] = min(image_box[0], combined_image_caption_text_block[0])
+    image_box[1] = min(image_box[1], combined_image_caption_text_block[1])
+    image_box[2] = max(image_box[2], combined_image_caption_text_block[2])
+    image_box[3] = max(image_box[3], combined_image_caption_text_block[3])
+    text_block['_image_caption'] = True
+def include_img_title(pymu_blocks, image_bboxes: list):
+    """
+    向上方和下方寻找符合图片title的文本block，合并到图片里
+    如果图片上下都有fig的情况怎么办？寻找标题距离最近的那个。
+    ---
+    增加对左侧和右侧图片标题的寻找
+    """
+    for tb in image_bboxes:
+        # 优先找下方的
+        max_find_cnt = 3 # 向上，向下最多找3个就停止
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
+            if text_block_btn:
+                txt, line_cnt = __get_fig_caption_text(text_block_btn)
+                if len(txt.strip())>0:
+                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3: # 设置line_cnt<=2目的是为了跳过子标题，或者有时候图片下方文字没有被图片识别模型放入图片里
+                        max_find_cnt = max_find_cnt - 1
+                        temp_box[3] = text_block_btn['bbox'][3]
+                        continue
+                    else:
+                        break
+                else:
+                    temp_box[3] = text_block_btn['bbox'][3] # 宽度不变，扩大
+                    max_find_cnt = max_find_cnt - 1
+            else:
+                break
+        max_find_cnt = 3 # 向上，向下最多找3个就停止
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
+            if text_block_top:
+                txt, line_cnt = __get_fig_caption_text(text_block_top)
+                if len(txt.strip())>0:
+                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
+                        max_find_cnt = max_find_cnt - 1
+                        temp_box[1] = text_block_top['bbox'][1]
+                        continue
+                    else:
+                        break
+                else:
+                    b = text_block_top['bbox']
+                    temp_box[1] = b[1] # 宽度不变，扩大
+                    max_find_cnt = max_find_cnt - 1
+            else:
+                break
+        if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
+            btn_text, _ = __get_fig_caption_text(text_block_btn)
+            top_text, _ = __get_fig_caption_text(text_block_top)
+            if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
+                # 取距离图片最近的
+                btn_text_distance = text_block_btn['bbox'][1] - tb[3]
+                top_text_distance = tb[1] - text_block_top['bbox'][3]
+                if btn_text_distance<top_text_distance: # caption在下方
+                    __find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
+                else:
+                    text_block = text_block_top
+                    tb[0] = min(tb[0], text_block['bbox'][0])
+                    tb[1] = min(tb[1], text_block['bbox'][1])
+                    tb[2] = max(tb[2], text_block['bbox'][2])
+                    tb[3] = max(tb[3], text_block['bbox'][3])
+                    text_block_btn['_image_caption'] = True
+                continue
+        text_block = text_block_btn # find_bottom_nearest_text_bbox(pymu_blocks, tb)
+        if text_block and text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(text_block)
+            if __check_img_title_pattern(first_text_line):
+                # 发现特征之后，继续向相同方向寻找（想同颜色，想同大小，想同字体）的textblock
+                __find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
+                continue
+        text_block = text_block_top # find_top_nearest_text_bbox(pymu_blocks, tb)
+        if text_block  and text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(text_block)
+            if __check_img_title_pattern(first_text_line):
+                tb[0] = min(tb[0], text_block['bbox'][0])
+                tb[1] = min(tb[1], text_block['bbox'][1])
+                tb[2] = max(tb[2], text_block['bbox'][2])
+                tb[3] = max(tb[3], text_block['bbox'][3])
+                text_block['_image_caption'] = True
+                continue
+        """向左、向右寻找，暂时只寻找一次"""
+        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
+        if left_text_block and left_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(left_text_block)
+            if __check_img_title_pattern(first_text_line):
+                tb[0] = min(tb[0], left_text_block['bbox'][0])
+                tb[1] = min(tb[1], left_text_block['bbox'][1])
+                tb[2] = max(tb[2], left_text_block['bbox'][2])
+                tb[3] = max(tb[3], left_text_block['bbox'][3])
+                left_text_block['_image_caption'] = True
+                continue
+        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
+        if right_text_block and right_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_fig_caption_text(right_text_block)
+            if __check_img_title_pattern(first_text_line):
+                tb[0] = min(tb[0], right_text_block['bbox'][0])
+                tb[1] = min(tb[1], right_text_block['bbox'][1])
+                tb[2] = max(tb[2], right_text_block['bbox'][2])
+                tb[3] = max(tb[3], right_text_block['bbox'][3])
+                right_text_block['_image_caption'] = True
+                continue
+    return image_bboxes
+def combine_images(image_bboxes:list):
+    """
+    合并图片，如果图片有重叠，那么合并
+    """
+    new_images = []
+    droped_img_idx = []
+    for i in range(0, len(image_bboxes)):
+        for j in range(i+1, len(image_bboxes)):
+            if j not in droped_img_idx and _is_in_or_part_overlap(image_bboxes[i], image_bboxes[j]):
+                # 合并
+                image_bboxes[i][0], image_bboxes[i][1],image_bboxes[i][2],image_bboxes[i][3] = min(image_bboxes[i][0], image_bboxes[j][0]), min(image_bboxes[i][1], image_bboxes[j][1]), max(image_bboxes[i][2], image_bboxes[j][2]), max(image_bboxes[i][3], image_bboxes[j][3])
+                droped_img_idx.append(j)
+    for i in range(0, len(image_bboxes)):
+        if i not in droped_img_idx:
+            new_images.append(image_bboxes[i])
+    return new_images

magic_pdf/pre_proc/fix_table.py ADDED Viewed

@@ -0,0 +1,270 @@
+from magic_pdf.libs.commons import fitz             # pyMuPDF库
+import re
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json
+## version 2
+def get_merged_line(page):
+    """
+    这个函数是为了从pymuPDF中提取出的矢量里筛出水平的横线，并且将断开的线段进行了合并。
+    :param page :fitz读取的当前页的内容
+    """
+    drawings_bbox = []
+    drawings_line = []
+    drawings = page.get_drawings()  # 提取所有的矢量
+    for p in drawings:
+        drawings_bbox.append(p["rect"].irect)  # (L, U, R, D)
+    lines = []
+    for L, U, R, D in drawings_bbox:
+        if abs(D - U) <= 3: # 筛出水平的横线
+            lines.append((L, U, R, D))
+    U_groups = []
+    visited = [False for _ in range(len(lines))]
+    for i, (L1, U1, R1, D1) in enumerate(lines):
+        if visited[i] == True:
+            continue
+        tmp_g = [(L1, U1, R1, D1)]
+        for j, (L2, U2, R2, D2) in enumerate(lines):
+            if i == j:
+                continue
+            if visited[j] == True:
+                continue
+            if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5:   # 把高度一致的线放进一个group
+                tmp_g.append((L2, U2, R2, D2))
+                visited[j] = True
+        U_groups.append(tmp_g)
+    res = []
+    for group in U_groups:
+        group.sort(key = lambda LURD: (LURD[0], LURD[2]))
+        LL, UU, RR, DD = group[0]
+        for i, (L1, U1, R1, D1) in enumerate(group):
+            if (L1 - RR) >= 5:
+                cur_line = (LL, UU, RR, DD)
+                res.append(cur_line)
+                LL = L1
+            else:
+                RR = max(RR, R1)
+        cur_line = (LL, UU, RR, DD)
+        res.append(cur_line)
+    return res
+def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
+    """
+    :param page :fitz读取的当前页的内容
+    :param table_bboxes: list类型，每一个元素是一个元祖 (L, U, R, D)
+    :param include_table_title: 是否将表格的标题也圈进来
+    :param scan_line_num: 在与表格框临近的上下几个文本框里扫描搜索标题
+    """
+    drawings_lines = get_merged_line(page)
+    fix_table_bboxes = []
+    for table in table_bboxes:
+        (L, U, R, D) = table
+        fix_table_L = []
+        fix_table_U = []
+        fix_table_R = []
+        fix_table_D = []
+        width = R - L
+        width_range = width * 0.1 # 只看距离表格整体宽度10%之内偏差的线
+        height = D - U
+        height_range = height * 0.1 # 只看距离表格整体高度10%之内偏差的线
+        for line in drawings_lines:
+            if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # 相近的宽度
+                if (U - height_range) < line[1] < (U + height_range): # 上边界，在一定的高度范围内
+                    fix_table_U.append(line[1])
+                    fix_table_L.append(line[0])
+                    fix_table_R.append(line[2])
+                elif (D - height_range) < line[1] < (D + height_range): # 下边界，在一定的高度范围内
+                    fix_table_D.append(line[1])
+                    fix_table_L.append(line[0])
+                    fix_table_R.append(line[2])
+        if fix_table_U:
+            U = min(fix_table_U)
+        if fix_table_D:
+            D = max(fix_table_D)
+        if fix_table_L:
+            L = min(fix_table_L)
+        if fix_table_R:
+            R = max(fix_table_R)
+        if include_table_title:   # 需要将表格标题包括
+            text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]   # 所有的text的block
+            incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))]  # 将与表格完全没有任何遮挡的文字筛除掉（比如另一栏的文字）
+            upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0]  # 将在表格线以上的text block筛选出来
+            sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # 按照text block的下边界距离表格上边界的距离升序排序，如果是同一个高度，则先左再右
+            for idx in range(scan_line_num):
+                if idx+1 <= len(sorted_filtered_text_blocks):
+                    line_temp = sorted_filtered_text_blocks[idx]['lines']
+                    if line_temp:
+                        text = line_temp[0]['spans'][0]['text'] # 提取出第一个span里的text内容
+                        check_en = re.match('Table', text) # 检查是否有Table开头的(英文）
+                        check_ch = re.match('表', text) # 检查是否有Table开头的(中文）
+                        if check_en or check_ch:
+                            if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # 以防出现负的bbox
+                                U = sorted_filtered_text_blocks[idx]['bbox'][1]
+        fix_table_bboxes.append([L-2, U-2, R+2, D+2])
+    return fix_table_bboxes
+def __check_table_title_pattern(text):
+    """
+    检查文本段是否是表格的标题
+    """
+    patterns = [r'^table\s\d+']
+    for pattern in patterns:
+        match = re.match(pattern, text, re.IGNORECASE)
+        if match:
+            return True
+        else:
+            return False
+def fix_table_text_block(pymu_blocks, table_bboxes: list):
+    """
+    调整table, 如果table和上下的text block有相交区域，则将table的上下边界调整到text block的上下边界
+    例如 tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
+    """
+    for tb in table_bboxes:
+        (L, U, R, D) = tb
+        for block in pymu_blocks:
+            if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
+                txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
+                if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # 如果是table的title，那么不调整。因为下一步会统一调整，如果这里进行了调整，后面的调整会造成调整到其他table的title上（在连续出现2个table的情况下）。
+                    tb[0] = min(tb[0], block['bbox'][0])
+                    tb[1] = min(tb[1], block['bbox'][1])
+                    tb[2] = max(tb[2], block['bbox'][2])
+                    tb[3] = max(tb[3], block['bbox'][3])
+                    block['_table'] = True # 占位，防止其他table再次占用
+                """如果是个table的title，但是有部分重叠，那么修正这个title,使得和table不重叠"""
+                if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
+                    block['bbox'] = list(block['bbox'])
+                    if block['bbox'][3] > U:
+                        block['bbox'][3] = U-1
+                    if block['bbox'][1] < D:
+                        block['bbox'][1] = D+1
+    return table_bboxes
+def __get_table_caption_text(text_block):
+    txt = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
+    line_cnt = len(text_block['lines'])
+    txt = txt.replace("Ž . ", '')
+    return txt, line_cnt
+def include_table_title(pymu_blocks, table_bboxes: list):
+    """
+    把表格的title也包含进来，扩展到table_bbox上
+    """
+    for tb in table_bboxes:
+        max_find_cnt = 3 # 上上最多找3次
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
+            if text_block_top:
+                txt, line_cnt = __get_table_caption_text(text_block_top)
+                if len(txt.strip())>0:
+                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
+                        max_find_cnt = max_find_cnt -1
+                        temp_box[1] = text_block_top['bbox'][1]
+                        continue
+                    else:
+                        break
+                else:
+                    temp_box[1] = text_block_top['bbox'][1] # 宽度不变，扩大
+                    max_find_cnt = max_find_cnt - 1
+            else:
+                break
+        max_find_cnt = 3 # 向下找
+        temp_box = tb.copy()
+        while max_find_cnt>0:
+            text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
+            if text_block_bottom:
+                txt, line_cnt = __get_table_caption_text(text_block_bottom)
+                if len(txt.strip())>0:
+                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
+                        max_find_cnt = max_find_cnt - 1
+                        temp_box[3] = text_block_bottom['bbox'][3]
+                        continue
+                    else:
+                        break
+                else:
+                    temp_box[3] = text_block_bottom['bbox'][3]
+                    max_find_cnt = max_find_cnt - 1
+            else:
+                break
+        if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
+            btn_text, _ = __get_table_caption_text(text_block_bottom)
+            top_text, _ = __get_table_caption_text(text_block_top)
+            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # 上下都有一个tbale的caption
+                # 取距离最近的
+                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
+                top_text_distance = tb[1] - text_block_top['bbox'][3]
+                text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
+                tb[0] = min(tb[0], text_block['bbox'][0])
+                tb[1] = min(tb[1], text_block['bbox'][1])
+                tb[2] = max(tb[2], text_block['bbox'][2])
+                tb[3] = max(tb[3], text_block['bbox'][3])
+                text_block_bottom['_table_caption'] = True
+                continue
+        # 如果以上条件都不满足，那么就向下找
+        text_block = text_block_top
+        if text_block and text_block.get("_table_caption", False) is False:
+            first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
+            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
+                tb[0] = min(tb[0], text_block['bbox'][0])
+                tb[1] = min(tb[1], text_block['bbox'][1])
+                tb[2] = max(tb[2], text_block['bbox'][2])
+                tb[3] = max(tb[3], text_block['bbox'][3])
+                text_block['_table_caption'] = True
+                continue
+        text_block = text_block_bottom
+        if text_block and text_block.get("_table_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(text_block)
+            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
+                tb[0] = min(tb[0], text_block['bbox'][0])
+                tb[1] = min(tb[1], text_block['bbox'][1])
+                tb[2] = max(tb[2], text_block['bbox'][2])
+                tb[3] = max(tb[3], text_block['bbox'][3])
+                text_block['_table_caption'] = True
+                continue
+        """向左、向右寻找，暂时只寻找一次"""
+        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
+        if left_text_block and left_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(left_text_block)
+            if __check_table_title_pattern(first_text_line):
+                tb[0] = min(tb[0], left_text_block['bbox'][0])
+                tb[1] = min(tb[1], left_text_block['bbox'][1])
+                tb[2] = max(tb[2], left_text_block['bbox'][2])
+                tb[3] = max(tb[3], left_text_block['bbox'][3])
+                left_text_block['_image_caption'] = True
+                continue
+        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
+        if right_text_block and right_text_block.get("_image_caption", False) is False:
+            first_text_line, _ = __get_table_caption_text(right_text_block)
+            if __check_table_title_pattern(first_text_line):
+                tb[0] = min(tb[0], right_text_block['bbox'][0])
+                tb[1] = min(tb[1], right_text_block['bbox'][1])
+                tb[2] = max(tb[2], right_text_block['bbox'][2])
+                tb[3] = max(tb[3], right_text_block['bbox'][3])
+                right_text_block['_image_caption'] = True
+                continue
+    return table_bboxes

magic_pdf/pre_proc/main_text_font.py ADDED Viewed

@@ -0,0 +1,23 @@
+import collections
+def get_main_text_font(pdf_docs):
+    font_names = collections.Counter()
+    for page in pdf_docs:
+        blocks = page.get_text('dict')['blocks']
+        if blocks is not None:
+            for block in blocks:
+                lines = block.get('lines')
+                if lines is not None:
+                    for line in lines:
+                        span_font = [(span['font'], len(span['text'])) for span in line['spans'] if
+                                     'font' in span and len(span['text']) > 0]
+                        if span_font:
+                            # main_text_font应该用基于字数最多的字体而不是span级别的统计
+                            # font_names.append(font_name for font_name in span_font)
+                            # block_fonts.append(font_name for font_name in span_font)
+                            for font, count in span_font:
+                                font_names[font] += count
+    main_text_font = font_names.most_common(1)[0][0]
+    return main_text_font

magic_pdf/pre_proc/ocr_detect_all_bboxes.py ADDED Viewed

@@ -0,0 +1,115 @@
+from loguru import logger
+from magic_pdf.libs.boxbase import get_minbox_if_overlap_by_ratio, calculate_overlap_area_in_bbox1_area_ratio, \
+    calculate_iou
+from magic_pdf.libs.drop_tag import DropTag
+from magic_pdf.libs.ocr_content_type import BlockType
+from magic_pdf.pre_proc.remove_bbox_overlap import remove_overlap_between_bbox_for_block
+def ocr_prepare_bboxes_for_layout_split(img_blocks, table_blocks, discarded_blocks, text_blocks,
+                                        title_blocks, interline_equation_blocks, page_w, page_h):
+    all_bboxes = []
+    all_discarded_blocks = []
+    for image in img_blocks:
+        x0, y0, x1, y1 = image['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Image, None, None, None, None, image["score"]])
+    for table in table_blocks:
+        x0, y0, x1, y1 = table['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Table, None, None, None, None, table["score"]])
+    for text in text_blocks:
+        x0, y0, x1, y1 = text['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Text, None, None, None, None, text["score"]])
+    for title in title_blocks:
+        x0, y0, x1, y1 = title['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Title, None, None, None, None, title["score"]])
+    for interline_equation in interline_equation_blocks:
+        x0, y0, x1, y1 = interline_equation['bbox']
+        all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.InterlineEquation, None, None, None, None, interline_equation["score"]])
+    '''block嵌套问题解决'''
+    '''文本框与标题框重叠，优先信任文本框'''
+    all_bboxes = fix_text_overlap_title_blocks(all_bboxes)
+    '''任何框体与舍弃框重叠，优先信任舍弃框'''
+    all_bboxes = remove_need_drop_blocks(all_bboxes, discarded_blocks)
+    # @todo interline_equation 与title或text框冲突的情况，分两种情况处理
+    '''interline_equation框与文本类型框iou比较接近1的时候，信任行间公式框'''
+    '''interline_equation框被包含在文本类型框内，且interline_equation比文本区块小很多时信任文本框，这时需要舍弃公式框'''
+    '''discarded_blocks中只保留宽度超过1/3页面宽度的，高度超过10的，处于页面下半50%区域的（限定footnote）'''
+    for discarded in discarded_blocks:
+        x0, y0, x1, y1 = discarded['bbox']
+        all_discarded_blocks.append([x0, y0, x1, y1, None, None, None, BlockType.Discarded, None, None, None, None, discarded["score"]])
+        # 将footnote加入到all_bboxes中，用来计算layout
+        if (x1 - x0) > (page_w / 3) and (y1 - y0) > 10 and y0 > (page_h / 2):
+            all_bboxes.append([x0, y0, x1, y1, None, None, None, BlockType.Footnote, None, None, None, None, discarded["score"]])
+    '''经过以上处理后，还存在大框套小框的情况，则删除小框'''
+    all_bboxes = remove_overlaps_min_blocks(all_bboxes)
+    all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
+    '''将剩余的bbox做分离处理，防止后面分layout时出错'''
+    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+    return all_bboxes, all_discarded_blocks, drop_reasons
+def fix_text_overlap_title_blocks(all_bboxes):
+    # 先提取所有text和title block
+    text_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.Text:
+            text_blocks.append(block)
+    title_blocks = []
+    for block in all_bboxes:
+        if block[7] == BlockType.Title:
+            title_blocks.append(block)
+    for text_block in text_blocks:
+        for title_block in title_blocks:
+            text_block_bbox = text_block[:4]
+            title_block_bbox = title_block[:4]
+            if calculate_iou(text_block_bbox, title_block_bbox) > 0.8:
+                all_bboxes.remove(title_block)
+    return all_bboxes
+def remove_need_drop_blocks(all_bboxes, discarded_blocks):
+    need_remove = []
+    for block in all_bboxes:
+        for discarded_block in discarded_blocks:
+            block_bbox = block[:4]
+            if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
+                if block not in need_remove:
+                    need_remove.append(block)
+                    break
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)
+    return all_bboxes
+def remove_overlaps_min_blocks(all_bboxes):
+    #  删除重叠blocks中较小的那些
+    need_remove = []
+    for block1 in all_bboxes:
+        for block2 in all_bboxes:
+            if block1 != block2:
+                block1_bbox = block1[:4]
+                block2_bbox = block2[:4]
+                overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
+                if overlap_box is not None:
+                    bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
+                    if bbox_to_remove is not None and bbox_to_remove not in need_remove:
+                        need_remove.append(bbox_to_remove)
+    if len(need_remove) > 0:
+        for block in need_remove:
+            all_bboxes.remove(block)
+    return all_bboxes