PyPI - magic-pdf - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76) hide show

magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/filter/pdf_meta_scan.py +3 -17
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
magic_pdf/para/para_split_v3.py +7 -2
magic_pdf/pdf_parse_union_core_v2.py +97 -124
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0

magic_pdf/pre_proc/fix_image.py DELETED Viewed

@@ -1,244 +0,0 @@
-import re
-from magic_pdf.libs.boxbase import  _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox
-from magic_pdf.libs.textbase import get_text_block_base_info
-def fix_image_vertical(image_bboxes:list, text_blocks:list):
-    """
-    Shrink image boxes vertically so they no longer cut into neighbouring text.
-
-    A text block is considered only when it partially overlaps the image and,
-    horizontally, either lies inside the image or spans it completely. Text that
-    starts above the image top moves the image top to just below the text; text
-    that ends below the image bottom moves the image bottom to just above it.
-    Only the vertical coordinates are touched. Boxes are edited in place and the
-    same list is returned.
-    """
-    for img in image_bboxes:
-        for block in text_blocks:
-            txt = block["bbox"]
-            if not _is_part_overlap(txt, img):
-                continue
-            text_inside_image = txt[0] >= img[0] and txt[2] <= img[2]
-            text_spans_image = txt[0] <= img[0] and txt[2] >= img[2]
-            if not (text_inside_image or text_spans_image):
-                continue
-            if txt[1] < img[1]:  # text starts above the image
-                img[1] = txt[3] + 1
-            elif txt[3] > img[3]:  # text ends below the image
-                img[3] = txt[1] - 1
-    return image_bboxes
-def __merge_if_common_edge(bbox1, bbox2):
-    """
-    Return the union of two boxes that share an aligned edge, else None.
-
-    Two boxes qualify when their top or bottom coordinates are equal and their
-    x ranges intersect, or when their left or right coordinates are equal and
-    their y ranges intersect.
-    """
-    ax0, ay0, ax1, ay1 = bbox1
-    bx0, by0, bx1, by1 = bbox2
-    # Same top or bottom coordinate, with intersecting horizontal extents.
-    horizontal_match = (ay0 == by0 or ay1 == by1) and max(ax0, bx0) <= min(ax1, bx1)
-    # Same left or right coordinate, with intersecting vertical extents.
-    vertical_match = (ax0 == bx0 or ax1 == bx1) and max(ay0, by0) <= min(ay1, by1)
-    if horizontal_match or vertical_match:
-        return [min(ax0, bx0), min(ay0, by0), max(ax1, bx1), max(ay1, by1)]
-    return None
-def fix_seperated_image(image_bboxes:list):
-    """
-    Merge pairs of image boxes that share an edge.
-
-    Each box is paired with the first later box it can be merged with; the
-    merged boxes come first in the result, followed by every box that took part
-    in no merge, in original order.
-    """
-    merged_boxes = []
-    consumed = set()
-    for first, box_a in enumerate(image_bboxes):
-        for second in range(first + 1, len(image_bboxes)):
-            union = __merge_if_common_edge(box_a, image_bboxes[second])
-            if union is None:
-                continue
-            merged_boxes.append(union)
-            consumed.update((first, second))
-            break  # only the first mergeable partner is used
-    untouched = [box for idx, box in enumerate(image_bboxes) if idx not in consumed]
-    return merged_boxes + untouched
-def __check_img_title_pattern(text):
-    """
-    Report whether the stripped text starts like a figure caption.
-
-    Matching is case-insensitive against the prefixes "fig", "figure" and
-    "scheme".
-    """
-    candidate = text.strip()
-    return any(
-        re.match(pattern, candidate, re.IGNORECASE) is not None
-        for pattern in (r"^(fig|figure).*", r"^(scheme).*")
-    )
-def __get_fig_caption_text(text_block):
-    """
-    Return (text, line_count) for a text block.
-
-    The text is every span's text joined with single spaces, with the artefact
-    sequence "Ž . " removed; line_count is the number of lines in the block.
-    """
-    lines = text_block['lines']
-    pieces = [span['text'] for line in lines for span in line['spans']]
-    caption = " ".join(pieces).replace("Ž . ", '')
-    return caption, len(lines)
-def __find_and_extend_bottom_caption(text_block, pymu_blocks, image_box):
-    """
-    Absorb a caption found below an image, including its continuation blocks.
-
-    text_block is the caption block already found; it may be incomplete because
-    a multi-line caption can be split over several blocks. Blocks below it are
-    appended for as long as their font colour, size and type equal those of
-    text_block. image_box is then enlarged in place to cover the combined
-    caption area, and text_block is flagged with '_image_caption'.
-    """
-    caption_box = list(text_block.copy()['bbox'])
-    ref_color, ref_size, ref_type = get_text_block_base_info(text_block)
-    next_block = find_bottom_nearest_text_bbox(pymu_blocks, caption_box)
-    while next_block:
-        color, size, font_type = get_text_block_base_info(next_block)
-        same_style = color == ref_color and size == ref_size and font_type == ref_type
-        if not same_style:
-            break
-        nb = next_block['bbox']
-        caption_box[0] = min(caption_box[0], nb[0])
-        caption_box[2] = max(caption_box[2], nb[2])
-        caption_box[3] = nb[3]
-        next_block = find_bottom_nearest_text_bbox(pymu_blocks, caption_box)
-    # Grow the image box to the union of itself and the collected caption area.
-    for axis, pick in ((0, min), (1, min), (2, max), (3, max)):
-        image_box[axis] = pick(image_box[axis], caption_box[axis])
-    text_block['_image_caption'] = True
-def include_img_title(pymu_blocks, image_bboxes: list):
-    """
-    Extend each image bbox in place so that it also covers its figure caption.
-
-    For every image the nearest text blocks below and above are inspected
-    (at most 3 steps each way, skipping empty blocks and short non-caption
-    blocks). A block counts as a caption when its text matches
-    __check_img_title_pattern and it is not already flagged '_image_caption'.
-    If both a bottom and a top caption exist, the one closer to the image wins;
-    otherwise bottom, top, left and right are tried in that order. The absorbed
-    caption block is flagged with '_image_caption' so no other image reuses it.
-
-    :param pymu_blocks: text blocks of the page
-    :param image_bboxes: list of mutable [x0, y0, x1, y1] image boxes
-    :return: image_bboxes (the same list, boxes modified in place)
-    """
-    for tb in image_bboxes:
-        # Search below the image first.
-        max_find_cnt = 3 # inspect at most 3 blocks per direction
-        temp_box = tb.copy()
-        while max_find_cnt>0:
-            text_block_btn = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
-            if text_block_btn:
-                txt, line_cnt = __get_fig_caption_text(text_block_btn)
-                if len(txt.strip())>0:
-                    # line_cnt<3 lets the search step over sub-titles or text under the
-                    # image that the image detector did not include in the image box.
-                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
-                        max_find_cnt = max_find_cnt - 1
-                        temp_box[3] = text_block_btn['bbox'][3]
-                        continue
-                    else:
-                        break
-                else:
-                    temp_box[3] = text_block_btn['bbox'][3] # empty block: keep width, extend downwards
-                    max_find_cnt = max_find_cnt - 1
-            else:
-                break
-        # Same search, upwards.
-        max_find_cnt = 3
-        temp_box = tb.copy()
-        while max_find_cnt>0:
-            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
-            if text_block_top:
-                txt, line_cnt = __get_fig_caption_text(text_block_top)
-                if len(txt.strip())>0:
-                    if not __check_img_title_pattern(txt) and max_find_cnt>0 and line_cnt <3:
-                        max_find_cnt = max_find_cnt - 1
-                        temp_box[1] = text_block_top['bbox'][1]
-                        continue
-                    else:
-                        break
-                else:
-                    b = text_block_top['bbox']
-                    temp_box[1] = b[1] # empty block: keep width, extend upwards
-                    max_find_cnt = max_find_cnt - 1
-            else:
-                break
-        if text_block_btn and text_block_top and text_block_btn.get("_image_caption", False) is False and text_block_top.get("_image_caption", False) is False :
-            btn_text, _ = __get_fig_caption_text(text_block_btn)
-            top_text, _ = __get_fig_caption_text(text_block_top)
-            if __check_img_title_pattern(btn_text) and __check_img_title_pattern(top_text):
-                # Captions on both sides: take the one closest to the image.
-                btn_text_distance = text_block_btn['bbox'][1] - tb[3]
-                top_text_distance = tb[1] - text_block_top['bbox'][3]
-                if btn_text_distance<top_text_distance: # caption is below
-                    __find_and_extend_bottom_caption(text_block_btn, pymu_blocks, tb)
-                else:
-                    text_block = text_block_top
-                    tb[0] = min(tb[0], text_block['bbox'][0])
-                    tb[1] = min(tb[1], text_block['bbox'][1])
-                    tb[2] = max(tb[2], text_block['bbox'][2])
-                    tb[3] = max(tb[3], text_block['bbox'][3])
-                    # Flag the block that was actually absorbed (the top one). The
-                    # previous code flagged the bottom block here, leaving the top
-                    # caption reusable and wrongly consuming the bottom one.
-                    text_block['_image_caption'] = True
-                continue
-        text_block = text_block_btn
-        if text_block and text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_fig_caption_text(text_block)
-            if __check_img_title_pattern(first_text_line):
-                # Caption found below: keep collecting following blocks with the
-                # same colour, size and font.
-                __find_and_extend_bottom_caption(text_block, pymu_blocks, tb)
-                continue
-        text_block = text_block_top
-        if text_block  and text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_fig_caption_text(text_block)
-            if __check_img_title_pattern(first_text_line):
-                tb[0] = min(tb[0], text_block['bbox'][0])
-                tb[1] = min(tb[1], text_block['bbox'][1])
-                tb[2] = max(tb[2], text_block['bbox'][2])
-                tb[3] = max(tb[3], text_block['bbox'][3])
-                text_block['_image_caption'] = True
-                continue
-        # Look left and right; for now only the single nearest block on each side.
-        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
-        if left_text_block and left_text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_fig_caption_text(left_text_block)
-            if __check_img_title_pattern(first_text_line):
-                tb[0] = min(tb[0], left_text_block['bbox'][0])
-                tb[1] = min(tb[1], left_text_block['bbox'][1])
-                tb[2] = max(tb[2], left_text_block['bbox'][2])
-                tb[3] = max(tb[3], left_text_block['bbox'][3])
-                left_text_block['_image_caption'] = True
-                continue
-        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
-        if right_text_block and right_text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_fig_caption_text(right_text_block)
-            if __check_img_title_pattern(first_text_line):
-                tb[0] = min(tb[0], right_text_block['bbox'][0])
-                tb[1] = min(tb[1], right_text_block['bbox'][1])
-                tb[2] = max(tb[2], right_text_block['bbox'][2])
-                tb[3] = max(tb[3], right_text_block['bbox'][3])
-                right_text_block['_image_caption'] = True
-                continue
-    return image_bboxes
-def combine_images(image_bboxes:list):
-    """
-    Merge overlapping image boxes.
-
-    Each box absorbs every later, not yet absorbed box it overlaps or contains
-    (per _is_in_or_part_overlap): the earlier box is widened in place to the
-    union and the later one is dropped. Returns the surviving boxes in order.
-    """
-    absorbed = set()
-    for i, keeper in enumerate(image_bboxes):
-        for j in range(i + 1, len(image_bboxes)):
-            if j in absorbed:
-                continue
-            other = image_bboxes[j]
-            if not _is_in_or_part_overlap(keeper, other):
-                continue
-            union = (
-                min(keeper[0], other[0]),
-                min(keeper[1], other[1]),
-                max(keeper[2], other[2]),
-                max(keeper[3], other[3]),
-            )
-            keeper[0], keeper[1], keeper[2], keeper[3] = union
-            absorbed.add(j)
-    return [box for idx, box in enumerate(image_bboxes) if idx not in absorbed]

magic_pdf/pre_proc/fix_table.py DELETED Viewed

@@ -1,270 +0,0 @@
-from magic_pdf.libs.commons import fitz             # pyMuPDF库
-import re
-from magic_pdf.libs.boxbase import _is_in_or_part_overlap, _is_part_overlap, find_bottom_nearest_text_bbox, find_left_nearest_text_bbox, find_right_nearest_text_bbox, find_top_nearest_text_bbox             # json
-## version 2
-def get_merged_line(page):
-    """
-    Collect the horizontal rules among a page's vector drawings and join broken segments.
-
-    Drawings whose integer rect is at most 3 units tall are treated as horizontal
-    lines. Lines whose vertical coordinates lie within 5 units of each other are
-    grouped, and inside a group segments separated by a gap smaller than 5 units
-    are merged into one line.
-
-    :param page: page object providing get_drawings(); each drawing's "rect"
-        must expose .irect as (L, U, R, D)
-    :return: list of (L, U, R, D) tuples, one per merged line; U and D are taken
-        from the left-most segment of the group
-    """
-    # Keep only the near-horizontal drawings.
-    lines = []
-    for drawing in page.get_drawings():
-        L, U, R, D = drawing["rect"].irect
-        if abs(D - U) <= 3:
-            lines.append((L, U, R, D))
-    # Group lines of (almost) the same height.
-    U_groups = []
-    visited = [False] * len(lines)
-    for i, (L1, U1, R1, D1) in enumerate(lines):
-        if visited[i]:
-            continue
-        tmp_g = [(L1, U1, R1, D1)]
-        for j, (L2, U2, R2, D2) in enumerate(lines):
-            if i == j or visited[j]:
-                continue
-            if max(U1, D1, U2, D2) - min(U1, D1, U2, D2) <= 5:
-                tmp_g.append((L2, U2, R2, D2))
-                visited[j] = True
-        U_groups.append(tmp_g)
-    # Within each group, sweep left to right and merge segments that touch or nearly touch.
-    res = []
-    for group in U_groups:
-        group.sort(key=lambda LURD: (LURD[0], LURD[2]))
-        LL, UU, RR, DD = group[0]
-        for L1, U1, R1, D1 in group:
-            if (L1 - RR) >= 5:
-                # Gap found: emit the finished line and start a new one. The right
-                # edge must restart too, otherwise the new line keeps the previous
-                # line's right edge and can end up with R < L.
-                res.append((LL, UU, RR, DD))
-                LL, RR = L1, R1
-            else:
-                RR = max(RR, R1)
-        res.append((LL, UU, RR, DD))
-    return res
-def fix_tables(page: fitz.Page, table_bboxes: list, include_table_title: bool, scan_line_num: int):
-    """
-    Snap table boxes to the horizontal rules drawn on the page and optionally pull in the title above.
-
-    :param page: the current page as read by fitz
-    :param table_bboxes: list whose elements are (L, U, R, D) tuples
-    :param include_table_title: whether the table title should be enclosed as well
-    :param scan_line_num: how many of the nearest text blocks above the table are scanned for a title
-    :return: list of [L, U, R, D] boxes, each padded by 2 units on every side
-    """
-    drawings_lines = get_merged_line(page)
-    fix_table_bboxes = []
-    for table in table_bboxes:
-        (L, U, R, D) = table
-        fix_table_L = []
-        fix_table_U = []
-        fix_table_R = []
-        fix_table_D = []
-        width = R - L
-        width_range = width * 0.1 # only consider lines within 10% of the table width
-        height = D - U
-        height_range = height * 0.1 # only consider lines within 10% of the table height
-        for line in drawings_lines:
-            if (L - width_range) <= line[0] <= (L + width_range) and (R - width_range) <= line[2] <= (R + width_range): # line is about as wide as the table
-                if (U - height_range) < line[1] < (U + height_range): # candidate top border
-                    fix_table_U.append(line[1])
-                    fix_table_L.append(line[0])
-                    fix_table_R.append(line[2])
-                elif (D - height_range) < line[1] < (D + height_range): # candidate bottom border
-                    fix_table_D.append(line[1])
-                    fix_table_L.append(line[0])
-                    fix_table_R.append(line[2])
-        if fix_table_U:
-            U = min(fix_table_U)
-        if fix_table_D:
-            D = max(fix_table_D)
-        if fix_table_L:
-            L = min(fix_table_L)
-        if fix_table_R:
-            R = max(fix_table_R)
-        if include_table_title:   # the table title has to be enclosed
-            text_blocks = page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]   # all text blocks of the page
-            incolumn_text_blocks = [block for block in text_blocks if not ((block['bbox'][0] < L and block['bbox'][2] < L) or (block['bbox'][0] > R and block['bbox'][2] > R))]  # drop text with no horizontal overlap with the table (e.g. text in another column)
-            upper_text_blocks = [block for block in incolumn_text_blocks if (U - block['bbox'][3]) > 0]  # keep the text blocks that end above the table top
-            sorted_filtered_text_blocks = sorted(upper_text_blocks, key=lambda x: (U - x['bbox'][3], x['bbox'][0])) # nearest to the table top first; ties broken left to right
-            for idx in range(scan_line_num):
-                if idx+1 <= len(sorted_filtered_text_blocks):
-                    line_temp = sorted_filtered_text_blocks[idx]['lines']
-                    if line_temp:
-                        text = line_temp[0]['spans'][0]['text'] # text of the first span of the first line
-                        check_en = re.match('Table', text) # English title: starts with "Table"
-                        check_ch = re.match('表', text) # Chinese title: starts with "表"
-                        if check_en or check_ch:
-                            if sorted_filtered_text_blocks[idx]['bbox'][1] < D: # guard against an inverted (negative-height) bbox
-                                U = sorted_filtered_text_blocks[idx]['bbox'][1]
-        fix_table_bboxes.append([L-2, U-2, R+2, D+2])
-    return fix_table_bboxes
-def __check_table_title_pattern(text):
-    """
-    Report whether the text looks like a table title.
-
-    A title starts with "table", one whitespace character and at least one
-    digit (case-insensitive). The text is not stripped first.
-
-    :param text: text of a block
-    :return: True when any pattern matches, otherwise False
-    """
-    patterns = [r'^table\s\d+']
-    # Return False only after every pattern has been tried; the previous
-    # in-loop `else: return False` gave up after the first pattern.
-    return any(re.match(pattern, text, re.IGNORECASE) for pattern in patterns)
-def fix_table_text_block(pymu_blocks, table_bboxes: list):
-    """
-    Grow each table box over the text blocks it intersects, and move overlapping table titles out of it.
-    Example document: tmp/unittest/unittest_pdf/纯2列_ViLT_6_文字 表格.pdf
-
-    Table boxes and title block bboxes are modified in place; table_bboxes is returned.
-    """
-    for tb in table_bboxes:
-        (L, U, R, D) = tb
-        for block in pymu_blocks:
-            # The overlap test uses the table's original coordinates, not the growing tb.
-            if _is_in_or_part_overlap((L, U, R, D), block['bbox']):
-                txt = " ".join(span['text'] for line in block['lines'] for span in line['spans'])
-                if not __check_table_title_pattern(txt) and block.get("_table", False) is False: # Titles are not absorbed here: a later step handles all titles, and absorbing one now could attach it to the wrong table when two tables follow each other.
-                    tb[0] = min(tb[0], block['bbox'][0])
-                    tb[1] = min(tb[1], block['bbox'][1])
-                    tb[2] = max(tb[2], block['bbox'][2])
-                    tb[3] = max(tb[3], block['bbox'][3])
-                    block['_table'] = True # mark as taken so no other table claims this block
-                # The string below is the original in-code note: a title that partly
-                # overlaps the table is adjusted so that the two no longer overlap.
-                """如果是个table的title，但是有部分重叠，那么修正这个title,使得和table不重叠"""
-                if _is_part_overlap(tb, block['bbox']) and __check_table_title_pattern(txt):
-                    block['bbox'] = list(block['bbox'])
-                    # NOTE(review): both clamps can fire for the same block (bottom set to
-                    # U-1, then top set to D+1), which would invert the bbox - confirm intent.
-                    if block['bbox'][3] > U:
-                        block['bbox'][3] = U-1
-                    if block['bbox'][1] < D:
-                        block['bbox'][1] = D+1
-    return table_bboxes
-def __get_table_caption_text(text_block):
-    """
-    Return (text, line_count) for a text block.
-
-    The text is every span's text joined with single spaces, with the artefact
-    sequence "Ž . " removed; line_count is the number of lines in the block.
-    """
-    lines = text_block['lines']
-    pieces = [span['text'] for line in lines for span in line['spans']]
-    caption = " ".join(pieces).replace("Ž . ", '')
-    return caption, len(lines)
-def include_table_title(pymu_blocks, table_bboxes: list):
-    """
-    Extend each table bbox in place so that it also covers the table's title.
-
-    For every table the nearest text blocks above and below are inspected
-    (at most 3 steps each way, skipping empty blocks and short non-title
-    blocks). If both sides carry a title, the closer one is absorbed; otherwise
-    top, bottom, left and right are tried in that order. The absorbed block is
-    flagged so that it is not reused.
-
-    :param pymu_blocks: text blocks of the page
-    :param table_bboxes: list of mutable [x0, y0, x1, y1] table boxes
-    :return: table_bboxes (the same list, boxes modified in place)
-    """
-    for tb in table_bboxes:
-        max_find_cnt = 3 # search upwards, at most 3 steps
-        temp_box = tb.copy()
-        while max_find_cnt>0:
-            text_block_top = find_top_nearest_text_bbox(pymu_blocks, temp_box)
-            if text_block_top:
-                txt, line_cnt = __get_table_caption_text(text_block_top)
-                if len(txt.strip())>0:
-                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
-                        max_find_cnt = max_find_cnt -1
-                        temp_box[1] = text_block_top['bbox'][1]
-                        continue
-                    else:
-                        break
-                else:
-                    temp_box[1] = text_block_top['bbox'][1] # empty block: keep width, extend upwards
-                    max_find_cnt = max_find_cnt - 1
-            else:
-                break
-        max_find_cnt = 3 # search downwards
-        temp_box = tb.copy()
-        while max_find_cnt>0:
-            text_block_bottom = find_bottom_nearest_text_bbox(pymu_blocks, temp_box)
-            if text_block_bottom:
-                txt, line_cnt = __get_table_caption_text(text_block_bottom)
-                if len(txt.strip())>0:
-                    if not __check_table_title_pattern(txt) and max_find_cnt>0 and line_cnt<3:
-                        max_find_cnt = max_find_cnt - 1
-                        temp_box[3] = text_block_bottom['bbox'][3]
-                        continue
-                    else:
-                        break
-                else:
-                    temp_box[3] = text_block_bottom['bbox'][3]
-                    max_find_cnt = max_find_cnt - 1
-            else:
-                break
-        if text_block_top and text_block_bottom and text_block_top.get("_table_caption", False) is False and text_block_bottom.get("_table_caption", False) is False :
-            btn_text, _ = __get_table_caption_text(text_block_bottom)
-            top_text, _ = __get_table_caption_text(text_block_top)
-            if __check_table_title_pattern(btn_text) and __check_table_title_pattern(top_text): # a table title on both sides
-                # Take the closer one.
-                btn_text_distance = text_block_bottom['bbox'][1] - tb[3]
-                top_text_distance = tb[1] - text_block_top['bbox'][3]
-                text_block = text_block_bottom if btn_text_distance<top_text_distance else text_block_top
-                tb[0] = min(tb[0], text_block['bbox'][0])
-                tb[1] = min(tb[1], text_block['bbox'][1])
-                tb[2] = max(tb[2], text_block['bbox'][2])
-                tb[3] = max(tb[3], text_block['bbox'][3])
-                # Flag the block that was actually absorbed. The previous code always
-                # flagged the bottom block, even when the top title had been chosen.
-                text_block['_table_caption'] = True
-                continue
-        # Otherwise try the top block on its own, then the bottom block.
-        text_block = text_block_top
-        if text_block and text_block.get("_table_caption", False) is False:
-            first_text_line = " ".join(span['text'] for line in text_block['lines'] for span in line['spans'])
-            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
-                tb[0] = min(tb[0], text_block['bbox'][0])
-                tb[1] = min(tb[1], text_block['bbox'][1])
-                tb[2] = max(tb[2], text_block['bbox'][2])
-                tb[3] = max(tb[3], text_block['bbox'][3])
-                text_block['_table_caption'] = True
-                continue
-        text_block = text_block_bottom
-        if text_block and text_block.get("_table_caption", False) is False:
-            first_text_line, _ = __get_table_caption_text(text_block)
-            if __check_table_title_pattern(first_text_line) and text_block.get("_table", False) is False:
-                tb[0] = min(tb[0], text_block['bbox'][0])
-                tb[1] = min(tb[1], text_block['bbox'][1])
-                tb[2] = max(tb[2], text_block['bbox'][2])
-                tb[3] = max(tb[3], text_block['bbox'][3])
-                text_block['_table_caption'] = True
-                continue
-        # Look left and right; for now only the single nearest block on each side.
-        # NOTE(review): these two branches read and set '_image_caption' rather than
-        # '_table_caption' - possibly a copy-paste from the image code; confirm.
-        left_text_block = find_left_nearest_text_bbox(pymu_blocks, tb)
-        if left_text_block and left_text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_table_caption_text(left_text_block)
-            if __check_table_title_pattern(first_text_line):
-                tb[0] = min(tb[0], left_text_block['bbox'][0])
-                tb[1] = min(tb[1], left_text_block['bbox'][1])
-                tb[2] = max(tb[2], left_text_block['bbox'][2])
-                tb[3] = max(tb[3], left_text_block['bbox'][3])
-                left_text_block['_image_caption'] = True
-                continue
-        right_text_block = find_right_nearest_text_bbox(pymu_blocks, tb)
-        if right_text_block and right_text_block.get("_image_caption", False) is False:
-            first_text_line, _ = __get_table_caption_text(right_text_block)
-            if __check_table_title_pattern(first_text_line):
-                tb[0] = min(tb[0], right_text_block['bbox'][0])
-                tb[1] = min(tb[1], right_text_block['bbox'][1])
-                tb[2] = max(tb[2], right_text_block['bbox'][2])
-                tb[3] = max(tb[3], right_text_block['bbox'][3])
-                right_text_block['_image_caption'] = True
-                continue
-    return table_bboxes

magic_pdf/pre_proc/main_text_font.py DELETED Viewed

@@ -1,23 +0,0 @@
-import collections
-def get_main_text_font(pdf_docs):
-    """
-    Return the font that carries the most characters across all pages.
-
-    Fonts are weighted by the number of characters in each span rather than by
-    the number of spans. Spans without a 'font' key or with empty text are
-    ignored, as are blocks without 'lines'.
-
-    :param pdf_docs: iterable of pages; each page must provide
-        get_text('dict') returning a dict with a 'blocks' entry
-    :return: the dominant font name, or None when no span with a font and
-        non-empty text exists (previously this case raised IndexError)
-    """
-    font_names = collections.Counter()
-    for page in pdf_docs:
-        blocks = page.get_text('dict')['blocks'] or []
-        for block in blocks:
-            for line in block.get('lines') or []:
-                for span in line['spans']:
-                    if 'font' in span and len(span['text']) > 0:
-                        font_names[span['font']] += len(span['text'])
-    if not font_names:
-        # No text on any page: there is no main font to report.
-        return None
-    return font_names.most_common(1)[0][0]

magic_pdf/pre_proc/ocr_detect_layout.py DELETED Viewed

@@ -1,133 +0,0 @@
-import fitz
-from magic_pdf.layout.layout_sort import get_bboxes_layout
-from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
-from magic_pdf.libs.coordinate_transform import get_scale_ratio
-def get_center_point(bbox):
-    """
-    Compute the centre of a bounding box.
-
-    Args:
-        bbox (list): four coordinates: top-left x, top-left y, bottom-right x, bottom-right y.
-    Returns:
-        list: [x, y] of the centre point.
-    """
-    left, top, right, bottom = bbox[0], bbox[1], bbox[2], bbox[3]
-    center_x = (left + right) / 2
-    center_y = (top + bottom) / 2
-    return [center_x, center_y]
-def get_area(bbox):
-    """
-    Compute the area of a bounding box.
-
-    Args:
-        bbox (list): four coordinates: top-left x, top-left y, bottom-right x, bottom-right y.
-    Returns:
-        float: width times height of the box.
-    """
-    width = bbox[2] - bbox[0]
-    height = bbox[3] - bbox[1]
-    return width * height
-def adjust_layouts(layout_bboxes, page_boundry, page_id):
-    """
-    Separate partially overlapping layout boxes, then sort them via get_bboxes_layout.
-
-    For every overlapping pair the larger box is trimmed in place: the dominant
-    offset between the two centres decides whether the pair overlaps left/right
-    or top/bottom, and the larger box's facing edge is moved one unit past the
-    smaller box. Each box is then padded to 13 fields with None and handed to
-    get_bboxes_layout, whose (layout_bboxes, layout_tree) result is returned.
-    """
-    for i, box_a in enumerate(layout_bboxes):
-        for box_b in layout_bboxes[i + 1:]:
-            if not _is_part_overlap(box_a, box_b):
-                continue
-            # The box with the strictly larger area gives way.
-            if get_area(box_a) > get_area(box_b):
-                big, small = box_a, box_b
-            else:
-                big, small = box_b, box_a
-            big_center = get_center_point(big)
-            small_center = get_center_point(small)
-            dx = big_center[0] - small_center[0]
-            dy = big_center[1] - small_center[1]
-            if abs(dx) > abs(dy):  # side-by-side overlap
-                if dx > 0:
-                    if big[0] < small[2]:
-                        big[0] = small[2] + 1
-                elif dx < 0:
-                    if big[2] > small[0]:
-                        big[2] = small[0] - 1
-            else:  # stacked overlap
-                if dy > 0:
-                    if big[1] < small[3]:
-                        big[1] = small[3] + 1
-                elif dy < 0:
-                    if big[3] > small[1]:
-                        big[3] = small[1] - 1
-    # get_bboxes_layout is given 13-field records; only the coordinates are filled in.
-    padded = [[box[0], box[1], box[2], box[3]] + [None] * 9 for box in layout_bboxes]
-    layout_bboxes, layout_tree = get_bboxes_layout(padded, page_boundry, page_id)
-    return layout_bboxes, layout_tree
-def layout_detect(layout_info, page: fitz.Page, ocr_page_info):
-    """
-    Turn model layout polygons into sorted, non-nested layout boxes.
-
-    Args:
-        layout_info (list): sub-layout dicts, each with a 'poly' field of eight coordinates.
-        page: the fitz page; supplies the page size and the scale ratio.
-        ocr_page_info: model page info; 'page_info'/'page_no' is 1-based.
-    Returns:
-        The (layout_bboxes, layout_tree) pair produced by adjust_layouts.
-    """
-    page_id = ocr_page_info['page_info']['page_no']-1
-    h_ratio, v_ratio = get_scale_ratio(ocr_page_info, page)
-    # Scale each polygon's first and third corner into integer page coordinates.
-    layout_bboxes = []
-    for sub_layout in layout_info:
-        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
-        layout_bboxes.append([
-            int(x0 / h_ratio), int(y0 / v_ratio),
-            int(x1 / h_ratio), int(y1 / v_ratio),
-        ])
-    # Drop every box that _is_in reports as contained in some other box.
-    outer_boxes = [
-        box
-        for i, box in enumerate(layout_bboxes)
-        if not any(_is_in(box, other) for j, other in enumerate(layout_bboxes) if i != j)
-    ]
-    page_boundry = [0, 0, page.rect.width, page.rect.height]
-    layout_bboxes, layout_tree = adjust_layouts(outer_boxes, page_boundry, page_id)
-    return layout_bboxes, layout_tree

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl