PyPI - magic-pdf - Versions diffs - 0.5.4__py3-none-any.whl - Mend

magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

magic_pdf/__init__.py +0 -0
magic_pdf/cli/__init__.py +0 -0
magic_pdf/cli/magicpdf.py +294 -0
magic_pdf/dict2md/__init__.py +0 -0
magic_pdf/dict2md/mkcontent.py +397 -0
magic_pdf/dict2md/ocr_mkcontent.py +356 -0
magic_pdf/filter/__init__.py +0 -0
magic_pdf/filter/pdf_classify_by_type.py +381 -0
magic_pdf/filter/pdf_meta_scan.py +368 -0
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +681 -0
magic_pdf/layout/layout_det_utils.py +182 -0
magic_pdf/layout/layout_sort.py +732 -0
magic_pdf/layout/layout_spiler_recog.py +101 -0
magic_pdf/layout/mcol_sort.py +336 -0
magic_pdf/libs/Constants.py +11 -0
magic_pdf/libs/MakeContentConfig.py +10 -0
magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
magic_pdf/libs/__init__.py +0 -0
magic_pdf/libs/boxbase.py +408 -0
magic_pdf/libs/calc_span_stats.py +239 -0
magic_pdf/libs/commons.py +204 -0
magic_pdf/libs/config_reader.py +63 -0
magic_pdf/libs/convert_utils.py +5 -0
magic_pdf/libs/coordinate_transform.py +9 -0
magic_pdf/libs/detect_language_from_model.py +21 -0
magic_pdf/libs/draw_bbox.py +227 -0
magic_pdf/libs/drop_reason.py +27 -0
magic_pdf/libs/drop_tag.py +19 -0
magic_pdf/libs/hash_utils.py +15 -0
magic_pdf/libs/json_compressor.py +27 -0
magic_pdf/libs/language.py +31 -0
magic_pdf/libs/markdown_utils.py +31 -0
magic_pdf/libs/math.py +9 -0
magic_pdf/libs/nlp_utils.py +203 -0
magic_pdf/libs/ocr_content_type.py +21 -0
magic_pdf/libs/path_utils.py +23 -0
magic_pdf/libs/pdf_image_tools.py +33 -0
magic_pdf/libs/safe_filename.py +11 -0
magic_pdf/libs/textbase.py +33 -0
magic_pdf/libs/version.py +1 -0
magic_pdf/libs/vis_utils.py +308 -0
magic_pdf/model/__init__.py +0 -0
magic_pdf/model/doc_analyze_by_360layout.py +8 -0
magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
magic_pdf/model/magic_model.py +632 -0
magic_pdf/para/__init__.py +0 -0
magic_pdf/para/block_continuation_processor.py +562 -0
magic_pdf/para/block_termination_processor.py +480 -0
magic_pdf/para/commons.py +222 -0
magic_pdf/para/denoise.py +246 -0
magic_pdf/para/draw.py +121 -0
magic_pdf/para/exceptions.py +198 -0
magic_pdf/para/layout_match_processor.py +40 -0
magic_pdf/para/para_pipeline.py +297 -0
magic_pdf/para/para_split.py +644 -0
magic_pdf/para/para_split_v2.py +772 -0
magic_pdf/para/raw_processor.py +207 -0
magic_pdf/para/stats.py +268 -0
magic_pdf/para/title_processor.py +1014 -0
magic_pdf/pdf_parse_by_ocr.py +219 -0
magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
magic_pdf/pdf_parse_by_txt.py +410 -0
magic_pdf/pdf_parse_by_txt_v2.py +56 -0
magic_pdf/pdf_parse_for_train.py +685 -0
magic_pdf/pdf_parse_union_core.py +241 -0
magic_pdf/pipe/AbsPipe.py +112 -0
magic_pdf/pipe/OCRPipe.py +28 -0
magic_pdf/pipe/TXTPipe.py +29 -0
magic_pdf/pipe/UNIPipe.py +83 -0
magic_pdf/pipe/__init__.py +0 -0
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +3472 -0
magic_pdf/post_proc/pdf_post_filter.py +67 -0
magic_pdf/post_proc/remove_footnote.py +153 -0
magic_pdf/pre_proc/__init__.py +0 -0
magic_pdf/pre_proc/citationmarker_remove.py +157 -0
magic_pdf/pre_proc/construct_page_dict.py +72 -0
magic_pdf/pre_proc/cut_image.py +71 -0
magic_pdf/pre_proc/detect_equation.py +134 -0
magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
magic_pdf/pre_proc/detect_footnote.py +170 -0
magic_pdf/pre_proc/detect_header.py +64 -0
magic_pdf/pre_proc/detect_images.py +647 -0
magic_pdf/pre_proc/detect_page_number.py +64 -0
magic_pdf/pre_proc/detect_tables.py +62 -0
magic_pdf/pre_proc/equations_replace.py +559 -0
magic_pdf/pre_proc/fix_image.py +244 -0
magic_pdf/pre_proc/fix_table.py +270 -0
magic_pdf/pre_proc/main_text_font.py +23 -0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
magic_pdf/pre_proc/remove_footer_header.py +117 -0
magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
magic_pdf/pre_proc/solve_line_alien.py +29 -0
magic_pdf/pre_proc/statistics.py +12 -0
magic_pdf/rw/AbsReaderWriter.py +34 -0
magic_pdf/rw/DiskReaderWriter.py +66 -0
magic_pdf/rw/S3ReaderWriter.py +107 -0
magic_pdf/rw/__init__.py +0 -0
magic_pdf/spark/__init__.py +0 -0
magic_pdf/spark/spark_api.py +51 -0
magic_pdf/train_utils/__init__.py +0 -0
magic_pdf/train_utils/convert_to_train_format.py +65 -0
magic_pdf/train_utils/extract_caption.py +59 -0
magic_pdf/train_utils/remove_footer_header.py +159 -0
magic_pdf/train_utils/vis_utils.py +327 -0
magic_pdf/user_api.py +136 -0
magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
magic_pdf-0.5.4.dist-info/METADATA +24 -0
magic_pdf-0.5.4.dist-info/RECORD +121 -0
magic_pdf-0.5.4.dist-info/WHEEL +5 -0
magic_pdf-0.5.4.dist-info/top_level.txt +1 -0

magic_pdf/libs/boxbase.py ADDED Viewed

@@ -0,0 +1,408 @@
+from loguru import logger
+import math
+def _is_in_or_part_overlap(box1, box2) -> bool:
+    """
+    Return True when the two bboxes touch, partially overlap, or one contains the other.
+    Boxes are (x0, y0, x1, y1) tuples; if either box is None the answer is False.
+    """
+    if box1 is None or box2 is None:
+        return False
+    a_x0, a_y0, a_x1, a_y1 = box1
+    b_x0, b_y0, b_x1, b_y1 = box2
+    # The boxes are disjoint only if one lies strictly beyond an edge of the other.
+    if a_x1 < b_x0 or a_x0 > b_x1:
+        return False  # separated horizontally
+    if a_y1 < b_y0 or a_y0 > b_y1:
+        return False  # separated vertically
+    return True
+def _is_in_or_part_overlap_with_area_ratio(box1, box2, area_ratio_threshold=0.6):
+    """
+    Check whether box1 lies inside box2, or overlaps it, with the overlapping region
+    covering more than area_ratio_threshold of box1's own area.
+    Args:
+        box1: (x0, y0, x1, y1) bbox whose area is the denominator of the ratio, or None.
+        box2: (x0, y0, x1, y1) bbox compared against box1, or None.
+        area_ratio_threshold: exclusive lower bound for overlap_area / box1_area.
+    Returns:
+        bool: False when either box is None, when the boxes do not overlap, or when
+        box1 has zero area; otherwise whether the ratio exceeds the threshold.
+    """
+    if box1 is None or box2 is None:
+        return False
+    if not _is_in_or_part_overlap(box1, box2):
+        return False
+    x0_1, y0_1, x1_1, y1_1 = box1
+    x0_2, y0_2, x1_2, y1_2 = box2
+    box1_area = (x1_1 - x0_1) * (y1_1 - y0_1)
+    # A degenerate box1 (a line or a point) would otherwise raise ZeroDivisionError.
+    if box1_area == 0:
+        return False
+    # Area of the intersection rectangle.
+    overlap_width = min(x1_1, x1_2) - max(x0_1, x0_2)
+    overlap_height = min(y1_1, y1_2) - max(y0_1, y0_2)
+    overlap_area = overlap_width * overlap_height
+    return overlap_area / box1_area > area_ratio_threshold
+def _is_in(box1, box2) -> bool:
+    """
+    Return True when box1 lies entirely within box2 (shared edges count as inside).
+    Both boxes are (x0, y0, x1, y1) tuples.
+    """
+    left_1, top_1, right_1, bottom_1 = box1
+    left_2, top_2, right_2, bottom_2 = box2
+    # box1's top-left corner must not fall outside box2's top-left corner ...
+    starts_inside = left_1 >= left_2 and top_1 >= top_2
+    # ... and its bottom-right corner must not extend past box2's bottom-right corner.
+    ends_inside = right_1 <= right_2 and bottom_1 <= bottom_2
+    return starts_inside and ends_inside
+def _is_part_overlap(box1, box2) -> bool:
+    """
+    Return True when the two bboxes overlap but box1 is not fully contained in box2.
+    If either box is None the answer is False.
+    """
+    if box1 is None or box2 is None:
+        return False
+    if not _is_in_or_part_overlap(box1, box2):
+        return False
+    return not _is_in(box1, box2)
+def _left_intersect(left_box, right_box):
+    """
+    Check whether left_box crosses the left edge of right_box.
+    True when right_box's left edge lies strictly between left_box's left and right
+    edges, and right_box's top or bottom edge falls inside left_box's vertical extent.
+    If either box is None the answer is False.
+    """
+    if left_box is None or right_box is None:
+        return False
+    l_x0, l_y0, l_x1, l_y1 = left_box
+    r_x0, r_y0, _, r_y1 = right_box
+    if not (l_x0 < r_x0 < l_x1):
+        return False
+    top_inside = l_y0 <= r_y0 <= l_y1
+    bottom_inside = l_y0 <= r_y1 <= l_y1
+    return top_inside or bottom_inside
+def _right_intersect(left_box, right_box):
+    """
+    Check whether left_box crosses the right edge of right_box.
+    True when right_box's right edge lies strictly between left_box's left and right
+    edges, and right_box's top or bottom edge falls inside left_box's vertical extent.
+    If either box is None the answer is False.
+    """
+    if left_box is None or right_box is None:
+        return False
+    l_x0, l_y0, l_x1, l_y1 = left_box
+    _, r_y0, r_x1, r_y1 = right_box
+    if not (l_x0 < r_x1 < l_x1):
+        return False
+    top_inside = l_y0 <= r_y0 <= l_y1
+    bottom_inside = l_y0 <= r_y1 <= l_y1
+    return top_inside or bottom_inside
+def _is_vertical_full_overlap(box1, box2, x_torlence=2):
+    """
+    Check that one box horizontally spans the other while they overlap vertically.
+    Along x, box1 must contain box2 or box2 must contain box1, each side being allowed
+    x_torlence units of slack; a merely partial horizontal overlap does not qualify.
+    Along y, the two boxes only need to touch or overlap.
+    """
+    a_x0, a_y0, a_x1, a_y1 = box1  # (x0, y0) top-left corner, (x1, y1) bottom-right corner
+    b_x0, b_y0, b_x1, b_y1 = box2
+    a_spans_b = a_x0 - x_torlence <= b_x0 and a_x1 + x_torlence >= b_x1
+    b_spans_a = b_x0 - x_torlence <= a_x0 and b_x1 + x_torlence >= a_x1
+    if not (a_spans_b or b_spans_a):
+        return False
+    # Vertically disjoint only when one box ends before the other starts.
+    return not (a_y1 < b_y0 or a_y0 > b_y1)
+def _is_bottom_full_overlap(box1, box2, y_tolerance=2):
+    """
+    Check whether the bottom of box1 slightly overlaps the top of box2.
+    The vertical overlap (box1's bottom minus box2's top) must be positive and smaller
+    than y_tolerance. Horizontally, one box must span the other, with a fixed slack of
+    2 units on each side. If either box is None the answer is False.
+    """
+    if box1 is None or box2 is None:
+        return False
+    a_x0, _, a_x1, a_y1 = box1
+    b_x0, b_y0, b_x1, _ = box2
+    if not (b_y0 < a_y1 and 0 < (a_y1 - b_y0) < y_tolerance):
+        return False
+    margin = 2
+    a_lo, a_hi = a_x0 - margin, a_x1 + margin
+    b_lo, b_hi = b_x0 - margin, b_x1 + margin
+    b_within_a = a_lo <= b_x0 <= a_hi and a_lo <= b_x1 <= a_hi
+    a_within_b = b_lo <= a_x0 <= b_hi and b_lo <= a_x1 <= b_hi
+    return b_within_a or a_within_b
+def _is_left_overlap(box1, box2):
+    """
+    Check whether box2's left edge falls within box1's horizontal extent while the two
+    boxes overlap vertically.
+    The vertical overlap qualifies when it covers at least half the height of either
+    box, regardless of which box is higher on the page. A zero-height box contributes
+    a ratio of 0. If either box is None the answer is False.
+    """
+    if box1 is None or box2 is None:
+        return False
+    x0_1, y0_1, x1_1, y1_1 = box1
+    x0_2, y0_2, _, y1_2 = box2
+    # Length of the shared vertical span, clamped at zero for disjoint boxes.
+    shared_height = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
+    height_1 = y1_1 - y0_1
+    height_2 = y1_2 - y0_2
+    ratio_1 = 1.0 * shared_height / height_1 if height_1 != 0 else 0
+    ratio_2 = 1.0 * shared_height / height_2 if height_2 != 0 else 0
+    enough_vertical_overlap = ratio_1 >= 0.5 or ratio_2 >= 0.5
+    return x0_1 <= x0_2 <= x1_1 and enough_vertical_overlap
+def __is_overlaps_y_exceeds_threshold(bbox1, bbox2, overlap_ratio_threshold=0.8):
+    """
+    Check whether two bboxes overlap along the y axis by more than
+    overlap_ratio_threshold of the height of the shorter box.
+    Args:
+        bbox1: (x0, y0, x1, y1) bbox; only y0 and y1 are used.
+        bbox2: (x0, y0, x1, y1) bbox; only y0 and y1 are used.
+        overlap_ratio_threshold: exclusive lower bound for overlap / shorter height.
+    Returns:
+        bool: False when the shorter box has zero height, otherwise whether the ratio
+        exceeds the threshold.
+    """
+    _, y0_1, _, y1_1 = bbox1
+    _, y0_2, _, y1_2 = bbox2
+    overlap = max(0, min(y1_1, y1_2) - max(y0_1, y0_2))
+    min_height = min(y1_1 - y0_1, y1_2 - y0_2)
+    # A zero-height box cannot overlap anything and would raise ZeroDivisionError below.
+    if min_height == 0:
+        return False
+    return (overlap / min_height) > overlap_ratio_threshold
+def calculate_iou(bbox1, bbox2):
+    """
+    Compute the intersection over union (IoU) of two bounding boxes.
+    Args:
+        bbox1 (list[float]): first box as [x1, y1, x2, y2], where (x1, y1) is the
+            top-left corner and (x2, y2) the bottom-right corner.
+        bbox2 (list[float]): second box, same format as `bbox1`.
+    Returns:
+        float: the IoU of the two boxes, in [0, 1]. Disjoint boxes, and boxes whose
+        union has zero area, give 0.0.
+    """
+    # Coordinates of the intersection rectangle.
+    x_left = max(bbox1[0], bbox2[0])
+    y_top = max(bbox1[1], bbox2[1])
+    x_right = min(bbox1[2], bbox2[2])
+    y_bottom = min(bbox1[3], bbox2[3])
+    if x_right < x_left or y_bottom < y_top:
+        return 0.0
+    intersection_area = (x_right - x_left) * (y_bottom - y_top)
+    bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
+    # Union = sum of both areas minus the part counted twice.
+    union_area = float(bbox1_area + bbox2_area - intersection_area)
+    # Coincident degenerate boxes (points or lines) pass the disjointness test above
+    # but have an empty union; report no overlap instead of dividing by zero.
+    if union_area == 0:
+        return 0.0
+    return intersection_area / union_area
+def calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2):
+    """
+    Compute the overlap area of two boxes as a fraction of the smaller box's area.
+    Boxes are indexable as [x0, y0, x1, y1]. Disjoint boxes give 0.0; when the smaller
+    box has zero area the result is 0.
+    """
+    # Edges of the intersection rectangle.
+    inter_x0 = max(bbox1[0], bbox2[0])
+    inter_y0 = max(bbox1[1], bbox2[1])
+    inter_x1 = min(bbox1[2], bbox2[2])
+    inter_y1 = min(bbox1[3], bbox2[3])
+    if inter_x1 < inter_x0 or inter_y1 < inter_y0:
+        return 0.0
+    intersection_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
+    area_1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    area_2 = (bbox2[3] - bbox2[1]) * (bbox2[2] - bbox2[0])
+    smaller_area = min(area_1, area_2)
+    if smaller_area == 0:
+        return 0
+    return intersection_area / smaller_area
+def calculate_overlap_area_in_bbox1_area_ratio(bbox1, bbox2):
+    """
+    Compute the overlap area of two boxes as a fraction of bbox1's area.
+    Boxes are indexable as [x0, y0, x1, y1]. Disjoint boxes give 0.0; when bbox1 has
+    zero area the result is 0.
+    """
+    # Edges of the intersection rectangle.
+    inter_x0 = max(bbox1[0], bbox2[0])
+    inter_y0 = max(bbox1[1], bbox2[1])
+    inter_x1 = min(bbox1[2], bbox2[2])
+    inter_y1 = min(bbox1[3], bbox2[3])
+    if inter_x1 < inter_x0 or inter_y1 < inter_y0:
+        return 0.0
+    intersection_area = (inter_x1 - inter_x0) * (inter_y1 - inter_y0)
+    reference_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
+    if reference_area == 0:
+        return 0
+    return intersection_area / reference_area
+def get_minbox_if_overlap_by_ratio(bbox1, bbox2, ratio):
+    """
+    Return the smaller of two boxes when they overlap strongly enough, else None.
+    The overlap is measured by calculate_overlap_area_2_minbox_area_ratio (overlap area
+    over the smaller box's area) and must be strictly greater than ratio. When both
+    boxes have the same area, bbox1 is returned.
+    """
+    left_1, top_1, right_1, bottom_1 = bbox1
+    left_2, top_2, right_2, bottom_2 = bbox2
+    area_1 = (right_1 - left_1) * (bottom_1 - top_1)
+    area_2 = (right_2 - left_2) * (bottom_2 - top_2)
+    if calculate_overlap_area_2_minbox_area_ratio(bbox1, bbox2) > ratio:
+        return bbox1 if area_1 <= area_2 else bbox2
+    return None
+def get_bbox_in_boundry(bboxes:list, boundry:tuple)-> list:
+    """
+    Return the boxes from bboxes that lie entirely inside boundry, in their original order.
+    boundry is (x0, y0, x1, y1); a box touching the boundary edge still counts as inside.
+    """
+    min_x, min_y, max_x, max_y = boundry
+    kept = []
+    for candidate in bboxes:
+        starts_inside = candidate[0] >= min_x and candidate[1] >= min_y
+        if starts_inside and candidate[2] <= max_x and candidate[3] <= max_y:
+            kept.append(candidate)
+    return kept
+def is_vbox_on_side(bbox, width, height, side_threshold=0.2):
+    """
+    Tell whether a bbox sits entirely inside the left or right margin strip of a page.
+    Each strip is side_threshold of the page width wide. Only the x-coordinates of
+    bbox are examined; height is accepted but not used.
+    """
+    box_left, box_right = bbox[0], bbox[2]
+    in_left_strip = box_right <= width * side_threshold
+    in_right_strip = box_left >= width * (1 - side_threshold)
+    return in_left_strip or in_right_strip
+def find_top_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """
+    Return the block from pymu_blocks that is nearest above obj_bbox, or None.
+    Each block is a mapping with a 'bbox' entry (x0, y0, x1, y1). A block qualifies
+    when its bottom edge is above obj_bbox's top edge (with 4 units of slack), it is
+    not contained in obj_bbox, and its x-range overlaps obj_bbox's x-range (same
+    slack). Among qualifying blocks the one with the largest bottom edge is returned;
+    ties go to the earliest block in pymu_blocks.
+    """
+    tolerance_margin = 4
+    def overlaps_in_x(bbox):
+        # An end of one x-range falling inside the other (padded) x-range counts.
+        return (obj_bbox[0] - tolerance_margin <= bbox[0] <= obj_bbox[2] + tolerance_margin
+                or obj_bbox[0] - tolerance_margin <= bbox[2] <= obj_bbox[2] + tolerance_margin
+                or bbox[0] - tolerance_margin <= obj_bbox[0] <= bbox[2] + tolerance_margin
+                or bbox[0] - tolerance_margin <= obj_bbox[2] <= bbox[2] + tolerance_margin)
+    candidates = [
+        block for block in pymu_blocks
+        if obj_bbox[1] - block['bbox'][3] >= -tolerance_margin
+        and not _is_in(block['bbox'], obj_bbox)
+        and overlaps_in_x(block['bbox'])
+    ]
+    if not candidates:
+        return None
+    # The nearest block above is the one whose bottom edge (y1) is largest.
+    return max(candidates, key=lambda block: block['bbox'][3])
+def find_bottom_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """
+    Return the block from pymu_blocks that is nearest below obj_bbox, or None.
+    Each block is a mapping with a 'bbox' entry (x0, y0, x1, y1). A block qualifies
+    when its top edge is below obj_bbox's bottom edge (with 2 units of slack), it is
+    not contained in obj_bbox, and its x-range overlaps obj_bbox's x-range (same
+    slack). Among qualifying blocks the one with the smallest top edge is returned;
+    ties go to the earliest block in pymu_blocks.
+    """
+    slack = 2
+    def overlaps_in_x(bbox):
+        # An end of one x-range falling inside the other (padded) x-range counts.
+        return (obj_bbox[0] - slack <= bbox[0] <= obj_bbox[2] + slack
+                or obj_bbox[0] - slack <= bbox[2] <= obj_bbox[2] + slack
+                or bbox[0] - slack <= obj_bbox[0] <= bbox[2] + slack
+                or bbox[0] - slack <= obj_bbox[2] <= bbox[2] + slack)
+    candidates = [
+        block for block in pymu_blocks
+        if block['bbox'][1] - obj_bbox[3] >= -slack
+        and not _is_in(block['bbox'], obj_bbox)
+        and overlaps_in_x(block['bbox'])
+    ]
+    if not candidates:
+        return None
+    # The nearest block below is the one whose top edge (y0) is smallest.
+    return min(candidates, key=lambda block: block['bbox'][1])
+def find_left_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """
+    Return the text block from pymu_blocks that is nearest to the left of obj_bbox, or None.
+    Each block is a mapping with a 'bbox' entry (x0, y0, x1, y1). A block qualifies
+    when its right edge is left of obj_bbox's left edge (with 2 units of slack), it is
+    not contained in obj_bbox, and its y-range overlaps obj_bbox's y-range (same
+    slack). Among qualifying blocks the one with the largest right edge is returned;
+    ties go to the earliest block in pymu_blocks.
+    """
+    slack = 2
+    def overlaps_in_y(bbox):
+        # An end of one y-range falling inside the other (padded) y-range counts.
+        return (obj_bbox[1] - slack <= bbox[1] <= obj_bbox[3] + slack
+                or obj_bbox[1] - slack <= bbox[3] <= obj_bbox[3] + slack
+                or bbox[1] - slack <= obj_bbox[1] <= bbox[3] + slack
+                or bbox[1] - slack <= obj_bbox[3] <= bbox[3] + slack)
+    candidates = [
+        block for block in pymu_blocks
+        if obj_bbox[0] - block['bbox'][2] >= -slack
+        and not _is_in(block['bbox'], obj_bbox)
+        and overlaps_in_y(block['bbox'])
+    ]
+    if not candidates:
+        return None
+    # The nearest block on the left is the one whose right edge (x1) is largest.
+    return max(candidates, key=lambda block: block['bbox'][2])
+def find_right_nearest_text_bbox(pymu_blocks, obj_bbox):
+    """
+    Return the text block from pymu_blocks that is nearest to the right of obj_bbox, or None.
+    Each block is a mapping with a 'bbox' entry (x0, y0, x1, y1). A block qualifies
+    when its left edge is right of obj_bbox's right edge (with 2 units of slack), it
+    is not contained in obj_bbox, and its y-range overlaps obj_bbox's y-range (same
+    slack). Among qualifying blocks the one with the smallest left edge is returned;
+    ties go to the earliest block in pymu_blocks.
+    """
+    slack = 2
+    def overlaps_in_y(bbox):
+        # An end of one y-range falling inside the other (padded) y-range counts.
+        return (obj_bbox[1] - slack <= bbox[1] <= obj_bbox[3] + slack
+                or obj_bbox[1] - slack <= bbox[3] <= obj_bbox[3] + slack
+                or bbox[1] - slack <= obj_bbox[1] <= bbox[3] + slack
+                or bbox[1] - slack <= obj_bbox[3] <= bbox[3] + slack)
+    candidates = [
+        block for block in pymu_blocks
+        if block['bbox'][0] - obj_bbox[2] >= -slack
+        and not _is_in(block['bbox'], obj_bbox)
+        and overlaps_in_y(block['bbox'])
+    ]
+    if not candidates:
+        return None
+    # The nearest block on the right is the one whose left edge (x0) is smallest.
+    return min(candidates, key=lambda block: block['bbox'][0])
+def bbox_relative_pos(bbox1, bbox2):
+    """
+    Determine where bbox2 lies relative to bbox1, using strict separation on each axis.
+    Args:
+        bbox1: 4-tuple (x1, y1, x1b, y1b) giving the top-left and bottom-right corners
+            of the first rectangle.
+        bbox2: 4-tuple (x2, y2, x2b, y2b) giving the top-left and bottom-right corners
+            of the second rectangle.
+    Returns:
+        A 4-tuple of booleans (left, right, bottom, top):
+        left is True when bbox2 ends before bbox1 starts on the x axis,
+        right is True when bbox1 ends before bbox2 starts on the x axis,
+        bottom is True when bbox2 ends before bbox1 starts on the y axis,
+        top is True when bbox1 ends before bbox2 starts on the y axis.
+        Touching or overlapping extents yield False for that axis.
+    """
+    a_x0, a_y0, a_x1, a_y1 = bbox1
+    b_x0, b_y0, b_x1, b_y1 = bbox2
+    return (b_x1 < a_x0, a_x1 < b_x0, b_y1 < a_y0, a_y1 < b_y0)
+def bbox_distance(bbox1, bbox2):
+    """
+    Compute the distance between two rectangles.
+    Args:
+        bbox1 (tuple): first rectangle as (x1, y1, x2, y2), where (x1, y1) is the
+            top-left corner and (x2, y2) the bottom-right corner.
+        bbox2 (tuple): second rectangle, same format.
+    Returns:
+        The Euclidean distance between the two closest corners when the rectangles are
+        separated on both axes, the edge-to-edge gap when they are separated on one
+        axis only, and 0 when they touch or intersect.
+    """
+    def corner_gap(dx, dy):
+        return math.sqrt(dx ** 2 + dy ** 2)
+    x1, y1, x1b, y1b = bbox1
+    x2, y2, x2b, y2b = bbox2
+    left, right, bottom, top = bbox_relative_pos(bbox1, bbox2)
+    # Separated on both axes: measure between the two facing corners.
+    if top and left:
+        return corner_gap(x1 - x2b, y1b - y2)
+    if left and bottom:
+        return corner_gap(x1 - x2b, y1 - y2b)
+    if bottom and right:
+        return corner_gap(x1b - x2, y1 - y2b)
+    if right and top:
+        return corner_gap(x1b - x2, y1b - y2)
+    # Separated on a single axis: the distance is the gap between facing edges.
+    if left:
+        return x1 - x2b
+    if right:
+        return x2 - x1b
+    if bottom:
+        return y1 - y2b
+    if top:
+        return y2 - y1b
+    return 0  # rectangles intersect

magic_pdf/libs/calc_span_stats.py ADDED Viewed

@@ -0,0 +1,239 @@
+import os
+import csv
+import json
+import pandas as pd
+from pandas import DataFrame as df
+from matplotlib import pyplot as plt
+from termcolor import cprint
+"""
+Execute this script in the following way:
+1. Make sure there are pdf_dic.json files under the directory code-clean/tmp/unittest/md/, such as the following:
+    code-clean/tmp/unittest/md/scihub/scihub_00500000/libgen.scimag00527000-00527999.zip_10.1002/app.25178/pdf_dic.json
+2. Under the directory code-clean, execute the following command:
+    $ python -m libs.calc_span_stats
+"""
+def print_green_on_red(text):
+    """Print text in bold green on a red background, followed by a blank line."""
+    cprint(text, color="green", on_color="on_red", attrs=["bold"], end="\n\n")
+def print_green(text):
+    """Print a blank line, then text in bold green, followed by a blank line."""
+    print()
+    cprint(text, color="green", attrs=["bold"], end="\n\n")
+def print_red(text):
+    """Print a blank line, then text in bold red, followed by a blank line."""
+    print()
+    cprint(text, color="red", attrs=["bold"], end="\n\n")
+def safe_get(dict_obj, key, default):
+    """
+    Look up key in dict_obj, falling back to default when the key is missing or its
+    value is None. Other falsy values (0, "", False) are returned unchanged.
+    """
+    found = dict_obj.get(key)
+    return default if found is None else found
+class SpanStatsCalc:
+    """Calculate statistics of span."""
+    # Output columns, in order. Passing them to the DataFrame constructor guarantees
+    # that an input without any span still yields a frame carrying every column.
+    _COLUMNS = (
+        "span_id",  # id of span
+        "page_id",  # page key of the pdf dict
+        "span_text",  # text of span
+        "span_font_name",  # font name of span
+        "span_font_size",  # font size of span
+        "span_font_color",  # font color of span
+        "span_font_flags",  # font flags of span
+        "span_is_superscript",
+        "span_is_italic",
+        "span_is_serifed",
+        "span_is_sans_serifed",
+        "span_is_monospaced",
+        "span_is_proportional",
+        "span_is_bold",
+    )
+    # (output column, key inside the span's "decomposed_flags" dict) pairs; each value
+    # is stored as 0/1.
+    _FLAG_COLUMNS = (
+        ("span_is_superscript", "is_superscript"),
+        ("span_is_italic", "is_italic"),
+        ("span_is_serifed", "is_serifed"),
+        ("span_is_sans_serifed", "is_sans_serifed"),
+        ("span_is_monospaced", "is_monospaced"),
+        ("span_is_proportional", "is_proportional"),
+        ("span_is_bold", "is_bold"),
+    )
+    def draw_charts(self, span_stats: pd.DataFrame, fig_num: int, save_path: str):
+        """
+        Draw multiple figures in one figure.
+        Placeholder: it only creates (or activates) the matplotlib figure numbered
+        fig_num; span_stats and save_path are not used yet.
+        """
+        # make a canvas
+        plt.figure(fig_num, figsize=(20, 20))
+    def calc_stats_per_dict(self, pdf_dict) -> pd.DataFrame:
+        """
+        Calculate statistics per pdf_dict.
+        Args:
+            pdf_dict: mapping whose "page_*" entries may hold a "para_blocks" list;
+                every para block has "lines", every line has "spans". Entries whose
+                key does not start with "page_" or that lack "para_blocks" are skipped.
+        Returns:
+            pd.DataFrame: one row per span, with the columns listed in _COLUMNS.
+            span_id numbers the spans from 0 in traversal order. Missing or None span
+            fields fall back to "" (text, font, color), 0 (size, flags) or False
+            (decomposed flags). The frame is empty, but still has all columns, when
+            no span is found.
+        """
+        rows = []
+        for page_id, blocks in pdf_dict.items():
+            if not page_id.startswith("page_") or "para_blocks" not in blocks:
+                continue
+            for para_block in blocks["para_blocks"]:
+                for line in para_block["lines"]:
+                    for span in line["spans"]:
+                        decoded_flags = safe_get(span, "decomposed_flags", {})
+                        row = {
+                            "span_id": len(rows),
+                            "page_id": page_id,
+                            "span_text": safe_get(span, "text", ""),
+                            "span_font_name": safe_get(span, "font", ""),
+                            "span_font_size": safe_get(span, "size", 0),
+                            "span_font_color": safe_get(span, "color", ""),
+                            "span_font_flags": safe_get(span, "flags", 0),
+                        }
+                        for column, flag_key in self._FLAG_COLUMNS:
+                            row[column] = int(safe_get(decoded_flags, flag_key, False))
+                        rows.append(row)
+        return pd.DataFrame(rows, columns=list(self._COLUMNS))
+def __find_pdf_dic_files(
+    jf_name="pdf_dic.json",
+    base_code_name="code-clean",
+    tgt_base_dir_name="tmp",
+    unittest_dir_name="unittest",
+    md_dir_name="md",
+    book_names=("scihub",),  # other possible values: "zlib", "arxiv" and so on
+):
+    """
+    Collect the paths of all files named jf_name below the per-book output directories.
+    The directory containing this module is scanned for every occurrence of
+    base_code_name; the path up to and including an occurrence is taken as a base
+    directory. For each existing base directory and each book name, the tree
+    <base>/<tgt_base_dir_name>/<unittest_dir_name>/<md_dir_name>/<book_name> is walked.
+    Args:
+        jf_name: file name to look for.
+        base_code_name: marker text locating the base directory inside this module's path.
+        tgt_base_dir_name, unittest_dir_name, md_dir_name: path components below the base.
+        book_names: iterable of book directory names; every one of them is searched.
+    Returns:
+        list[str]: paths of the matching files (empty when nothing is found).
+    """
+    pdf_dict_files = []
+    curr_dir = os.path.dirname(__file__)
+    for i in range(len(curr_dir)):
+        if not curr_dir.startswith(base_code_name, i):
+            continue
+        base_code_dir_name = curr_dir[: i + len(base_code_name)]
+        # The marker may match in the middle of a path component; skip such non-paths.
+        if not os.path.exists(base_code_dir_name):
+            continue
+        for book_name in book_names:
+            search_dir_name = os.path.join(
+                base_code_dir_name, tgt_base_dir_name, unittest_dir_name, md_dir_name, book_name
+            )
+            for root, _dirs, files in os.walk(search_dir_name):
+                pdf_dict_files.extend(os.path.join(root, file) for file in files if file == jf_name)
+    return pdf_dict_files
+def combine_span_texts(group_df, span_stats):
+    """
+    Render every span of group_df together with its neighbouring spans.
+    For each row of group_df, the index label is treated as the span id; the texts of
+    the spans with id - 1 and id + 1 are looked up in span_stats (empty string when
+    that id is not in its index). The three texts are each prefixed with an arrow
+    marker and joined by newlines; the per-row snippets are joined by blank lines.
+    """
+    pointer_sign = "→ → → "
+    def text_at(span_id):
+        return span_stats.at[span_id, "span_text"] if span_id in span_stats.index else ""
+    snippets = []
+    for span_id, row in group_df.iterrows():
+        context = (text_at(span_id - 1), row["span_text"], text_at(span_id + 1))
+        snippets.append("\n".join(pointer_sign + text for text in context))
+    return "\n\n".join(snippets)
+# pd.set_option("display.max_colwidth", None)  # set to None to display the full text
+pd.set_option("display.max_rows", None)  # set to None to display more rows
+def main():
+    """
+    Compute span statistics for every pdf_dic.json found by __find_pdf_dic_files.
+    For each file, three artifacts are written next to it:
+    span_stats_raw.csv (one row per span), span_stats_final.csv (superscript spans
+    grouped by font name, size and color, with their surrounding texts) and
+    span_stats_combined.png (bar charts of the grouped counts). Files without any
+    superscript span only get the raw CSV.
+    """
+    pdf_dict_files = __find_pdf_dic_files()
+    span_stats_calc = SpanStatsCalc()
+    group_keys = ["span_font_name", "span_font_size", "span_font_color"]
+    for pdf_dict_file in pdf_dict_files:
+        print("-" * 100)
+        print_green_on_red(f"Processing {pdf_dict_file}")
+        # Only the JSON parse needs the file handle; release it before the slow work.
+        with open(pdf_dict_file, "r", encoding="utf-8") as f:
+            pdf_dict = json.load(f)
+        raw_df = span_stats_calc.calc_stats_per_dict(pdf_dict)
+        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_raw.csv")
+        raw_df.to_csv(save_path, index=False)
+        # A frame built from zero spans may carry no columns, so test emptiness
+        # before indexing the superscript column.
+        filtered_df = raw_df if raw_df.empty else raw_df[raw_df["span_is_superscript"] == 1]
+        if filtered_df.empty:
+            print("No superscript span found!")
+            continue
+        filtered_grouped_df = filtered_df.groupby(group_keys)
+        combined_span_texts = filtered_grouped_df.apply(combine_span_texts, span_stats=raw_df)  # type: ignore
+        final_df = filtered_grouped_df.size().reset_index(name="count")
+        final_df["span_texts"] = combined_span_texts.reset_index(level=[0, 1, 2], drop=True)
+        print(final_df)
+        final_df["span_texts"] = final_df["span_texts"].apply(lambda x: x.replace("\n", "\r\n"))
+        save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_final.csv")
+        # Write UTF-8 with a BOM and quote every field.
+        final_df.to_csv(save_path, index=False, encoding="utf-8-sig", quoting=csv.QUOTE_ALL)
+        # 2x2 chart layout.
+        fig, axs = plt.subplots(2, 2, figsize=(15, 10))
+        try:
+            # Counts per font name, per font size and per font color.
+            final_df.groupby("span_font_name")["count"].sum().plot(kind="bar", ax=axs[0, 0], title="By Font Name")
+            final_df.groupby("span_font_size")["count"].sum().plot(kind="bar", ax=axs[0, 1], title="By Font Size")
+            final_df.groupby("span_font_color")["count"].sum().plot(kind="bar", ax=axs[1, 0], title="By Font Color")
+            # Counts grouped by font name, size and color together.
+            grouped = final_df.groupby(group_keys)
+            grouped["count"].sum().unstack().plot(kind="bar", ax=axs[1, 1], title="Combined Grouping")
+            plt.tight_layout()
+            save_path = pdf_dict_file.replace("pdf_dic.json", "span_stats_combined.png")
+            plt.savefig(save_path)
+        finally:
+            # plt.subplots opens a new figure on every iteration; close it so figures
+            # do not pile up in memory across files (plt.clf would only clear it).
+            plt.close(fig)
+if __name__ == "__main__":
+    main()