PyPI - magic-pdf - Versions diffs - 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl - Mend

magic-pdf 0.7.1 (py3-none-any.whl) → 0.8.1 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

magic_pdf/dict2md/ocr_mkcontent.py +130 -76
magic_pdf/integrations/__init__.py +0 -0
magic_pdf/integrations/rag/__init__.py +0 -0
magic_pdf/integrations/rag/api.py +82 -0
magic_pdf/integrations/rag/type.py +82 -0
magic_pdf/integrations/rag/utils.py +285 -0
magic_pdf/layout/layout_sort.py +472 -283
magic_pdf/libs/boxbase.py +188 -149
magic_pdf/libs/draw_bbox.py +113 -87
magic_pdf/libs/ocr_content_type.py +21 -18
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
magic_pdf/model/magic_model.py +283 -166
magic_pdf/model/model_list.py +8 -0
magic_pdf/model/pdf_extract_kit.py +105 -15
magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
magic_pdf/para/para_split_v2.py +26 -27
magic_pdf/pdf_parse_union_core.py +34 -6
magic_pdf/pipe/AbsPipe.py +4 -1
magic_pdf/pipe/OCRPipe.py +7 -4
magic_pdf/pipe/TXTPipe.py +7 -4
magic_pdf/pipe/UNIPipe.py +11 -6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
magic_pdf/tools/cli.py +56 -29
magic_pdf/tools/cli_dev.py +61 -64
magic_pdf/tools/common.py +57 -37
magic_pdf/user_api.py +17 -9
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/METADATA +72 -27
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/RECORD +34 -29
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/WHEEL +0 -0
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.7.1.dist-info → magic_pdf-0.8.1.dist-info}/top_level.txt +0 -0

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -1,50 +1,40 @@
 import json
-import math
-from magic_pdf.libs.commons import fitz
-from loguru import logger
-from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
+                                    bbox_relative_pos, box_area, calculate_iou,
+                                    calculate_overlap_area_in_bbox1_area_ratio,
+                                    get_overlap_area)
+from magic_pdf.libs.commons import fitz, join_path
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
-from magic_pdf.libs.ocr_content_type import ContentType
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 from magic_pdf.libs.local_math import float_gt
-from magic_pdf.libs.boxbase import (
-    _is_in,
-    bbox_relative_pos,
-    bbox_distance,
-    _is_part_overlap,
-    calculate_overlap_area_in_bbox1_area_ratio,
-    calculate_iou,
-)
 from magic_pdf.libs.ModelBlockTypeEnum import ModelBlockTypeEnum
+from magic_pdf.libs.ocr_content_type import CategoryId, ContentType
+from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
+from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
 CAPATION_OVERLAP_AREA_RATIO = 0.6
+MERGE_BOX_OVERLAP_AREA_RATIO = 1.1
 class MagicModel:
-    """
-    每个函数没有得到元素的时候返回空list
-    """
+    """每个函数没有得到元素的时候返回空list."""
     def __fix_axis(self):
         for model_page_info in self.__model_list:
             need_remove_list = []
-            page_no = model_page_info["page_info"]["page_no"]
+            page_no = model_page_info['page_info']['page_no']
             horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio(
                 model_page_info, self.__docs[page_no]
             )
-            layout_dets = model_page_info["layout_dets"]
+            layout_dets = model_page_info['layout_dets']
             for layout_det in layout_dets:
-                if layout_det.get("bbox") is not None:
+                if layout_det.get('bbox') is not None:
                     # 兼容直接输出bbox的模型数据,如paddle
-                    x0, y0, x1, y1 = layout_det["bbox"]
+                    x0, y0, x1, y1 = layout_det['bbox']
                 else:
                     # 兼容直接输出poly的模型数据，如xxx
-                    x0, y0, _, _, x1, y1, _, _ = layout_det["poly"]
+                    x0, y0, _, _, x1, y1, _, _ = layout_det['poly']
                 bbox = [
                     int(x0 / horizontal_scale_ratio),
@@ -52,7 +42,7 @@ class MagicModel:
                     int(x1 / horizontal_scale_ratio),
                     int(y1 / vertical_scale_ratio),
                 ]
-                layout_det["bbox"] = bbox
+                layout_det['bbox'] = bbox
                 # 删除高度或者宽度小于等于0的spans
                 if bbox[2] - bbox[0] <= 0 or bbox[3] - bbox[1] <= 0:
                     need_remove_list.append(layout_det)
@@ -62,9 +52,9 @@ class MagicModel:
     def __fix_by_remove_low_confidence(self):
         for model_page_info in self.__model_list:
             need_remove_list = []
-            layout_dets = model_page_info["layout_dets"]
+            layout_dets = model_page_info['layout_dets']
             for layout_det in layout_dets:
-                if layout_det["score"] <= 0.05:
+                if layout_det['score'] <= 0.05:
                     need_remove_list.append(layout_det)
                 else:
                     continue
@@ -74,12 +64,12 @@ class MagicModel:
     def __fix_by_remove_high_iou_and_low_confidence(self):
         for model_page_info in self.__model_list:
             need_remove_list = []
-            layout_dets = model_page_info["layout_dets"]
+            layout_dets = model_page_info['layout_dets']
             for layout_det1 in layout_dets:
                 for layout_det2 in layout_dets:
                     if layout_det1 == layout_det2:
                         continue
-                    if layout_det1["category_id"] in [
+                    if layout_det1['category_id'] in [
                         0,
                         1,
                         2,
@@ -90,12 +80,12 @@ class MagicModel:
                         7,
                         8,
                         9,
-                    ] and layout_det2["category_id"] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
+                    ] and layout_det2['category_id'] in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
                         if (
-                            calculate_iou(layout_det1["bbox"], layout_det2["bbox"])
+                            calculate_iou(layout_det1['bbox'], layout_det2['bbox'])
                             > 0.9
                         ):
-                            if layout_det1["score"] < layout_det2["score"]:
+                            if layout_det1['score'] < layout_det2['score']:
                                 layout_det_need_remove = layout_det1
                             else:
                                 layout_det_need_remove = layout_det2
@@ -118,6 +108,69 @@ class MagicModel:
         self.__fix_by_remove_low_confidence()
         """删除高iou(>0.9)数据中置信度较低的那个"""
         self.__fix_by_remove_high_iou_and_low_confidence()
+        self.__fix_footnote()
+    def __fix_footnote(self):
+        """Relabel footnotes that lie closer to a figure than to any table.
+        On every page the detections are bucketed by ``category_id`` into
+        footnotes (7), figures (3) and tables (5). For each footnote the
+        smallest ``bbox_distance`` to a figure and to a table is computed,
+        ignoring pairs for which ``bbox_relative_pos`` sets more than one
+        flag. A footnote whose nearest figure is strictly closer than its
+        nearest table (or that has no eligible table) gets its
+        ``category_id`` rewritten in place to ``CategoryId.ImageFootnote``.
+        """
+        # 3: figure, 5: table, 7: footnote
+        def nearest_distances(footnotes, subjects):
+            """Map footnote index -> distance to its closest eligible subject."""
+            nearest = {}
+            for i, footnote in enumerate(footnotes):
+                for subject in subjects:
+                    pos_flag_count = sum(
+                        1
+                        for flag in bbox_relative_pos(
+                            footnote['bbox'], subject['bbox']
+                        )
+                        if flag
+                    )
+                    # More than one relative-position flag set: skip the pair
+                    # (presumably a diagonal placement -- TODO confirm against
+                    # bbox_relative_pos).
+                    if pos_flag_count > 1:
+                        continue
+                    nearest[i] = min(
+                        bbox_distance(subject['bbox'], footnote['bbox']),
+                        nearest.get(i, float('inf')),
+                    )
+            return nearest
+        for model_page_info in self.__model_list:
+            footnotes = []
+            figures = []
+            tables = []
+            for obj in model_page_info['layout_dets']:
+                if obj['category_id'] == 7:
+                    footnotes.append(obj)
+                elif obj['category_id'] == 3:
+                    figures.append(obj)
+                elif obj['category_id'] == 5:
+                    tables.append(obj)
+            # Nothing can be relabelled without at least one footnote and one
+            # figure. The check runs after the collection loop so that
+            # ``continue`` skips the whole page, not a single detection.
+            if not footnotes or not figures:
+                continue
+            dis_figure_footnote = nearest_distances(footnotes, figures)
+            dis_table_footnote = nearest_distances(footnotes, tables)
+            for i, footnote in enumerate(footnotes):
+                # No figure in an eligible position for this footnote.
+                if i not in dis_figure_footnote:
+                    continue
+                if dis_table_footnote.get(i, float('inf')) > dis_figure_footnote[i]:
+                    footnote['category_id'] = CategoryId.ImageFootnote
     def __reduct_overlap(self, bboxes):
         N = len(bboxes)
@@ -126,76 +179,115 @@ class MagicModel:
             for j in range(N):
                 if i == j:
                     continue
-                if _is_in(bboxes[i]["bbox"], bboxes[j]["bbox"]):
+                if _is_in(bboxes[i]['bbox'], bboxes[j]['bbox']):
                     keep[i] = False
         return [bboxes[i] for i in range(N) if keep[i]]
     def __tie_up_category_by_distance(
         self, page_no, subject_category_id, object_category_id
     ):
-        """
-        假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object)，每个 object 只能属于一个 subject
-        """
+        """假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object)，每个 object
+        只能属于一个 subject."""
         ret = []
         MAX_DIS_OF_POINT = 10**9 + 7
+        """
+        subject 和 object 的 bbox 会合并成一个大的 bbox （named: merged bbox）。
+        筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
+        再求出筛选出的 subjects 和 object 的最短距离
+        """
+        def search_overlap_between_boxes(subject_idx, object_idx):
+            """Measure how much an unrelated box intrudes into a subject/object pair.
+            The subject and object bboxes are merged into their common
+            bounding box. Every detection on the page whose category is
+            neither the subject nor the object category is intersected with
+            that merged box, and the overlap area is divided by the area of
+            the object bbox. Returns the largest such ratio (0 when there is
+            no other detection); scanning stops early once the ratio reaches
+            ``MERGE_BOX_OVERLAP_AREA_RATIO``.
+            """
+            subject_bbox = all_bboxes[subject_idx]['bbox']
+            object_bbox = all_bboxes[object_idx]['bbox']
+            merged_bbox = [
+                min(subject_bbox[0], object_bbox[0]),
+                min(subject_bbox[1], object_bbox[1]),
+                max(subject_bbox[2], object_bbox[2]),
+                max(subject_bbox[3], object_bbox[3]),
+            ]
+            paired_categories = (object_category_id, subject_category_id)
+            other_objects = [
+                {'bbox': det['bbox'], 'score': det['score']}
+                for det in self.__model_list[page_no]['layout_dets']
+                if det['category_id'] not in paired_categories
+            ]
+            ratio = 0
+            for other_object in other_objects:
+                overlap = get_overlap_area(merged_bbox, other_object['bbox'])
+                ratio = max(ratio, overlap * 1.0 / box_area(object_bbox))
+                # Threshold reached: the caller only needs to know that much.
+                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    break
+            return ratio
-        # subject 和 object 的 bbox 会合并成一个大的 bbox （named: merged bbox）。 筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
-        # 再求出筛选出的 subjects 和 object 的最短距离！
         def may_find_other_nearest_bbox(subject_idx, object_idx):
+            """Return the shortest distance from the object to a competing subject.
+            The subject and object bboxes are merged into one bounding box.
+            Every *other* subject that partially overlaps the merged box, or
+            lies inside it, and whose area is at least the object's area is
+            a competitor. The result is the smallest ``dis[i][object_idx]``
+            over those competitors, or ``float('inf')`` when there is none.
+            """
-            ret = float("inf")
+            ret = float('inf')
             x0 = min(
-                all_bboxes[subject_idx]["bbox"][0], all_bboxes[object_idx]["bbox"][0]
+                all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
             )
             y0 = min(
-                all_bboxes[subject_idx]["bbox"][1], all_bboxes[object_idx]["bbox"][1]
+                all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
             )
             x1 = max(
-                all_bboxes[subject_idx]["bbox"][2], all_bboxes[object_idx]["bbox"][2]
+                all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
             )
             y1 = max(
-                all_bboxes[subject_idx]["bbox"][3], all_bboxes[object_idx]["bbox"][3]
+                all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
             )
             object_area = abs(
-                all_bboxes[object_idx]["bbox"][2] - all_bboxes[object_idx]["bbox"][0]
+                all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
             ) * abs(
-                all_bboxes[object_idx]["bbox"][3] - all_bboxes[object_idx]["bbox"][1]
+                all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
             )
             for i in range(len(all_bboxes)):
+                # Only subjects other than the one being matched can compete.
                 if (
                     i == subject_idx
-                    or all_bboxes[i]["category_id"] != subject_category_id
+                    or all_bboxes[i]['category_id'] != subject_category_id
                 ):
                     continue
-                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]["bbox"]) or _is_in(
-                    all_bboxes[i]["bbox"], [x0, y0, x1, y1]
+                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
+                    all_bboxes[i]['bbox'], [x0, y0, x1, y1]
                 ):
                     i_area = abs(
-                        all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
-                    ) * abs(all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1])
+                        all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
+                    ) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
                     if i_area >= object_area:
-                        ret = min(float("inf"), dis[i][object_idx])
+                        # Keep the minimum over all competitors; taking
+                        # min(inf, d) would just keep the last one seen.
+                        ret = min(ret, dis[i][object_idx])
             return ret
         def expand_bbbox(idxes):
+            """Return ``(x0, y0, x1, y1)`` enclosing the bboxes of ``all_bboxes`` at ``idxes``.
+            ``idxes`` must be non-empty: ``min``/``max`` raise ``ValueError``
+            on an empty list.
+            """
-            x0s = [all_bboxes[idx]["bbox"][0] for idx in idxes]
-            y0s = [all_bboxes[idx]["bbox"][1] for idx in idxes]
-            x1s = [all_bboxes[idx]["bbox"][2] for idx in idxes]
-            y1s = [all_bboxes[idx]["bbox"][3] for idx in idxes]
+            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
+            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
+            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
+            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
             return min(x0s), min(y0s), max(x1s), max(y1s)
         subjects = self.__reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
                     filter(
-                        lambda x: x["category_id"] == subject_category_id,
-                        self.__model_list[page_no]["layout_dets"],
+                        lambda x: x['category_id'] == subject_category_id,
+                        self.__model_list[page_no]['layout_dets'],
                     ),
                 )
             )
@@ -204,10 +296,10 @@ class MagicModel:
         objects = self.__reduct_overlap(
             list(
                 map(
-                    lambda x: {"bbox": x["bbox"], "score": x["score"]},
+                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
                     filter(
-                        lambda x: x["category_id"] == object_category_id,
-                        self.__model_list[page_no]["layout_dets"],
+                        lambda x: x['category_id'] == object_category_id,
+                        self.__model_list[page_no]['layout_dets'],
                     ),
                 )
             )
@@ -215,7 +307,7 @@ class MagicModel:
         subject_object_relation_map = {}
         subjects.sort(
-            key=lambda x: x["bbox"][0] ** 2 + x["bbox"][1] ** 2
+            key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
         )  # get the distance !
         all_bboxes = []
@@ -223,18 +315,18 @@ class MagicModel:
         for v in subjects:
             all_bboxes.append(
                 {
-                    "category_id": subject_category_id,
-                    "bbox": v["bbox"],
-                    "score": v["score"],
+                    'category_id': subject_category_id,
+                    'bbox': v['bbox'],
+                    'score': v['score'],
                 }
             )
         for v in objects:
             all_bboxes.append(
                 {
-                    "category_id": object_category_id,
-                    "bbox": v["bbox"],
-                    "score": v["score"],
+                    'category_id': object_category_id,
+                    'bbox': v['bbox'],
+                    'score': v['score'],
                 }
             )
@@ -244,18 +336,27 @@ class MagicModel:
         for i in range(N):
             for j in range(i):
                 if (
-                    all_bboxes[i]["category_id"] == subject_category_id
-                    and all_bboxes[j]["category_id"] == subject_category_id
+                    all_bboxes[i]['category_id'] == subject_category_id
+                    and all_bboxes[j]['category_id'] == subject_category_id
                 ):
                     continue
-                dis[i][j] = bbox_distance(all_bboxes[i]["bbox"], all_bboxes[j]["bbox"])
+                subject_idx, object_idx = i, j
+                if all_bboxes[j]['category_id'] == subject_category_id:
+                    subject_idx, object_idx = j, i
+                if search_overlap_between_boxes(subject_idx, object_idx) >= MERGE_BOX_OVERLAP_AREA_RATIO:
+                    dis[i][j] = float('inf')
+                    dis[j][i] = dis[i][j]
+                    continue
+                dis[i][j] = bbox_distance(all_bboxes[i]['bbox'], all_bboxes[j]['bbox'])
                 dis[j][i] = dis[i][j]
         used = set()
         for i in range(N):
             # 求第 i 个 subject 所关联的 object
-            if all_bboxes[i]["category_id"] != subject_category_id:
+            if all_bboxes[i]['category_id'] != subject_category_id:
                 continue
             seen = set()
             candidates = []
@@ -267,7 +368,7 @@ class MagicModel:
                         map(
                             lambda x: 1 if x else 0,
                             bbox_relative_pos(
-                                all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                                all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
                             ),
                         )
                     )
@@ -275,25 +376,28 @@ class MagicModel:
                 if pos_flag_count > 1:
                     continue
                 if (
-                    all_bboxes[j]["category_id"] != object_category_id
+                    all_bboxes[j]['category_id'] != object_category_id
                     or j in used
                     or dis[i][j] == MAX_DIS_OF_POINT
                 ):
                     continue
                 left, right, _, _ = bbox_relative_pos(
-                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
                 )  # 由  pos_flag_count 相关逻辑保证本段逻辑准确性
                 if left or right:
-                    one_way_dis = all_bboxes[i]["bbox"][2] - all_bboxes[i]["bbox"][0]
+                    one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
                 else:
-                    one_way_dis = all_bboxes[i]["bbox"][3] - all_bboxes[i]["bbox"][1]
+                    one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
                 if dis[i][j] > one_way_dis:
                     continue
                 arr.append((dis[i][j], j))
             arr.sort(key=lambda x: x[0])
             if len(arr) > 0:
-                # bug: 离该subject 最近的 object 可能跨越了其它的 subject 。比如 [this subect] [some sbuject] [the nearest objec of subject]
+                """
+                bug: 离该subject 最近的 object 可能跨越了其它的 subject。
+                比如 [this subect] [some sbuject] [the nearest object of subject]
+                """
                 if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
                     candidates.append(arr[0][1])
@@ -308,7 +412,7 @@ class MagicModel:
                             map(
                                 lambda x: 1 if x else 0,
                                 bbox_relative_pos(
-                                    all_bboxes[j]["bbox"], all_bboxes[k]["bbox"]
+                                    all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
                                 ),
                             )
                         )
@@ -318,7 +422,7 @@ class MagicModel:
                         continue
                     if (
-                        all_bboxes[k]["category_id"] != object_category_id
+                        all_bboxes[k]['category_id'] != object_category_id
                         or k in used
                         or k in seen
                         or dis[j][k] == MAX_DIS_OF_POINT
@@ -327,17 +431,19 @@ class MagicModel:
                         continue
                     is_nearest = True
-                    for l in range(i + 1, N):
-                        if l in (j, k) or l in used or l in seen:
+                    for ni in range(i + 1, N):
+                        if ni in (j, k) or ni in used or ni in seen:
                             continue
-                        if not float_gt(dis[l][k], dis[j][k]):
+                        if not float_gt(dis[ni][k], dis[j][k]):
                             is_nearest = False
                             break
                     if is_nearest:
                         nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
-                        n_dis = bbox_distance(all_bboxes[i]["bbox"], [nx0, ny0, nx1, ny1])
+                        n_dis = bbox_distance(
+                            all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
+                        )
                         if float_gt(dis[i][j], n_dis):
                             continue
                         tmp.append(k)
@@ -350,7 +456,7 @@ class MagicModel:
             # 已经获取到某个 figure 下所有的最靠近的 captions，以及最靠近这些 captions 的 captions 。
             # 先扩一下 bbox，
             ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
-            ix0, iy0, ix1, iy1 = all_bboxes[i]["bbox"]
+            ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
             # 分成了 4 个截取空间，需要计算落在每个截取空间下 objects 合并后占据的矩形面积
             caption_poses = [
@@ -366,17 +472,17 @@ class MagicModel:
                 for idx in seen:
                     if (
                         calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[idx]["bbox"], bbox
+                            all_bboxes[idx]['bbox'], bbox
                         )
                         > CAPATION_OVERLAP_AREA_RATIO
                     ):
                         embed_arr.append(idx)
                 if len(embed_arr) > 0:
-                    embed_x0 = min([all_bboxes[idx]["bbox"][0] for idx in embed_arr])
-                    embed_y0 = min([all_bboxes[idx]["bbox"][1] for idx in embed_arr])
-                    embed_x1 = max([all_bboxes[idx]["bbox"][2] for idx in embed_arr])
-                    embed_y1 = max([all_bboxes[idx]["bbox"][3] for idx in embed_arr])
+                    embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
+                    embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
+                    embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
+                    embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
                     caption_areas.append(
                         int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
                     )
@@ -391,7 +497,7 @@ class MagicModel:
                 for j in seen:
                     if (
                         calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[j]["bbox"], caption_bbox
+                            all_bboxes[j]['bbox'], caption_bbox
                         )
                         > CAPATION_OVERLAP_AREA_RATIO
                     ):
@@ -400,30 +506,30 @@ class MagicModel:
         for i in sorted(subject_object_relation_map.keys()):
             result = {
-                "subject_body": all_bboxes[i]["bbox"],
-                "all": all_bboxes[i]["bbox"],
-                "score": all_bboxes[i]["score"],
+                'subject_body': all_bboxes[i]['bbox'],
+                'all': all_bboxes[i]['bbox'],
+                'score': all_bboxes[i]['score'],
             }
             if len(subject_object_relation_map[i]) > 0:
                 x0 = min(
-                    [all_bboxes[j]["bbox"][0] for j in subject_object_relation_map[i]]
+                    [all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
                 )
                 y0 = min(
-                    [all_bboxes[j]["bbox"][1] for j in subject_object_relation_map[i]]
+                    [all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
                 )
                 x1 = max(
-                    [all_bboxes[j]["bbox"][2] for j in subject_object_relation_map[i]]
+                    [all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
                 )
                 y1 = max(
-                    [all_bboxes[j]["bbox"][3] for j in subject_object_relation_map[i]]
+                    [all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
                 )
-                result["object_body"] = [x0, y0, x1, y1]
-                result["all"] = [
-                    min(x0, all_bboxes[i]["bbox"][0]),
-                    min(y0, all_bboxes[i]["bbox"][1]),
-                    max(x1, all_bboxes[i]["bbox"][2]),
-                    max(y1, all_bboxes[i]["bbox"][3]),
+                result['object_body'] = [x0, y0, x1, y1]
+                result['all'] = [
+                    min(x0, all_bboxes[i]['bbox'][0]),
+                    min(y0, all_bboxes[i]['bbox'][1]),
+                    max(x1, all_bboxes[i]['bbox'][2]),
+                    max(y1, all_bboxes[i]['bbox'][3]),
                 ]
             ret.append(result)
@@ -432,7 +538,7 @@ class MagicModel:
         for i in subject_object_relation_map.keys():
             for j in subject_object_relation_map[i]:
                 total_subject_object_dis += bbox_distance(
-                    all_bboxes[i]["bbox"], all_bboxes[j]["bbox"]
+                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
                 )
         # 计算未匹配的 subject 和 object 的距离（非精确版）
@@ -444,12 +550,12 @@ class MagicModel:
             ]
         )
         for i in range(N):
-            if all_bboxes[i]["category_id"] != object_category_id or i in used:
+            if all_bboxes[i]['category_id'] != object_category_id or i in used:
                 continue
             candidates = []
             for j in range(N):
                 if (
-                    all_bboxes[j]["category_id"] != subject_category_id
+                    all_bboxes[j]['category_id'] != subject_category_id
                     or j in with_caption_subject
                 ):
                     continue
@@ -461,18 +567,28 @@ class MagicModel:
         return ret, total_subject_object_dis
     def get_imgs(self, page_no: int):
+        """Return one record per figure on ``page_no`` with its caption and footnote.
+        Category 3 (figure) is tied up once with category 4 and once with
+        ``CategoryId.ImageFootnote``; the two result lists are combined by
+        position. Each record has the keys ``score``, ``img_body_bbox``,
+        ``img_caption_bbox`` and ``img_footnote_bbox`` (the last two are
+        ``None`` when nothing was matched) and ``bbox``, the union of the
+        ``all`` boxes of both results.
+        """
-        figure_captions, _ = self.__tie_up_category_by_distance(
-            page_no, 3, 4
+        with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
+        with_footnotes, _ = self.__tie_up_category_by_distance(
+            page_no, 3, CategoryId.ImageFootnote
         )
-        return [
-            {
-                "bbox": record["all"],
-                "img_body_bbox": record["subject_body"],
-                "img_caption_bbox": record.get("object_body", None),
-                "score": record["score"],
+        ret = []
+        N, M = len(with_captions), len(with_footnotes)
+        # Both calls use the same subject category, so the lists are expected
+        # to line up index by index (presumably one entry per figure in the
+        # same order -- TODO confirm in __tie_up_category_by_distance).
+        assert N == M
+        for i in range(N):
+            record = {
+                'score': with_captions[i]['score'],
+                'img_caption_bbox': with_captions[i].get('object_body', None),
+                'img_body_bbox': with_captions[i]['subject_body'],
+                'img_footnote_bbox': with_footnotes[i].get('object_body', None),
             }
-            for record in figure_captions
-        ]
+            # Overall bbox: union of figure+caption and figure+footnote extents.
+            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
+            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
+            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
+            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
+            record['bbox'] = [x0, y0, x1, y1]
+            ret.append(record)
+        return ret
     def get_tables(
         self, page_no: int
@@ -484,26 +600,26 @@ class MagicModel:
         assert N == M
         for i in range(N):
             record = {
-                "score": with_captions[i]["score"],
-                "table_caption_bbox": with_captions[i].get("object_body", None),
-                "table_body_bbox": with_captions[i]["subject_body"],
-                "table_footnote_bbox": with_footnotes[i].get("object_body", None),
+                'score': with_captions[i]['score'],
+                'table_caption_bbox': with_captions[i].get('object_body', None),
+                'table_body_bbox': with_captions[i]['subject_body'],
+                'table_footnote_bbox': with_footnotes[i].get('object_body', None),
             }
-            x0 = min(with_captions[i]["all"][0], with_footnotes[i]["all"][0])
-            y0 = min(with_captions[i]["all"][1], with_footnotes[i]["all"][1])
-            x1 = max(with_captions[i]["all"][2], with_footnotes[i]["all"][2])
-            y1 = max(with_captions[i]["all"][3], with_footnotes[i]["all"][3])
-            record["bbox"] = [x0, y0, x1, y1]
+            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
+            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
+            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
+            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
+            record['bbox'] = [x0, y0, x1, y1]
             ret.append(record)
         return ret
     def get_equations(self, page_no: int) -> list:  # 有坐标，也有字
         inline_equations = self.__get_blocks_by_type(
-            ModelBlockTypeEnum.EMBEDDING.value, page_no, ["latex"]
+            ModelBlockTypeEnum.EMBEDDING.value, page_no, ['latex']
         )
         interline_equations = self.__get_blocks_by_type(
-            ModelBlockTypeEnum.ISOLATED.value, page_no, ["latex"]
+            ModelBlockTypeEnum.ISOLATED.value, page_no, ['latex']
         )
         interline_equations_blocks = self.__get_blocks_by_type(
             ModelBlockTypeEnum.ISOLATE_FORMULA.value, page_no
@@ -525,17 +641,18 @@ class MagicModel:
     def get_ocr_text(self, page_no: int) -> list:  # paddle 搞的，有字也有坐标
         text_spans = []
         model_page_info = self.__model_list[page_no]
-        layout_dets = model_page_info["layout_dets"]
+        layout_dets = model_page_info['layout_dets']
         for layout_det in layout_dets:
-            if layout_det["category_id"] == "15":
+            if layout_det['category_id'] == '15':
                 span = {
-                    "bbox": layout_det["bbox"],
-                    "content": layout_det["text"],
+                    'bbox': layout_det['bbox'],
+                    'content': layout_det['text'],
                 }
                 text_spans.append(span)
         return text_spans
     def get_all_spans(self, page_no: int) -> list:
         def remove_duplicate_spans(spans):
             new_spans = []
             for span in spans:
@@ -545,7 +662,7 @@ class MagicModel:
         all_spans = []
         model_page_info = self.__model_list[page_no]
-        layout_dets = model_page_info["layout_dets"]
+        layout_dets = model_page_info['layout_dets']
         allow_category_id_list = [3, 5, 13, 14, 15]
         """当成span拼接的"""
         #  3: 'image', # 图片
@@ -554,29 +671,29 @@ class MagicModel:
         #  14: 'interline_equation',      # 行间公式
         #  15: 'text',      # ocr识别文本
         for layout_det in layout_dets:
-            category_id = layout_det["category_id"]
+            category_id = layout_det['category_id']
             if category_id in allow_category_id_list:
-                span = {"bbox": layout_det["bbox"], "score": layout_det["score"]}
+                span = {'bbox': layout_det['bbox'], 'score': layout_det['score']}
                 if category_id == 3:
-                    span["type"] = ContentType.Image
+                    span['type'] = ContentType.Image
                 elif category_id == 5:
                     # 获取table模型结果
-                    latex = layout_det.get("latex", None)
-                    html = layout_det.get("html", None)
+                    latex = layout_det.get('latex', None)
+                    html = layout_det.get('html', None)
                     if latex:
-                        span["latex"] = latex
+                        span['latex'] = latex
                     elif html:
-                        span["html"] = html
-                    span["type"] = ContentType.Table
+                        span['html'] = html
+                    span['type'] = ContentType.Table
                 elif category_id == 13:
-                    span["content"] = layout_det["latex"]
-                    span["type"] = ContentType.InlineEquation
+                    span['content'] = layout_det['latex']
+                    span['type'] = ContentType.InlineEquation
                 elif category_id == 14:
-                    span["content"] = layout_det["latex"]
-                    span["type"] = ContentType.InterlineEquation
+                    span['content'] = layout_det['latex']
+                    span['type'] = ContentType.InterlineEquation
                 elif category_id == 15:
-                    span["content"] = layout_det["text"]
-                    span["type"] = ContentType.Text
+                    span['content'] = layout_det['text']
+                    span['type'] = ContentType.Text
                 all_spans.append(span)
         return remove_duplicate_spans(all_spans)
@@ -593,19 +710,19 @@ class MagicModel:
     ) -> list:
         blocks = []
         for page_dict in self.__model_list:
-            layout_dets = page_dict.get("layout_dets", [])
-            page_info = page_dict.get("page_info", {})
-            page_number = page_info.get("page_no", -1)
+            layout_dets = page_dict.get('layout_dets', [])
+            page_info = page_dict.get('page_info', {})
+            page_number = page_info.get('page_no', -1)
             if page_no != page_number:
                 continue
             for item in layout_dets:
-                category_id = item.get("category_id", -1)
-                bbox = item.get("bbox", None)
+                category_id = item.get('category_id', -1)
+                bbox = item.get('bbox', None)
                 if category_id == type:
                     block = {
-                        "bbox": bbox,
-                        "score": item.get("score"),
+                        'bbox': bbox,
+                        'score': item.get('score'),
                     }
                     for col in extra_col:
                         block[col] = item.get(col, None)
@@ -616,28 +733,28 @@ class MagicModel:
         return self.__model_list[page_no]
-if __name__ == "__main__":
-    drw = DiskReaderWriter(r"D:/project/20231108code-clean")
+if __name__ == '__main__':
+    drw = DiskReaderWriter(r'D:/project/20231108code-clean')
     if 0:
-        pdf_file_path = r"linshixuqiu\19983-00.pdf"
-        model_file_path = r"linshixuqiu\19983-00_new.json"
+        pdf_file_path = r'linshixuqiu\19983-00.pdf'
+        model_file_path = r'linshixuqiu\19983-00_new.json'
         pdf_bytes = drw.read(pdf_file_path, AbsReaderWriter.MODE_BIN)
         model_json_txt = drw.read(model_file_path, AbsReaderWriter.MODE_TXT)
         model_list = json.loads(model_json_txt)
-        write_path = r"D:\project\20231108code-clean\linshixuqiu\19983-00"
-        img_bucket_path = "imgs"
+        write_path = r'D:\project\20231108code-clean\linshixuqiu\19983-00'
+        img_bucket_path = 'imgs'
         img_writer = DiskReaderWriter(join_path(write_path, img_bucket_path))
-        pdf_docs = fitz.open("pdf", pdf_bytes)
+        pdf_docs = fitz.open('pdf', pdf_bytes)
         magic_model = MagicModel(model_list, pdf_docs)
     if 1:
         model_list = json.loads(
-            drw.read("/opt/data/pdf/20240418/j.chroma.2009.03.042.json")
+            drw.read('/opt/data/pdf/20240418/j.chroma.2009.03.042.json')
         )
         pdf_bytes = drw.read(
-            "/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf", AbsReaderWriter.MODE_BIN
+            '/opt/data/pdf/20240418/j.chroma.2009.03.042.pdf', AbsReaderWriter.MODE_BIN
         )
-        pdf_docs = fitz.open("pdf", pdf_bytes)
+        pdf_docs = fitz.open('pdf', pdf_bytes)
         magic_model = MagicModel(model_list, pdf_docs)
         for i in range(7):
             print(magic_model.get_imgs(i))

magic-pdf 0.7.1__py3-none-any.whl → 0.8.1__py3-none-any.whl

magic-pdf 0.7.1py3-none-any.whl → 0.8.1py3-none-any.whl