PyPI - magic-pdf - Versions diffs - 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl - Mend

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67)

magic_pdf/config/constants.py +7 -0
magic_pdf/config/exceptions.py +7 -0
magic_pdf/data/data_reader_writer/base.py +13 -1
magic_pdf/data/data_reader_writer/filebase.py +1 -1
magic_pdf/data/data_reader_writer/multi_bucket_s3.py +8 -6
magic_pdf/data/dataset.py +188 -5
magic_pdf/data/read_api.py +59 -12
magic_pdf/data/utils.py +35 -0
magic_pdf/dict2md/ocr_mkcontent.py +16 -15
magic_pdf/filter/__init__.py +32 -0
magic_pdf/filter/pdf_meta_scan.py +3 -2
magic_pdf/libs/clean_memory.py +11 -4
magic_pdf/libs/config_reader.py +9 -0
magic_pdf/libs/draw_bbox.py +19 -22
magic_pdf/libs/language.py +3 -0
magic_pdf/libs/pdf_check.py +30 -30
magic_pdf/libs/version.py +1 -1
magic_pdf/model/__init__.py +1 -1
magic_pdf/model/batch_analyze.py +275 -0
magic_pdf/model/doc_analyze_by_custom_model.py +104 -92
magic_pdf/model/magic_model.py +4 -435
magic_pdf/model/model_list.py +1 -0
magic_pdf/model/pdf_extract_kit.py +35 -5
magic_pdf/model/sub_modules/language_detection/__init__.py +1 -0
magic_pdf/model/sub_modules/language_detection/utils.py +82 -0
magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +139 -0
magic_pdf/model/sub_modules/language_detection/yolov11/__init__.py +1 -0
magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +44 -7
magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +21 -2
magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +70 -27
magic_pdf/model/sub_modules/model_init.py +43 -7
magic_pdf/model/sub_modules/model_utils.py +17 -5
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +51 -1
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +32 -6
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +42 -7
magic_pdf/operators/__init__.py +94 -0
magic_pdf/operators/models.py +154 -0
magic_pdf/operators/pipes.py +191 -0
magic_pdf/pdf_parse_union_core_v2.py +77 -27
magic_pdf/post_proc/__init__.py +1 -0
magic_pdf/post_proc/llm_aided.py +133 -0
magic_pdf/pre_proc/ocr_span_list_modify.py +8 -0
magic_pdf/pre_proc/remove_bbox_overlap.py +1 -1
magic_pdf/resources/yolov11-langdetect/yolo_v11_ft.pt +0 -0
magic_pdf/tools/cli.py +36 -11
magic_pdf/tools/common.py +120 -61
magic_pdf/utils/office_to_pdf.py +29 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/METADATA +78 -25
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/RECORD +54 -55
magic_pdf/para/__init__.py +0 -0
magic_pdf/pdf_parse_by_ocr.py +0 -23
magic_pdf/pdf_parse_by_txt.py +0 -24
magic_pdf/pipe/AbsPipe.py +0 -98
magic_pdf/pipe/OCRPipe.py +0 -41
magic_pdf/pipe/TXTPipe.py +0 -41
magic_pdf/pipe/UNIPipe.py +0 -98
magic_pdf/pipe/__init__.py +0 -0
magic_pdf/rw/AbsReaderWriter.py +0 -17
magic_pdf/rw/DiskReaderWriter.py +0 -74
magic_pdf/rw/S3ReaderWriter.py +0 -142
magic_pdf/rw/__init__.py +0 -0
magic_pdf/user_api.py +0 -121
magic_pdf/{para → post_proc}/para_split_v3.py +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.5.dist-info → magic_pdf-1.0.0.dist-info}/top_level.txt +0 -0

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -3,12 +3,9 @@ import enum
 from magic_pdf.config.model_block_type import ModelBlockTypeEnum
 from magic_pdf.config.ocr_content_type import CategoryId, ContentType
 from magic_pdf.data.dataset import Dataset
-from magic_pdf.libs.boxbase import (_is_in, _is_part_overlap, bbox_distance,
-                                    bbox_relative_pos, box_area, calculate_iou,
-                                    calculate_overlap_area_in_bbox1_area_ratio,
-                                    get_overlap_area)
+from magic_pdf.libs.boxbase import (_is_in, bbox_distance, bbox_relative_pos,
+                                    calculate_iou)
 from magic_pdf.libs.coordinate_transform import get_scale_ratio
-from magic_pdf.libs.local_math import float_gt
 from magic_pdf.pre_proc.remove_bbox_overlap import _remove_overlap_between_bbox
 CAPATION_OVERLAP_AREA_RATIO = 0.6
@@ -208,393 +205,6 @@ class MagicModel:
                     keep[i] = False
         return [bboxes[i] for i in range(N) if keep[i]]
-    def __tie_up_category_by_distance(
-        self, page_no, subject_category_id, object_category_id
-    ):
-        """假定每个 subject 最多有一个 object (可以有多个相邻的 object 合并为单个 object)，每个 object
-        只能属于一个 subject."""
-        ret = []
-        MAX_DIS_OF_POINT = 10**9 + 7
-        """
-        subject 和 object 的 bbox 会合并成一个大的 bbox （named: merged bbox）。
-        筛选出所有和 merged bbox 有 overlap 且 overlap 面积大于 object 的面积的 subjects。
-        再求出筛选出的 subjects 和 object 的最短距离
-        """
-        def search_overlap_between_boxes(subject_idx, object_idx):
-            idxes = [subject_idx, object_idx]
-            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
-            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
-            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
-            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
-            merged_bbox = [
-                min(x0s),
-                min(y0s),
-                max(x1s),
-                max(y1s),
-            ]
-            ratio = 0
-            other_objects = list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id']
-                        not in (object_category_id, subject_category_id),
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-            for other_object in other_objects:
-                ratio = max(
-                    ratio,
-                    get_overlap_area(merged_bbox, other_object['bbox'])
-                    * 1.0
-                    / box_area(all_bboxes[object_idx]['bbox']),
-                )
-                if ratio >= MERGE_BOX_OVERLAP_AREA_RATIO:
-                    break
-            return ratio
-        def may_find_other_nearest_bbox(subject_idx, object_idx):
-            ret = float('inf')
-            x0 = min(
-                all_bboxes[subject_idx]['bbox'][0], all_bboxes[object_idx]['bbox'][0]
-            )
-            y0 = min(
-                all_bboxes[subject_idx]['bbox'][1], all_bboxes[object_idx]['bbox'][1]
-            )
-            x1 = max(
-                all_bboxes[subject_idx]['bbox'][2], all_bboxes[object_idx]['bbox'][2]
-            )
-            y1 = max(
-                all_bboxes[subject_idx]['bbox'][3], all_bboxes[object_idx]['bbox'][3]
-            )
-            object_area = abs(
-                all_bboxes[object_idx]['bbox'][2] - all_bboxes[object_idx]['bbox'][0]
-            ) * abs(
-                all_bboxes[object_idx]['bbox'][3] - all_bboxes[object_idx]['bbox'][1]
-            )
-            for i in range(len(all_bboxes)):
-                if (
-                    i == subject_idx
-                    or all_bboxes[i]['category_id'] != subject_category_id
-                ):
-                    continue
-                if _is_part_overlap([x0, y0, x1, y1], all_bboxes[i]['bbox']) or _is_in(
-                    all_bboxes[i]['bbox'], [x0, y0, x1, y1]
-                ):
-                    i_area = abs(
-                        all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
-                    ) * abs(all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1])
-                    if i_area >= object_area:
-                        ret = min(float('inf'), dis[i][object_idx])
-            return ret
-        def expand_bbbox(idxes):
-            x0s = [all_bboxes[idx]['bbox'][0] for idx in idxes]
-            y0s = [all_bboxes[idx]['bbox'][1] for idx in idxes]
-            x1s = [all_bboxes[idx]['bbox'][2] for idx in idxes]
-            y1s = [all_bboxes[idx]['bbox'][3] for idx in idxes]
-            return min(x0s), min(y0s), max(x1s), max(y1s)
-        subjects = self.__reduct_overlap(
-            list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id'] == subject_category_id,
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-        )
-        objects = self.__reduct_overlap(
-            list(
-                map(
-                    lambda x: {'bbox': x['bbox'], 'score': x['score']},
-                    filter(
-                        lambda x: x['category_id'] == object_category_id,
-                        self.__model_list[page_no]['layout_dets'],
-                    ),
-                )
-            )
-        )
-        subject_object_relation_map = {}
-        subjects.sort(
-            key=lambda x: x['bbox'][0] ** 2 + x['bbox'][1] ** 2
-        )  # get the distance !
-        all_bboxes = []
-        for v in subjects:
-            all_bboxes.append(
-                {
-                    'category_id': subject_category_id,
-                    'bbox': v['bbox'],
-                    'score': v['score'],
-                }
-            )
-        for v in objects:
-            all_bboxes.append(
-                {
-                    'category_id': object_category_id,
-                    'bbox': v['bbox'],
-                    'score': v['score'],
-                }
-            )
-        N = len(all_bboxes)
-        dis = [[MAX_DIS_OF_POINT] * N for _ in range(N)]
-        for i in range(N):
-            for j in range(i):
-                if (
-                    all_bboxes[i]['category_id'] == subject_category_id
-                    and all_bboxes[j]['category_id'] == subject_category_id
-                ):
-                    continue
-                subject_idx, object_idx = i, j
-                if all_bboxes[j]['category_id'] == subject_category_id:
-                    subject_idx, object_idx = j, i
-                if (
-                    search_overlap_between_boxes(subject_idx, object_idx)
-                    >= MERGE_BOX_OVERLAP_AREA_RATIO
-                ):
-                    dis[i][j] = float('inf')
-                    dis[j][i] = dis[i][j]
-                    continue
-                dis[i][j] = self._bbox_distance(
-                    all_bboxes[subject_idx]['bbox'], all_bboxes[object_idx]['bbox']
-                )
-                dis[j][i] = dis[i][j]
-        used = set()
-        for i in range(N):
-            # 求第 i 个 subject 所关联的 object
-            if all_bboxes[i]['category_id'] != subject_category_id:
-                continue
-            seen = set()
-            candidates = []
-            arr = []
-            for j in range(N):
-                pos_flag_count = sum(
-                    list(
-                        map(
-                            lambda x: 1 if x else 0,
-                            bbox_relative_pos(
-                                all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                            ),
-                        )
-                    )
-                )
-                if pos_flag_count > 1:
-                    continue
-                if (
-                    all_bboxes[j]['category_id'] != object_category_id
-                    or j in used
-                    or dis[i][j] == MAX_DIS_OF_POINT
-                ):
-                    continue
-                left, right, _, _ = bbox_relative_pos(
-                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                )  # 由  pos_flag_count 相关逻辑保证本段逻辑准确性
-                if left or right:
-                    one_way_dis = all_bboxes[i]['bbox'][2] - all_bboxes[i]['bbox'][0]
-                else:
-                    one_way_dis = all_bboxes[i]['bbox'][3] - all_bboxes[i]['bbox'][1]
-                if dis[i][j] > one_way_dis:
-                    continue
-                arr.append((dis[i][j], j))
-            arr.sort(key=lambda x: x[0])
-            if len(arr) > 0:
-                """
-                bug: 离该subject 最近的 object 可能跨越了其它的 subject。
-                比如 [this subect] [some sbuject] [the nearest object of subject]
-                """
-                if may_find_other_nearest_bbox(i, arr[0][1]) >= arr[0][0]:
-                    candidates.append(arr[0][1])
-                    seen.add(arr[0][1])
-            # 已经获取初始种子
-            for j in set(candidates):
-                tmp = []
-                for k in range(i + 1, N):
-                    pos_flag_count = sum(
-                        list(
-                            map(
-                                lambda x: 1 if x else 0,
-                                bbox_relative_pos(
-                                    all_bboxes[j]['bbox'], all_bboxes[k]['bbox']
-                                ),
-                            )
-                        )
-                    )
-                    if pos_flag_count > 1:
-                        continue
-                    if (
-                        all_bboxes[k]['category_id'] != object_category_id
-                        or k in used
-                        or k in seen
-                        or dis[j][k] == MAX_DIS_OF_POINT
-                        or dis[j][k] > dis[i][j]
-                    ):
-                        continue
-                    is_nearest = True
-                    for ni in range(i + 1, N):
-                        if ni in (j, k) or ni in used or ni in seen:
-                            continue
-                        if not float_gt(dis[ni][k], dis[j][k]):
-                            is_nearest = False
-                            break
-                    if is_nearest:
-                        nx0, ny0, nx1, ny1 = expand_bbbox(list(seen) + [k])
-                        n_dis = bbox_distance(
-                            all_bboxes[i]['bbox'], [nx0, ny0, nx1, ny1]
-                        )
-                        if float_gt(dis[i][j], n_dis):
-                            continue
-                        tmp.append(k)
-                        seen.add(k)
-                candidates = tmp
-                if len(candidates) == 0:
-                    break
-            # 已经获取到某个 figure 下所有的最靠近的 captions，以及最靠近这些 captions 的 captions 。
-            # 先扩一下 bbox，
-            ox0, oy0, ox1, oy1 = expand_bbbox(list(seen) + [i])
-            ix0, iy0, ix1, iy1 = all_bboxes[i]['bbox']
-            # 分成了 4 个截取空间，需要计算落在每个截取空间下 objects 合并后占据的矩形面积
-            caption_poses = [
-                [ox0, oy0, ix0, oy1],
-                [ox0, oy0, ox1, iy0],
-                [ox0, iy1, ox1, oy1],
-                [ix1, oy0, ox1, oy1],
-            ]
-            caption_areas = []
-            for bbox in caption_poses:
-                embed_arr = []
-                for idx in seen:
-                    if (
-                        calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[idx]['bbox'], bbox
-                        )
-                        > CAPATION_OVERLAP_AREA_RATIO
-                    ):
-                        embed_arr.append(idx)
-                if len(embed_arr) > 0:
-                    embed_x0 = min([all_bboxes[idx]['bbox'][0] for idx in embed_arr])
-                    embed_y0 = min([all_bboxes[idx]['bbox'][1] for idx in embed_arr])
-                    embed_x1 = max([all_bboxes[idx]['bbox'][2] for idx in embed_arr])
-                    embed_y1 = max([all_bboxes[idx]['bbox'][3] for idx in embed_arr])
-                    caption_areas.append(
-                        int(abs(embed_x1 - embed_x0) * abs(embed_y1 - embed_y0))
-                    )
-                else:
-                    caption_areas.append(0)
-            subject_object_relation_map[i] = []
-            if max(caption_areas) > 0:
-                max_area_idx = caption_areas.index(max(caption_areas))
-                caption_bbox = caption_poses[max_area_idx]
-                for j in seen:
-                    if (
-                        calculate_overlap_area_in_bbox1_area_ratio(
-                            all_bboxes[j]['bbox'], caption_bbox
-                        )
-                        > CAPATION_OVERLAP_AREA_RATIO
-                    ):
-                        used.add(j)
-                        subject_object_relation_map[i].append(j)
-        for i in sorted(subject_object_relation_map.keys()):
-            result = {
-                'subject_body': all_bboxes[i]['bbox'],
-                'all': all_bboxes[i]['bbox'],
-                'score': all_bboxes[i]['score'],
-            }
-            if len(subject_object_relation_map[i]) > 0:
-                x0 = min(
-                    [all_bboxes[j]['bbox'][0] for j in subject_object_relation_map[i]]
-                )
-                y0 = min(
-                    [all_bboxes[j]['bbox'][1] for j in subject_object_relation_map[i]]
-                )
-                x1 = max(
-                    [all_bboxes[j]['bbox'][2] for j in subject_object_relation_map[i]]
-                )
-                y1 = max(
-                    [all_bboxes[j]['bbox'][3] for j in subject_object_relation_map[i]]
-                )
-                result['object_body'] = [x0, y0, x1, y1]
-                result['all'] = [
-                    min(x0, all_bboxes[i]['bbox'][0]),
-                    min(y0, all_bboxes[i]['bbox'][1]),
-                    max(x1, all_bboxes[i]['bbox'][2]),
-                    max(y1, all_bboxes[i]['bbox'][3]),
-                ]
-            ret.append(result)
-        total_subject_object_dis = 0
-        # 计算已经配对的 distance 距离
-        for i in subject_object_relation_map.keys():
-            for j in subject_object_relation_map[i]:
-                total_subject_object_dis += bbox_distance(
-                    all_bboxes[i]['bbox'], all_bboxes[j]['bbox']
-                )
-        # 计算未匹配的 subject 和 object 的距离（非精确版）
-        with_caption_subject = set(
-            [
-                key
-                for key in subject_object_relation_map.keys()
-                if len(subject_object_relation_map[i]) > 0
-            ]
-        )
-        for i in range(N):
-            if all_bboxes[i]['category_id'] != object_category_id or i in used:
-                continue
-            candidates = []
-            for j in range(N):
-                if (
-                    all_bboxes[j]['category_id'] != subject_category_id
-                    or j in with_caption_subject
-                ):
-                    continue
-                candidates.append((dis[i][j], j))
-            if len(candidates) > 0:
-                candidates.sort(key=lambda x: x[0])
-                total_subject_object_dis += candidates[0][1]
-                with_caption_subject.add(j)
-        return ret, total_subject_object_dis
     def __tie_up_category_by_distance_v2(
         self,
         page_no: int,
@@ -879,52 +489,12 @@ class MagicModel:
         return ret
     def get_imgs(self, page_no: int):
-        with_captions, _ = self.__tie_up_category_by_distance(page_no, 3, 4)
-        with_footnotes, _ = self.__tie_up_category_by_distance(
-            page_no, 3, CategoryId.ImageFootnote
-        )
-        ret = []
-        N, M = len(with_captions), len(with_footnotes)
-        assert N == M
-        for i in range(N):
-            record = {
-                'score': with_captions[i]['score'],
-                'img_caption_bbox': with_captions[i].get('object_body', None),
-                'img_body_bbox': with_captions[i]['subject_body'],
-                'img_footnote_bbox': with_footnotes[i].get('object_body', None),
-            }
-            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
-            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
-            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
-            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
-            record['bbox'] = [x0, y0, x1, y1]
-            ret.append(record)
-        return ret
+        return self.get_imgs_v2(page_no)
     def get_tables(
         self, page_no: int
     ) -> list:  # 3个坐标， caption, table主体，table-note
-        with_captions, _ = self.__tie_up_category_by_distance(page_no, 5, 6)
-        with_footnotes, _ = self.__tie_up_category_by_distance(page_no, 5, 7)
-        ret = []
-        N, M = len(with_captions), len(with_footnotes)
-        assert N == M
-        for i in range(N):
-            record = {
-                'score': with_captions[i]['score'],
-                'table_caption_bbox': with_captions[i].get('object_body', None),
-                'table_body_bbox': with_captions[i]['subject_body'],
-                'table_footnote_bbox': with_footnotes[i].get('object_body', None),
-            }
-            x0 = min(with_captions[i]['all'][0], with_footnotes[i]['all'][0])
-            y0 = min(with_captions[i]['all'][1], with_footnotes[i]['all'][1])
-            x1 = max(with_captions[i]['all'][2], with_footnotes[i]['all'][2])
-            y1 = max(with_captions[i]['all'][3], with_footnotes[i]['all'][3])
-            record['bbox'] = [x0, y0, x1, y1]
-            ret.append(record)
-        return ret
+        return self.get_tables_v2(page_no)
     def get_equations(self, page_no: int) -> list:  # 有坐标，也有字
         inline_equations = self.__get_blocks_by_type(
@@ -1043,4 +613,3 @@ class MagicModel:
     def get_model_list(self, page_no):
         return self.__model_list[page_no]

magic_pdf/model/model_list.py CHANGED Viewed

@@ -9,3 +9,4 @@ class AtomicModel:
     MFR = "mfr"
     OCR = "ocr"
     Table = "table"
+    LangDetect = "langdetect"

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -10,7 +10,6 @@ from loguru import logger
 from PIL import Image
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
-os.environ['YOLO_VERBOSE'] = 'False'  # disable yolo logger
 try:
     import torchtext
@@ -88,6 +87,14 @@ class CustomPEKModel:
         )
         # 初始化解析方案
         self.device = kwargs.get('device', 'cpu')
+        if str(self.device).startswith("npu"):
+            import torch_npu
+            os.environ['FLAGS_npu_jit_compile'] = '0'
+            os.environ['FLAGS_use_stride_kernel'] = '0'
+        elif str(self.device).startswith("mps"):
+            os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
         logger.info('using device: {}'.format(self.device))
         models_dir = kwargs.get(
             'models_dir', os.path.join(root_dir, 'resources', 'models')
@@ -114,11 +121,12 @@ class CustomPEKModel:
                 os.path.join(models_dir, self.configs['weights'][self.mfr_model_name])
             )
             mfr_cfg_path = str(os.path.join(model_config_dir, 'UniMERNet', 'demo.yaml'))
             self.mfr_model = atom_model_manager.get_atom_model(
                 atom_model_name=AtomicModel.MFR,
                 mfr_weight_dir=mfr_weight_dir,
                 mfr_cfg_path=mfr_cfg_path,
-                device=self.device,
+                device='cpu' if str(self.device).startswith("mps") else self.device,
             )
         # 初始化layout模型
@@ -165,12 +173,17 @@ class CustomPEKModel:
                 table_model_path=str(os.path.join(models_dir, table_model_dir)),
                 table_max_time=self.table_max_time,
                 device=self.device,
+                ocr_engine=self.ocr_model,
             )
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
+        pil_img = Image.fromarray(image)
+        width, height = pil_img.size
+        # logger.info(f'width: {width}, height: {height}')
         # layout检测
         layout_start = time.time()
         layout_res = []
@@ -179,12 +192,28 @@ class CustomPEKModel:
             layout_res = self.layout_model(image, ignore_catids=[])
         elif self.layout_model_name == MODEL_NAME.DocLayout_YOLO:
             # doclayout_yolo
-            layout_res = self.layout_model.predict(image)
+            if height > width:
+                input_res = {"poly":[0,0,width,0,width,height,0,height]}
+                new_image, useful_list = crop_img(input_res, pil_img, crop_paste_x=width//2, crop_paste_y=0)
+                paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
+                layout_res = self.layout_model.predict(new_image)
+                for res in layout_res:
+                    p1, p2, p3, p4, p5, p6, p7, p8 = res['poly']
+                    p1 = p1 - paste_x + xmin
+                    p2 = p2 - paste_y + ymin
+                    p3 = p3 - paste_x + xmin
+                    p4 = p4 - paste_y + ymin
+                    p5 = p5 - paste_x + xmin
+                    p6 = p6 - paste_y + ymin
+                    p7 = p7 - paste_x + xmin
+                    p8 = p8 - paste_y + ymin
+                    res['poly'] = [p1, p2, p3, p4, p5, p6, p7, p8]
+            else:
+                layout_res = self.layout_model.predict(image)
         layout_cost = round(time.time() - layout_start, 2)
         logger.info(f'layout detection time: {layout_cost}')
-        pil_img = Image.fromarray(image)
         if self.apply_formula:
             # 公式检测
             mfd_start = time.time()
@@ -215,6 +244,7 @@ class CustomPEKModel:
             # OCR recognition
             new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
             if self.apply_ocr:
                 ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
             else:

magic_pdf/model/sub_modules/language_detection/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+# Copyright (c) Opendatalab. All rights reserved.

magic_pdf/model/sub_modules/language_detection/utils.py ADDED Viewed

@@ -0,0 +1,82 @@
+# Copyright (c) Opendatalab. All rights reserved.
+import os
+from pathlib import Path
+import yaml
+from PIL import Image
+os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
+from magic_pdf.config.constants import MODEL_NAME
+from magic_pdf.data.utils import load_images_from_pdf
+from magic_pdf.libs.config_reader import get_local_models_dir, get_device
+from magic_pdf.libs.pdf_check import extract_pages
+from magic_pdf.model.model_list import AtomicModel
+from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
+def get_model_config():
+    local_models_dir = get_local_models_dir()
+    device = get_device()
+    current_file_path = os.path.abspath(__file__)
+    root_dir = Path(current_file_path).parents[3]
+    model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
+    config_path = os.path.join(model_config_dir, 'model_configs.yaml')
+    with open(config_path, 'r', encoding='utf-8') as f:
+        configs = yaml.load(f, Loader=yaml.FullLoader)
+    return root_dir, local_models_dir, device, configs
+def get_text_images(simple_images):
+    _, local_models_dir, device, configs = get_model_config()
+    atom_model_manager = AtomModelSingleton()
+    temp_layout_model = atom_model_manager.get_atom_model(
+        atom_model_name=AtomicModel.Layout,
+        layout_model_name=MODEL_NAME.DocLayout_YOLO,
+        doclayout_yolo_weights=str(
+            os.path.join(
+                local_models_dir, configs['weights'][MODEL_NAME.DocLayout_YOLO]
+            )
+        ),
+        device=device,
+    )
+    text_images = []
+    for simple_image in simple_images:
+        image = Image.fromarray(simple_image['img'])
+        layout_res = temp_layout_model.predict(image)
+        # 给textblock截图
+        for res in layout_res:
+            if res['category_id'] in [1]:
+                x1, y1, _, _, x2, y2, _, _ = res['poly']
+                # 初步清洗（宽和高都小于100）
+                if x2 - x1 < 100 and y2 - y1 < 100:
+                    continue
+                text_images.append(image.crop((x1, y1, x2, y2)))
+    return text_images
+def auto_detect_lang(pdf_bytes: bytes):
+    sample_docs = extract_pages(pdf_bytes)
+    sample_pdf_bytes = sample_docs.tobytes()
+    simple_images = load_images_from_pdf(sample_pdf_bytes, dpi=200)
+    text_images = get_text_images(simple_images)
+    langdetect_model = model_init(MODEL_NAME.YOLO_V11_LangDetect)
+    lang = langdetect_model.do_detect(text_images)
+    return lang
+def model_init(model_name: str):
+    atom_model_manager = AtomModelSingleton()
+    if model_name == MODEL_NAME.YOLO_V11_LangDetect:
+        root_dir, _, device, _ = get_model_config()
+        model = atom_model_manager.get_atom_model(
+            atom_model_name=AtomicModel.LangDetect,
+            langdetect_model_name=MODEL_NAME.YOLO_V11_LangDetect,
+            langdetect_model_weight=str(os.path.join(root_dir, 'resources', 'yolov11-langdetect', 'yolo_v11_ft.pt')),
+            device=device,
+        )
+    else:
+        raise ValueError(f"model_name {model_name} not found")
+    return model

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl

magic-pdf 0.10.5__py3-none-any.whl → 1.0.0__py3-none-any.whl