magic-pdf 0.5.13__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. magic_pdf/cli/magicpdf.py +18 -7
  2. magic_pdf/dict2md/ocr_mkcontent.py +2 -2
  3. magic_pdf/libs/config_reader.py +10 -0
  4. magic_pdf/libs/version.py +1 -1
  5. magic_pdf/model/__init__.py +1 -0
  6. magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
  7. magic_pdf/model/model_list.py +1 -0
  8. magic_pdf/model/pdf_extract_kit.py +200 -0
  9. magic_pdf/model/pek_sub_modules/__init__.py +0 -0
  10. magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
  11. magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
  12. magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
  13. magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
  14. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
  15. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
  16. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
  17. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
  18. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
  19. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
  20. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
  21. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
  22. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
  23. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
  24. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
  25. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
  26. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
  27. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
  28. magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
  29. magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
  30. magic_pdf/model/pek_sub_modules/post_process.py +36 -0
  31. magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
  32. magic_pdf/model/pp_structure_v2.py +7 -0
  33. magic_pdf/pipe/AbsPipe.py +8 -14
  34. magic_pdf/pipe/OCRPipe.py +12 -8
  35. magic_pdf/pipe/TXTPipe.py +12 -8
  36. magic_pdf/pipe/UNIPipe.py +9 -7
  37. magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
  38. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
  39. magic_pdf/resources/model_config/model_configs.yaml +9 -0
  40. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/METADATA +95 -12
  41. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/RECORD +45 -19
  42. magic_pdf/model/360_layout_analysis.py +0 -8
  43. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/LICENSE.md +0 -0
  44. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/WHEEL +0 -0
  45. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/entry_points.txt +0 -0
  46. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.1.dist-info}/top_level.txt +0 -0
magic_pdf/cli/magicpdf.py CHANGED
@@ -28,18 +28,20 @@ from loguru import logger
  from pathlib import Path
  from magic_pdf.libs.version import __version__

- from magic_pdf.libs.MakeContentConfig import DropMode
+ from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
  from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
  from magic_pdf.pipe.UNIPipe import UNIPipe
  from magic_pdf.pipe.OCRPipe import OCRPipe
  from magic_pdf.pipe.TXTPipe import TXTPipe
- from magic_pdf.libs.config_reader import get_s3_config
  from magic_pdf.libs.path_utils import (
      parse_s3path,
      parse_s3_range_params,
      remove_non_official_s3_args,
  )
- from magic_pdf.libs.config_reader import get_local_dir
+ from magic_pdf.libs.config_reader import (
+     get_local_dir,
+     get_s3_config,
+ )
  from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
  from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
  from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
@@ -81,10 +83,12 @@ def do_parse(
      f_dump_model_json=True,
      f_dump_orig_pdf=True,
      f_dump_content_list=True,
+     f_make_md_mode=MakeMode.MM_MD,
  ):
      orig_model_list = copy.deepcopy(model_list)

      local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
+     logger.info(f"local output dir is {local_md_dir}")
      image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
      image_dir = str(os.path.basename(local_image_dir))

@@ -105,6 +109,7 @@ def do_parse(
      if len(model_list) == 0:
          if model_config.__use_inside_model__:
              pipe.pipe_analyze()
+             orig_model_list = copy.deepcopy(pipe.model_list)
          else:
              logger.error("need model list input")
              exit(1)
@@ -116,7 +121,7 @@ def do_parse(
      if f_draw_span_bbox:
          draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)

-     md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE)
+     md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
      if f_dump_md:
          """write markdown"""
          md_writer.write(
@@ -175,8 +180,10 @@ def cli():
      default="auto",
  )
  @click.option("--inside_model", type=click.BOOL, default=False, help="test with the built-in model")
- def json_command(json, method, inside_model):
+ @click.option("--model_mode", type=click.STRING, default="full", help="built-in model selection. lite: fast parsing, lower accuracy; full: high-accuracy parsing, slower")
+ def json_command(json, method, inside_model, model_mode):
      model_config.__use_inside_model__ = inside_model
+     model_config.__model_mode__ = model_mode

      if not json.startswith("s3://"):
          logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
@@ -226,8 +233,10 @@ def json_command(json, method, inside_model):
      default="auto",
  )
  @click.option("--inside_model", type=click.BOOL, default=False, help="test with the built-in model")
- def local_json_command(local_json, method, inside_model):
+ @click.option("--model_mode", type=click.STRING, default="full", help="built-in model selection. lite: fast parsing, lower accuracy; full: high-accuracy parsing, slower")
+ def local_json_command(local_json, method, inside_model, model_mode):
      model_config.__use_inside_model__ = inside_model
+     model_config.__model_mode__ = model_mode

      def read_s3_path(s3path):
          bucket, key = parse_s3path(s3path)
@@ -278,8 +287,10 @@ def local_json_command(local_json, method, inside_model):
      default="auto",
  )
  @click.option("--inside_model", type=click.BOOL, default=False, help="test with the built-in model")
- def pdf_command(pdf, model, method, inside_model):
+ @click.option("--model_mode", type=click.STRING, default="full", help="built-in model selection. lite: fast parsing, lower accuracy; full: high-accuracy parsing, slower")
+ def pdf_command(pdf, model, method, inside_model, model_mode):
      model_config.__use_inside_model__ = inside_model
+     model_config.__model_mode__ = model_mode

      def read_fn(path):
          disk_rw = DiskReaderWriter(os.path.dirname(path))
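Taken together, the three commands above now expose the same `--model_mode` switch, whose only effect is to set module-level flags that `doc_analyze` later consults. A minimal sketch of that mapping, using only names that appear in the hunks (the driver lines themselves are illustrative, not shipped code):

```python
# Illustrative only: what --inside_model / --model_mode set before parsing.
import magic_pdf.model as model_config

model_config.__use_inside_model__ = True   # --inside_model true
model_config.__model_mode__ = "lite"       # --model_mode lite ("full" is the default)
# doc_analyze() (see magic_pdf/model/doc_analyze_by_custom_model.py below)
# then picks the Paddle backend for "lite" and PDF-Extract-Kit for "full".
```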
magic_pdf/dict2md/ocr_mkcontent.py CHANGED
@@ -112,7 +112,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
              for line in block['lines']:
                  for span in line['spans']:
                      if span['type'] == ContentType.Image:
-                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
+                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
      for block in para_block['blocks']:  # 2nd: append image_caption
          if block['type'] == BlockType.ImageCaption:
              para_text += merge_para_with_text(block)
@@ -128,7 +128,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
              for line in block['lines']:
                  for span in line['spans']:
                      if span['type'] == ContentType.Table:
-                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
+                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])}) \n"
      for block in para_block['blocks']:  # 3rd: append table_footnote
          if block['type'] == BlockType.TableFootnote:
              para_text += merge_para_with_text(block)
magic_pdf/libs/config_reader.py CHANGED
@@ -59,5 +59,15 @@ def get_local_dir():
      return config.get("temp-output-dir", "/tmp")


+ def get_local_models_dir():
+     config = read_config()
+     return config.get("models-dir", "/tmp/models")
+
+
+ def get_device():
+     config = read_config()
+     return config.get("device-mode", "cpu")
+
+
  if __name__ == "__main__":
      ak, sk, endpoint = get_s3_config("llm-raw")
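The two new getters follow the same pattern as `get_local_dir()`: read the user config and fall back to a default. A hedged sketch of the lookups (key names and defaults are from the hunk; where `read_config()` finds the config file depends on the installation):

```python
# Key names and defaults come from the hunk above; requires a readable
# user config wherever read_config() looks for it.
from magic_pdf.libs.config_reader import get_device, get_local_models_dir

models_dir = get_local_models_dir()  # "models-dir" key, default "/tmp/models"
device = get_device()                # "device-mode" key, default "cpu"
print(models_dir, device)
```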
magic_pdf/libs/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.5.13"
+ __version__ = "0.6.1"
magic_pdf/model/__init__.py CHANGED
@@ -1 +1,2 @@
  __use_inside_model__ = False
+ __model_mode__ = "full"
magic_pdf/model/doc_analyze_by_custom_model.py CHANGED
@@ -1,6 +1,10 @@
+ import time
+
  import fitz
  import numpy as np
  from loguru import logger
+
+ from magic_pdf.libs.config_reader import get_local_models_dir, get_device
  from magic_pdf.model.model_list import MODEL
  import magic_pdf.model as model_config

@@ -21,10 +25,11 @@ def remove_duplicates_dicts(lst):

  def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
      try:
-         import cv2
          from PIL import Image
      except ImportError:
-         logger.error("opencv-python and Pillow are not installed, please install by pip.")
+         logger.error("Pillow not installed, please install by pip.")
+         exit(1)
+
      images = []
      with fitz.open("pdf", pdf_bytes) as doc:
          for index in range(0, doc.page_count):
@@ -32,32 +37,49 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
              mat = fitz.Matrix(dpi / 72, dpi / 72)
              pm = page.get_pixmap(matrix=mat, alpha=False)

-             # if width or height > 2000 pixels, don't enlarge the image
-             # if pm.width > 2000 or pm.height > 2000:
-             #     pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+             # if width or height > 3000 pixels, don't enlarge the image
+             if pm.width > 3000 or pm.height > 3000:
+                 pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

-             img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
-             img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+             img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
+             img = np.array(img)
              img_dict = {"img": img, "width": pm.width, "height": pm.height}
              images.append(img_dict)
      return images


- def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, model=MODEL.Paddle):
+ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False):
+     model = None
+
+     if model_config.__model_mode__ == "lite":
+         model = MODEL.Paddle
+     elif model_config.__model_mode__ == "full":
+         model = MODEL.PEK

      if model_config.__use_inside_model__:
-         from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
+         model_init_start = time.time()
+         if model == MODEL.Paddle:
+             from magic_pdf.model.pp_structure_v2 import CustomPaddleModel
+             custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
+         elif model == MODEL.PEK:
+             from magic_pdf.model.pdf_extract_kit import CustomPEKModel
+             # read model-dir and device from the config file
+             local_models_dir = get_local_models_dir()
+             device = get_device()
+             custom_model = CustomPEKModel(ocr=ocr, show_log=show_log, models_dir=local_models_dir, device=device)
+         else:
+             logger.error("Not allow model_name!")
+             exit(1)
+         model_init_cost = time.time() - model_init_start
+         logger.info(f"model init cost: {model_init_cost}")
      else:
          logger.error("use_inside_model is False, not allow to use inside model")
          exit(1)

      images = load_images_from_pdf(pdf_bytes)
-     custom_model = None
-     if model == MODEL.Paddle:
-         custom_model = CustomPaddleModel(ocr=ocr, show_log=show_log)
-     else:
-         pass
+
      model_json = []
+     doc_analyze_start = time.time()
      for index, img_dict in enumerate(images):
          img = img_dict["img"]
          page_width = img_dict["width"]
@@ -65,7 +87,8 @@ def doc_analyze(pdf_bytes: bytes, ocr: bool = False, show_log: bool = False, mod
          result = custom_model(img)
          page_info = {"page_no": index, "height": page_height, "width": page_width}
          page_dict = {"layout_dets": result, "page_info": page_info}
-
          model_json.append(page_dict)
+     doc_analyze_cost = time.time() - doc_analyze_start
+     logger.info(f"doc analyze cost: {doc_analyze_cost}")

      return model_json
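`doc_analyze` no longer takes a `model` argument; the backend comes from `model_config.__model_mode__` ("lite" maps to `MODEL.Paddle`, "full" to `MODEL.PEK`). A minimal usage sketch consistent with the new signature (the input file name is made up):

```python
# Sketch only: exercises the new doc_analyze signature from the hunk above.
import magic_pdf.model as model_config
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze

model_config.__use_inside_model__ = True
model_config.__model_mode__ = "full"   # "lite" -> MODEL.Paddle, "full" -> MODEL.PEK

with open("example.pdf", "rb") as f:   # hypothetical input file
    model_json = doc_analyze(f.read(), ocr=True, show_log=False)

# One dict per page: {"layout_dets": [...], "page_info": {...}}
print(len(model_json), model_json[0]["page_info"])
```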
magic_pdf/model/model_list.py CHANGED
@@ -1,2 +1,3 @@
  class MODEL:
      Paddle = "pp_structure_v2"
+     PEK = "pdf_extract_kit"
magic_pdf/model/pdf_extract_kit.py ADDED
@@ -0,0 +1,200 @@
+ from loguru import logger
+ import os
+ try:
+     import cv2
+     import yaml
+     import time
+     import argparse
+     import numpy as np
+     import torch
+
+     from paddleocr import draw_ocr
+     from PIL import Image
+     from torchvision import transforms
+     from torch.utils.data import Dataset, DataLoader
+     from ultralytics import YOLO
+     from unimernet.common.config import Config
+     import unimernet.tasks as tasks
+     from unimernet.processors import load_processor
+
+     from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
+     from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
+     from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
+ except ImportError:
+     logger.error('Required dependency not installed, please install by \n"pip install magic-pdf[full-cpu] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+     exit(1)
+
+
+ def mfd_model_init(weight):
+     mfd_model = YOLO(weight)
+     return mfd_model
+
+
+ def mfr_model_init(weight_dir, cfg_path, _device_='cpu'):
+     args = argparse.Namespace(cfg_path=cfg_path, options=None)
+     cfg = Config(args)
+     cfg.config.model.pretrained = os.path.join(weight_dir, "pytorch_model.bin")
+     cfg.config.model.model_config.model_name = weight_dir
+     cfg.config.model.tokenizer_config.path = weight_dir
+     task = tasks.setup_task(cfg)
+     model = task.build_model(cfg)
+     model = model.to(_device_)
+     vis_processor = load_processor('formula_image_eval', cfg.config.datasets.formula_rec_eval.vis_processor.eval)
+     return model, vis_processor
+
+
+ def layout_model_init(weight, config_file, device):
+     model = Layoutlmv3_Predictor(weight, config_file, device)
+     return model
+
+
+ class MathDataset(Dataset):
+     def __init__(self, image_paths, transform=None):
+         self.image_paths = image_paths
+         self.transform = transform
+
+     def __len__(self):
+         return len(self.image_paths)
+
+     def __getitem__(self, idx):
+         # if not pil image, then convert to pil image
+         if isinstance(self.image_paths[idx], str):
+             raw_image = Image.open(self.image_paths[idx])
+         else:
+             raw_image = self.image_paths[idx]
+         if self.transform:
+             image = self.transform(raw_image)
+             return image
+
+
+ class CustomPEKModel:
+
+     def __init__(self, ocr: bool = False, show_log: bool = False, **kwargs):
+         """
+         ======== model init ========
+         """
+         # absolute path of the current file (pdf_extract_kit.py)
+         current_file_path = os.path.abspath(__file__)
+         # directory containing the current file (model)
+         current_dir = os.path.dirname(current_file_path)
+         # parent directory (magic_pdf)
+         root_dir = os.path.dirname(current_dir)
+         # model_config directory
+         model_config_dir = os.path.join(root_dir, 'resources', 'model_config')
+         # full path to model_configs.yaml
+         config_path = os.path.join(model_config_dir, 'model_configs.yaml')
+         with open(config_path, "r") as f:
+             self.configs = yaml.load(f, Loader=yaml.FullLoader)
+         # initialize parsing configuration
+         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
+         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+         self.apply_ocr = ocr
+         logger.info(
+             "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
+                 self.apply_layout, self.apply_formula, self.apply_ocr
+             )
+         )
+         assert self.apply_layout, "DocAnalysis must contain layout model."
+         # initialize the parsing backends
+         self.device = kwargs.get("device", self.configs["config"]["device"])
+         logger.info("using device: {}".format(self.device))
+         models_dir = kwargs.get("models_dir", os.path.join(root_dir, "resources", "models"))
+
+         # initialize formula recognition
+         if self.apply_formula:
+             # initialize the formula detection model
+             self.mfd_model = mfd_model_init(str(os.path.join(models_dir, self.configs["weights"]["mfd"])))
+
+             # initialize the formula recognition model
+             mfr_weight_dir = str(os.path.join(models_dir, self.configs["weights"]["mfr"]))
+             mfr_cfg_path = str(os.path.join(model_config_dir, "UniMERNet", "demo.yaml"))
+             self.mfr_model, mfr_vis_processors = mfr_model_init(mfr_weight_dir, mfr_cfg_path, _device_=self.device)
+             self.mfr_transform = transforms.Compose([mfr_vis_processors, ])
+
+         # initialize the layout model
+         self.layout_model = Layoutlmv3_Predictor(
+             str(os.path.join(models_dir, self.configs['weights']['layout'])),
+             str(os.path.join(model_config_dir, "layoutlmv3", "layoutlmv3_base_inference.yaml")),
+             device=self.device
+         )
+         # initialize OCR
+         if self.apply_ocr:
+             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
+
+         logger.info('DocAnalysis init done!')
+
+     def __call__(self, image):
+
+         latex_filling_list = []
+         mf_image_list = []
+
+         # layout detection
+         layout_start = time.time()
+         layout_res = self.layout_model(image, ignore_catids=[])
+         layout_cost = round(time.time() - layout_start, 2)
+         logger.info(f"layout detection cost: {layout_cost}")
+
+         # formula detection
+         mfd_res = self.mfd_model.predict(image, imgsz=1888, conf=0.25, iou=0.45, verbose=True)[0]
+         for xyxy, conf, cla in zip(mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()):
+             xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
+             new_item = {
+                 'category_id': 13 + int(cla.item()),
+                 'poly': [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
+                 'score': round(float(conf.item()), 2),
+                 'latex': '',
+             }
+             layout_res.append(new_item)
+             latex_filling_list.append(new_item)
+             bbox_img = get_croped_image(Image.fromarray(image), [xmin, ymin, xmax, ymax])
+             mf_image_list.append(bbox_img)
+
+         # formula recognition
+         mfr_start = time.time()
+         dataset = MathDataset(mf_image_list, transform=self.mfr_transform)
+         dataloader = DataLoader(dataset, batch_size=64, num_workers=0)
+         mfr_res = []
+         for mf_img in dataloader:
+             mf_img = mf_img.to(self.device)
+             output = self.mfr_model.generate({'image': mf_img})
+             mfr_res.extend(output['pred_str'])
+         for res, latex in zip(latex_filling_list, mfr_res):
+             res['latex'] = latex_rm_whitespace(latex)
+         mfr_cost = round(time.time() - mfr_start, 2)
+         logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+
+         # OCR
+         if self.apply_ocr:
+             ocr_start = time.time()
+             pil_img = Image.fromarray(image)
+             single_page_mfdetrec_res = []
+             for res in layout_res:
+                 if int(res['category_id']) in [13, 14]:
+                     xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
+                     xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
+                     single_page_mfdetrec_res.append({
+                         "bbox": [xmin, ymin, xmax, ymax],
+                     })
+             for res in layout_res:
+                 if int(res['category_id']) in [0, 1, 2, 4, 6, 7]:  # categories that need OCR
+                     xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
+                     xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
+                     crop_box = (xmin, ymin, xmax, ymax)
+                     cropped_img = Image.new('RGB', pil_img.size, 'white')
+                     cropped_img.paste(pil_img.crop(crop_box), crop_box)
+                     cropped_img = cv2.cvtColor(np.asarray(cropped_img), cv2.COLOR_RGB2BGR)
+                     ocr_res = self.ocr_model.ocr(cropped_img, mfd_res=single_page_mfdetrec_res)[0]
+                     if ocr_res:
+                         for box_ocr_res in ocr_res:
+                             p1, p2, p3, p4 = box_ocr_res[0]
+                             text, score = box_ocr_res[1]
+                             layout_res.append({
+                                 'category_id': 15,
+                                 'poly': p1 + p2 + p3 + p4,
+                                 'score': round(score, 2),
+                                 'text': text,
+                             })
+             ocr_cost = round(time.time() - ocr_start, 2)
+             logger.info(f"ocr cost: {ocr_cost}")
+
+         return layout_res
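`CustomPEKModel.__call__` returns one flat `layout_res` list per page, mixing layout regions from LayoutLMv3, formula detections (category_id 13/14, whose `latex` field is filled in by the UniMERNet pass), and OCR lines (category_id 15). Illustrative entries, with field names taken from the code above and values invented:

```python
# Field names are from the code above; the values are invented examples.
formula_item = {
    "category_id": 13,                            # 13 + YOLO class id
    "poly": [60, 40, 420, 40, 420, 90, 60, 90],   # 4-point quad from the bbox
    "score": 0.93,
    "latex": "E=mc^{2}",                          # filled by the MFR pass
}
ocr_item = {
    "category_id": 15,
    "poly": [60, 120, 400, 120, 400, 150, 60, 150],
    "score": 0.99,
    "text": "An example recognized text line.",
}
```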
magic_pdf/model/pek_sub_modules/__init__.py: File without changes
magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py: File without changes
magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py ADDED
@@ -0,0 +1,179 @@
+ # --------------------------------------------------------------------------------
+ # VIT: Multi-Path Vision Transformer for Dense Prediction
+ # Copyright (c) 2022 Electronics and Telecommunications Research Institute (ETRI).
+ # All Rights Reserved.
+ # Written by Youngwan Lee
+ # This source code is licensed(Dual License(GPL3.0 & Commercial)) under the license found in the
+ # LICENSE file in the root directory of this source tree.
+ # --------------------------------------------------------------------------------
+ # References:
+ # timm: https://github.com/rwightman/pytorch-image-models/tree/master/timm
+ # CoaT: https://github.com/mlpc-ucsd/CoaT
+ # --------------------------------------------------------------------------------
+
+
+ import torch
+
+ from detectron2.layers import (
+     ShapeSpec,
+ )
+ from detectron2.modeling import Backbone, BACKBONE_REGISTRY, FPN
+ from detectron2.modeling.backbone.fpn import LastLevelP6P7, LastLevelMaxPool
+
+ from .beit import beit_base_patch16, dit_base_patch16, dit_large_patch16, beit_large_patch16
+ from .deit import deit_base_patch16, mae_base_patch16
+ from .layoutlmft.models.layoutlmv3 import LayoutLMv3Model
+ from transformers import AutoConfig
+
+ __all__ = [
+     "build_vit_fpn_backbone",
+ ]
+
+
+ class VIT_Backbone(Backbone):
+     """
+     Implement VIT backbone.
+     """
+
+     def __init__(self, name, out_features, drop_path, img_size, pos_type, model_kwargs,
+                  config_path=None, image_only=False, cfg=None):
+         super().__init__()
+         self._out_features = out_features
+         if 'base' in name:
+             self._out_feature_strides = {"layer3": 4, "layer5": 8, "layer7": 16, "layer11": 32}
+             self._out_feature_channels = {"layer3": 768, "layer5": 768, "layer7": 768, "layer11": 768}
+         else:
+             self._out_feature_strides = {"layer7": 4, "layer11": 8, "layer15": 16, "layer23": 32}
+             self._out_feature_channels = {"layer7": 1024, "layer11": 1024, "layer15": 1024, "layer23": 1024}
+
+         if name == 'beit_base_patch16':
+             model_func = beit_base_patch16
+         elif name == 'dit_base_patch16':
+             model_func = dit_base_patch16
+         elif name == "deit_base_patch16":
+             model_func = deit_base_patch16
+         elif name == "mae_base_patch16":
+             model_func = mae_base_patch16
+         elif name == "dit_large_patch16":
+             model_func = dit_large_patch16
+         elif name == "beit_large_patch16":
+             model_func = beit_large_patch16
+
+         if 'beit' in name or 'dit' in name:
+             if pos_type == "abs":
+                 self.backbone = model_func(img_size=img_size,
+                                            out_features=out_features,
+                                            drop_path_rate=drop_path,
+                                            use_abs_pos_emb=True,
+                                            **model_kwargs)
+             elif pos_type == "shared_rel":
+                 self.backbone = model_func(img_size=img_size,
+                                            out_features=out_features,
+                                            drop_path_rate=drop_path,
+                                            use_shared_rel_pos_bias=True,
+                                            **model_kwargs)
+             elif pos_type == "rel":
+                 self.backbone = model_func(img_size=img_size,
+                                            out_features=out_features,
+                                            drop_path_rate=drop_path,
+                                            use_rel_pos_bias=True,
+                                            **model_kwargs)
+             else:
+                 raise ValueError()
+         elif "layoutlmv3" in name:
+             config = AutoConfig.from_pretrained(config_path)
+             # disable relative bias as DiT
+             config.has_spatial_attention_bias = False
+             config.has_relative_attention_bias = False
+             self.backbone = LayoutLMv3Model(config, detection=True,
+                                             out_features=out_features, image_only=image_only)
+         else:
+             self.backbone = model_func(img_size=img_size,
+                                        out_features=out_features,
+                                        drop_path_rate=drop_path,
+                                        **model_kwargs)
+         self.name = name
+
+     def forward(self, x):
+         """
+         Args:
+             x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
+
+         Returns:
+             dict[str->Tensor]: names and the corresponding features
+         """
+         if "layoutlmv3" in self.name:
+             return self.backbone.forward(
+                 input_ids=x["input_ids"] if "input_ids" in x else None,
+                 bbox=x["bbox"] if "bbox" in x else None,
+                 images=x["images"] if "images" in x else None,
+                 attention_mask=x["attention_mask"] if "attention_mask" in x else None,
+                 # output_hidden_states=True,
+             )
+         assert x.dim() == 4, f"VIT takes an input of shape (N, C, H, W). Got {x.shape} instead!"
+         return self.backbone.forward_features(x)
+
+     def output_shape(self):
+         return {
+             name: ShapeSpec(
+                 channels=self._out_feature_channels[name], stride=self._out_feature_strides[name]
+             )
+             for name in self._out_features
+         }
+
+
+ def build_VIT_backbone(cfg):
+     """
+     Create a VIT instance from config.
+
+     Args:
+         cfg: a detectron2 CfgNode
+
+     Returns:
+         A VIT backbone instance.
+     """
+     # fmt: off
+     name = cfg.MODEL.VIT.NAME
+     out_features = cfg.MODEL.VIT.OUT_FEATURES
+     drop_path = cfg.MODEL.VIT.DROP_PATH
+     img_size = cfg.MODEL.VIT.IMG_SIZE
+     pos_type = cfg.MODEL.VIT.POS_TYPE
+
+     model_kwargs = eval(str(cfg.MODEL.VIT.MODEL_KWARGS).replace("`", ""))
+
+     if 'layoutlmv3' in name:
+         if cfg.MODEL.CONFIG_PATH != '':
+             config_path = cfg.MODEL.CONFIG_PATH
+         else:
+             config_path = cfg.MODEL.WEIGHTS.replace('pytorch_model.bin', '')  # layoutlmv3 pre-trained models
+             config_path = config_path.replace('model_final.pth', '')  # detection fine-tuned models
+     else:
+         config_path = None
+
+     return VIT_Backbone(name, out_features, drop_path, img_size, pos_type, model_kwargs,
+                         config_path=config_path, image_only=cfg.MODEL.IMAGE_ONLY, cfg=cfg)
+
+
+ @BACKBONE_REGISTRY.register()
+ def build_vit_fpn_backbone(cfg, input_shape: ShapeSpec):
+     """
+     Create a VIT w/ FPN backbone.
+
+     Args:
+         cfg: a detectron2 CfgNode
+
+     Returns:
+         backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`.
+     """
+     bottom_up = build_VIT_backbone(cfg)
+     in_features = cfg.MODEL.FPN.IN_FEATURES
+     out_channels = cfg.MODEL.FPN.OUT_CHANNELS
+     backbone = FPN(
+         bottom_up=bottom_up,
+         in_features=in_features,
+         out_channels=out_channels,
+         norm=cfg.MODEL.FPN.NORM,
+         top_block=LastLevelMaxPool(),
+         fuse_type=cfg.MODEL.FPN.FUSE_TYPE,
+     )
+     return backbone
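`build_vit_fpn_backbone` only becomes reachable once this module is imported, because registration happens via the `@BACKBONE_REGISTRY.register()` decorator. A hedged sketch of selecting it through a detectron2 config (this assumes the config file sets `MODEL.BACKBONE.NAME` to `build_vit_fpn_backbone` and defines the custom `MODEL.VIT.*` keys read above; the bundled layoutlmv3_base_inference.yaml is the likely candidate, but that is an assumption here):

```python
# Hedged sketch: building the registered backbone through detectron2.
# Assumes the YAML provides MODEL.BACKBONE.NAME = "build_vit_fpn_backbone"
# and the MODEL.VIT.* keys consumed by build_VIT_backbone above.
from detectron2.config import get_cfg
from detectron2.modeling import build_backbone

import magic_pdf.model.pek_sub_modules.layoutlmv3.backbone  # noqa: F401 (triggers registration)

cfg = get_cfg()
cfg.set_new_allowed(True)  # permit the non-standard MODEL.VIT / MODEL.CONFIG_PATH keys
cfg.merge_from_file("magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml")
backbone = build_backbone(cfg)  # dispatches through BACKBONE_REGISTRY
print(backbone.output_shape())
```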