magic-pdf 0.5.13__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. magic_pdf/cli/magicpdf.py +18 -7
  2. magic_pdf/libs/config_reader.py +10 -0
  3. magic_pdf/libs/version.py +1 -1
  4. magic_pdf/model/__init__.py +1 -0
  5. magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
  6. magic_pdf/model/model_list.py +1 -0
  7. magic_pdf/model/pdf_extract_kit.py +196 -0
  8. magic_pdf/model/pek_sub_modules/__init__.py +0 -0
  9. magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
  10. magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
  11. magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
  12. magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
  13. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
  14. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
  15. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
  16. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
  17. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
  18. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
  19. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
  20. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
  21. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
  22. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
  23. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
  24. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
  25. magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
  26. magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
  27. magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
  28. magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
  29. magic_pdf/model/pek_sub_modules/post_process.py +36 -0
  30. magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
  31. magic_pdf/model/pp_structure_v2.py +7 -0
  32. magic_pdf/pipe/AbsPipe.py +8 -14
  33. magic_pdf/pipe/OCRPipe.py +12 -8
  34. magic_pdf/pipe/TXTPipe.py +12 -8
  35. magic_pdf/pipe/UNIPipe.py +9 -7
  36. magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
  37. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
  38. magic_pdf/resources/model_config/model_configs.yaml +9 -0
  39. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/METADATA +18 -8
  40. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/RECORD +44 -18
  41. magic_pdf/model/360_layout_analysis.py +0 -8
  42. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/LICENSE.md +0 -0
  43. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/WHEEL +0 -0
  44. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/entry_points.txt +0 -0
  45. {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py
@@ -0,0 +1,32 @@
+ # coding=utf-8
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tokenization classes for LayoutLMv3, refer to RoBERTa."""
+
+ from transformers.models.roberta import RobertaTokenizer
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {
+     "vocab_file": "vocab.json",
+     "merges_file": "merges.txt",
+ }
+
+ class LayoutLMv3Tokenizer(RobertaTokenizer):
+     vocab_files_names = VOCAB_FILES_NAMES
+     # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py
@@ -0,0 +1,34 @@
+ # coding=utf-8
+ # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Fast Tokenization classes for LayoutLMv3, refer to RoBERTa."""
+
+
+ from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
+ from transformers.utils import logging
+
+ from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
+
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+ class LayoutLMv3TokenizerFast(RobertaTokenizerFast):
+     vocab_files_names = VOCAB_FILES_NAMES
+     # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+     slow_tokenizer_class = LayoutLMv3Tokenizer
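
The two new tokenizer modules above are thin subclasses of the RoBERTa tokenizers: they only re-point the vocabulary file names and, for the fast variant, the slow-tokenizer class. A minimal usage sketch (not part of the diff), assuming a local checkpoint directory containing vocab.json, merges.txt and tokenizer.json; the directory path is a placeholder:

    from magic_pdf.model.pek_sub_modules.layoutlmv3.layoutlmft.models.layoutlmv3.tokenization_layoutlmv3_fast import (
        LayoutLMv3TokenizerFast,
    )

    # Hypothetical local directory holding vocab.json / merges.txt / tokenizer.json.
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("/path/to/layoutlmv3-base")
    encoded = tokenizer("A layout-aware sentence.", return_tensors="pt")
    print(encoded["input_ids"].shape)  # model_input_names: input_ids, attention_mask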
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py
@@ -0,0 +1,150 @@
+ from .visualizer import Visualizer
+ from .rcnn_vl import *
+ from .backbone import *
+
+ from detectron2.config import get_cfg
+ from detectron2.config import CfgNode as CN
+ from detectron2.data import MetadataCatalog, DatasetCatalog
+ from detectron2.data.datasets import register_coco_instances
+ from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor
+
+
+ def add_vit_config(cfg):
+     """
+     Add config for VIT.
+     """
+     _C = cfg
+
+     _C.MODEL.VIT = CN()
+
+     # CoaT model name.
+     _C.MODEL.VIT.NAME = ""
+
+     # Output features from CoaT backbone.
+     _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"]
+
+     _C.MODEL.VIT.IMG_SIZE = [224, 224]
+
+     _C.MODEL.VIT.POS_TYPE = "shared_rel"
+
+     _C.MODEL.VIT.DROP_PATH = 0.
+
+     _C.MODEL.VIT.MODEL_KWARGS = "{}"
+
+     _C.SOLVER.OPTIMIZER = "ADAMW"
+
+     _C.SOLVER.BACKBONE_MULTIPLIER = 1.0
+
+     _C.AUG = CN()
+
+     _C.AUG.DETR = False
+
+     _C.MODEL.IMAGE_ONLY = True
+     _C.PUBLAYNET_DATA_DIR_TRAIN = ""
+     _C.PUBLAYNET_DATA_DIR_TEST = ""
+     _C.FOOTNOTE_DATA_DIR_TRAIN = ""
+     _C.FOOTNOTE_DATA_DIR_VAL = ""
+     _C.SCIHUB_DATA_DIR_TRAIN = ""
+     _C.SCIHUB_DATA_DIR_TEST = ""
+     _C.JIAOCAI_DATA_DIR_TRAIN = ""
+     _C.JIAOCAI_DATA_DIR_TEST = ""
+     _C.ICDAR_DATA_DIR_TRAIN = ""
+     _C.ICDAR_DATA_DIR_TEST = ""
+     _C.M6DOC_DATA_DIR_TEST = ""
+     _C.DOCSTRUCTBENCH_DATA_DIR_TEST = ""
+     _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = ""
+     _C.CACHE_DIR = ""
+     _C.MODEL.CONFIG_PATH = ""
+
+     # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS
+     # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS
+     _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1
+
+
+ def setup(args, device):
+     """
+     Create configs and perform basic setups.
+     """
+     cfg = get_cfg()
+
+     # add_coat_config(cfg)
+     add_vit_config(cfg)
+     cfg.merge_from_file(args.config_file)
+     cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2  # set threshold for this model
+     cfg.merge_from_list(args.opts)
+
+     # use the unified device configuration
+     cfg.MODEL.DEVICE = device
+
+     cfg.freeze()
+     default_setup(cfg, args)
+
+     register_coco_instances(
+         "scihub_train",
+         {},
+         cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
+         cfg.SCIHUB_DATA_DIR_TRAIN
+     )
+
+     return cfg
+
+
+ class DotDict(dict):
+     def __init__(self, *args, **kwargs):
+         super(DotDict, self).__init__(*args, **kwargs)
+
+     def __getattr__(self, key):
+         if key not in self.keys():
+             return None
+         value = self[key]
+         if isinstance(value, dict):
+             value = DotDict(value)
+         return value
+
+     def __setattr__(self, key, value):
+         self[key] = value
+
+
+ class Layoutlmv3_Predictor(object):
+     def __init__(self, weights, config_file, device):
+         layout_args = {
+             "config_file": config_file,
+             "resume": False,
+             "eval_only": False,
+             "num_gpus": 1,
+             "num_machines": 1,
+             "machine_rank": 0,
+             "dist_url": "tcp://127.0.0.1:57823",
+             "opts": ["MODEL.WEIGHTS", weights],
+         }
+         layout_args = DotDict(layout_args)
+
+         cfg = setup(layout_args, device)
+         self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption",
+                         "table_footnote", "isolate_formula", "formula_caption"]
+         MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping
+         self.predictor = DefaultPredictor(cfg)
+
+     def __call__(self, image, ignore_catids=[]):
+         # page_layout_result = {
+         #     "layout_dets": []
+         # }
+         layout_dets = []
+         outputs = self.predictor(image)
+         boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist()
+         labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist()
+         scores = outputs["instances"].to("cpu")._fields["scores"].tolist()
+         for bbox_idx in range(len(boxes)):
+             if labels[bbox_idx] in ignore_catids:
+                 continue
+             layout_dets.append({
+                 "category_id": labels[bbox_idx],
+                 "poly": [
+                     boxes[bbox_idx][0], boxes[bbox_idx][1],
+                     boxes[bbox_idx][2], boxes[bbox_idx][1],
+                     boxes[bbox_idx][2], boxes[bbox_idx][3],
+                     boxes[bbox_idx][0], boxes[bbox_idx][3],
+                 ],
+                 "score": scores[bbox_idx]
+             })
+         return layout_dets
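
Layoutlmv3_Predictor wraps a detectron2 DefaultPredictor and, per detected region, returns a dict with a category_id (index into the mapping list above), a clockwise four-point poly and a score. A hedged usage sketch (not part of the diff), assuming a downloaded weights file, the bundled layoutlmv3_base_inference.yaml and a BGR numpy image; all paths below are placeholders:

    import cv2  # only used to load a test page; any numpy image array works
    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor

    # Placeholder paths: point these at the downloaded model weights and the
    # layoutlmv3_base_inference.yaml shipped under magic_pdf/resources/model_config/.
    predictor = Layoutlmv3_Predictor(
        weights="/models/layoutlmv3/model_final.pth",
        config_file="magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml",
        device="cuda",  # or "cpu"
    )

    image = cv2.imread("page_0.png")  # DefaultPredictor expects BGR input by default
    layout_dets = predictor(image, ignore_catids=[2])  # e.g. drop the "abandon" class (index 2)
    for det in layout_dets:
        print(det["category_id"], round(det["score"], 3), det["poly"])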
magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py
@@ -0,0 +1,163 @@
+ # Copyright (c) Facebook, Inc. and its affiliates.
+ import logging
+ import numpy as np
+ from typing import Dict, List, Optional, Tuple
+ import torch
+ from torch import nn
+
+ from detectron2.config import configurable
+ from detectron2.structures import ImageList, Instances
+ from detectron2.utils.events import get_event_storage
+
+ from detectron2.modeling.backbone import Backbone, build_backbone
+ from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
+
+ from detectron2.modeling.meta_arch import GeneralizedRCNN
+
+ from detectron2.modeling.postprocessing import detector_postprocess
+ from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
+ from contextlib import contextmanager
+ from itertools import count
+
+ @META_ARCH_REGISTRY.register()
+ class VLGeneralizedRCNN(GeneralizedRCNN):
+     """
+     Generalized R-CNN. Any models that contains the following three components:
+     1. Per-image feature extraction (aka backbone)
+     2. Region proposal generation
+     3. Per-region feature extraction and prediction
+     """
+
+     def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
+         """
+         Args:
+             batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
+                 Each item in the list contains the inputs for one image.
+                 For now, each item in the list is a dict that contains:
+
+                 * image: Tensor, image in (C, H, W) format.
+                 * instances (optional): groundtruth :class:`Instances`
+                 * proposals (optional): :class:`Instances`, precomputed proposals.
+
+                 Other information that's included in the original dicts, such as:
+
+                 * "height", "width" (int): the output resolution of the model, used in inference.
+                   See :meth:`postprocess` for details.
+
+         Returns:
+             list[dict]:
+                 Each dict is the output for one input image.
+                 The dict contains one key "instances" whose value is a :class:`Instances`.
+                 The :class:`Instances` object has the following keys:
+                 "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
+         """
+         if not self.training:
+             return self.inference(batched_inputs)
+
+         images = self.preprocess_image(batched_inputs)
+         if "instances" in batched_inputs[0]:
+             gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
+         else:
+             gt_instances = None
+
+         # features = self.backbone(images.tensor)
+         input = self.get_batch(batched_inputs, images)
+         features = self.backbone(input)
+
+         if self.proposal_generator is not None:
+             proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
+         else:
+             assert "proposals" in batched_inputs[0]
+             proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+             proposal_losses = {}
+
+         _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
+         if self.vis_period > 0:
+             storage = get_event_storage()
+             if storage.iter % self.vis_period == 0:
+                 self.visualize_training(batched_inputs, proposals)
+
+         losses = {}
+         losses.update(detector_losses)
+         losses.update(proposal_losses)
+         return losses
+
+     def inference(
+         self,
+         batched_inputs: List[Dict[str, torch.Tensor]],
+         detected_instances: Optional[List[Instances]] = None,
+         do_postprocess: bool = True,
+     ):
+         """
+         Run inference on the given inputs.
+
+         Args:
+             batched_inputs (list[dict]): same as in :meth:`forward`
+             detected_instances (None or list[Instances]): if not None, it
+                 contains an `Instances` object per image. The `Instances`
+                 object contains "pred_boxes" and "pred_classes" which are
+                 known boxes in the image.
+                 The inference will then skip the detection of bounding boxes,
+                 and only predict other per-ROI outputs.
+             do_postprocess (bool): whether to apply post-processing on the outputs.
+
+         Returns:
+             When do_postprocess=True, same as in :meth:`forward`.
+             Otherwise, a list[Instances] containing raw network outputs.
+         """
+         assert not self.training
+
+         images = self.preprocess_image(batched_inputs)
+         # features = self.backbone(images.tensor)
+         input = self.get_batch(batched_inputs, images)
+         features = self.backbone(input)
+
+         if detected_instances is None:
+             if self.proposal_generator is not None:
+                 proposals, _ = self.proposal_generator(images, features, None)
+             else:
+                 assert "proposals" in batched_inputs[0]
+                 proposals = [x["proposals"].to(self.device) for x in batched_inputs]
+
+             results, _ = self.roi_heads(images, features, proposals, None)
+         else:
+             detected_instances = [x.to(self.device) for x in detected_instances]
+             results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
+
+         if do_postprocess:
+             assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
+             return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
+         else:
+             return results
+
+     def get_batch(self, examples, images):
+         if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only
+             return {"images": images.tensor}
+
+         return input
+
+     def _batch_inference(self, batched_inputs, detected_instances=None):
+         """
+         Execute inference on a list of inputs,
+         using batch size = self.batch_size (e.g., 2), instead of the length of the list.
+
+         Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
+         """
+         if detected_instances is None:
+             detected_instances = [None] * len(batched_inputs)
+
+         outputs = []
+         inputs, instances = [], []
+         for idx, input, instance in zip(count(), batched_inputs, detected_instances):
+             inputs.append(input)
+             instances.append(instance)
+             if len(inputs) == 2 or idx == len(batched_inputs) - 1:
+                 outputs.extend(
+                     self.inference(
+                         inputs,
+                         instances if instances[0] is not None else None,
+                         do_postprocess=True,  # False
+                     )
+                 )
+                 inputs, instances = [], []
+         return outputs
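
VLGeneralizedRCNN is registered with detectron2's META_ARCH_REGISTRY, so it is never constructed directly; detectron2 instantiates whatever class the config names as the meta-architecture (which is how the Layoutlmv3_Predictor above ends up using it via DefaultPredictor). A minimal sketch of that mechanism, not part of the diff, assuming the bundled YAML names VLGeneralizedRCNN in MODEL.META_ARCHITECTURE; the config path is a placeholder:

    from detectron2.config import get_cfg
    from detectron2.modeling import build_model

    from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import add_vit_config
    import magic_pdf.model.pek_sub_modules.layoutlmv3.rcnn_vl  # noqa: F401  (importing runs the registry decorator)

    cfg = get_cfg()
    add_vit_config(cfg)  # add the custom MODEL.VIT / AUG keys before merging the YAML
    cfg.merge_from_file("/path/to/layoutlmv3_base_inference.yaml")  # placeholder path
    # build_model() looks up cfg.MODEL.META_ARCHITECTURE in META_ARCH_REGISTRY,
    # which is where the @META_ARCH_REGISTRY.register() decorator above comes in.
    model = build_model(cfg)
    model.eval()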