magic-pdf 1.2.2__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff compares the contents of two package versions as released to one of the supported public registries. It is provided for informational purposes only and reflects the versions exactly as they appear in those registries.
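Note: a listing like the one below can be reproduced locally by downloading both wheels and comparing their unpacked trees. A minimal sketch, assuming only standard tooling (pip and a POSIX diff); the directory paths are illustrative:

```python
import subprocess
import zipfile
from pathlib import Path

for version in ("1.2.2", "1.3.1"):
    dest = Path(f"wheels/{version}")
    dest.mkdir(parents=True, exist_ok=True)
    # Fetch the published wheel without installing it or its dependencies
    subprocess.run(
        ["pip", "download", f"magic-pdf=={version}", "--no-deps",
         "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    # A wheel is a zip archive; unpack it so the trees can be compared
    wheel = next(dest.glob("*.whl"))
    with zipfile.ZipFile(wheel) as zf:
        zf.extractall(dest / "unpacked")

# A recursive diff of the two unpacked trees yields a file listing like this page's
subprocess.run(["diff", "-ru", "wheels/1.2.2/unpacked", "wheels/1.3.1/unpacked"])
```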
Files changed (102)
  1. magic_pdf/data/batch_build_dataset.py +156 -0
  2. magic_pdf/data/dataset.py +56 -25
  3. magic_pdf/data/utils.py +108 -9
  4. magic_pdf/dict2md/ocr_mkcontent.py +4 -3
  5. magic_pdf/libs/pdf_image_tools.py +11 -6
  6. magic_pdf/libs/performance_stats.py +12 -1
  7. magic_pdf/libs/version.py +1 -1
  8. magic_pdf/model/batch_analyze.py +175 -201
  9. magic_pdf/model/doc_analyze_by_custom_model.py +142 -92
  10. magic_pdf/model/pdf_extract_kit.py +5 -38
  11. magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
  12. magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
  13. magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
  14. magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
  15. magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
  16. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
  17. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
  18. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
  19. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
  20. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
  21. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
  22. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
  23. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
  24. magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
  25. magic_pdf/model/sub_modules/model_init.py +50 -37
  26. magic_pdf/model/sub_modules/model_utils.py +18 -12
  27. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
  28. magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
  29. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
  30. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
  31. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
  32. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
  33. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
  34. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
  35. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
  36. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
  37. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
  38. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
  39. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
  40. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
  41. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
  42. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
  43. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
  44. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
  45. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
  46. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
  47. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
  48. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
  49. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
  50. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
  51. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
  52. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
  53. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
  54. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
  55. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
  56. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
  57. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
  58. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
  59. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
  60. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
  61. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
  62. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
  63. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
  64. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
  65. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
  66. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
  67. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
  68. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
  69. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
  70. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
  71. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
  72. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
  73. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
  74. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
  75. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
  76. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
  77. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
  78. magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
  79. magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +15 -19
  80. magic_pdf/pdf_parse_union_core_v2.py +112 -74
  81. magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
  82. magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
  83. magic_pdf/resources/model_config/model_configs.yaml +1 -1
  84. magic_pdf/resources/slanet_plus/slanet-plus.onnx +0 -0
  85. magic_pdf/tools/cli.py +30 -12
  86. magic_pdf/tools/common.py +90 -12
  87. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/METADATA +92 -59
  88. magic_pdf-1.3.1.dist-info/RECORD +203 -0
  89. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/WHEEL +1 -1
  90. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
  91. magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
  92. magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
  93. magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
  94. magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
  95. magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
  96. magic_pdf-1.2.2.dist-info/RECORD +0 -147
  97. /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
  98. /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
  99. /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
  100. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/LICENSE.md +0 -0
  101. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/entry_points.txt +0 -0
  102. {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.1.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
@@ -0,0 +1,9 @@
+ from .configuration_unimer_swin import UnimerSwinConfig
+ from .modeling_unimer_swin import UnimerSwinModel
+ from .image_processing_unimer_swin import UnimerSwinImageProcessor
+
+ __all__ = [
+     "UnimerSwinConfig",
+     "UnimerSwinModel",
+     "UnimerSwinImageProcessor",
+ ]
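Note: these re-exports make the three UnimerSwin pieces importable from the subpackage itself. A hedged sketch of what that enables (the import path follows the file paths listed above; this exact snippet does not appear in the diff):

```python
from magic_pdf.model.sub_modules.mfr.unimernet.unimernet_hf.unimer_swin import (
    UnimerSwinConfig,
    UnimerSwinImageProcessor,
    UnimerSwinModel,
)

config = UnimerSwinConfig()             # defaults mirror the Donut Swin base config
model = UnimerSwinModel(config)         # randomly initialized vision encoder
processor = UnimerSwinImageProcessor()  # resizes, pads, and normalizes page crops
```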
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
@@ -0,0 +1,132 @@
+ # coding=utf-8
+ # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Donut Swin Transformer model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+
+ logger = logging.get_logger(__name__)
+
+
+ class UnimerSwinConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a
+     Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the Donut
+     [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         image_size (`int`, *optional*, defaults to 224):
+             The size (resolution) of each image.
+         patch_size (`int`, *optional*, defaults to 4):
+             The size (resolution) of each patch.
+         num_channels (`int`, *optional*, defaults to 3):
+             The number of input channels.
+         embed_dim (`int`, *optional*, defaults to 96):
+             Dimensionality of patch embedding.
+         depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+             Depth of each layer in the Transformer encoder.
+         num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
+             Number of attention heads in each layer of the Transformer encoder.
+         window_size (`int`, *optional*, defaults to 7):
+             Size of windows.
+         mlp_ratio (`float`, *optional*, defaults to 4.0):
+             Ratio of MLP hidden dimensionality to embedding dimensionality.
+         qkv_bias (`bool`, *optional*, defaults to `True`):
+             Whether or not a learnable bias should be added to the queries, keys and values.
+         hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+             The dropout probability for all fully connected layers in the embeddings and encoder.
+         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+         drop_path_rate (`float`, *optional*, defaults to 0.1):
+             Stochastic depth rate.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+             The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+             `"selu"` and `"gelu_new"` are supported.
+         use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether or not to add absolute position embeddings to the patch embeddings.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the layer normalization layers.
+
+     Example:
+
+     ```python
+     >>> from transformers import UnimerSwinConfig, UnimerSwinModel
+
+     >>> # Initializing a Donut naver-clova-ix/donut-base style configuration
+     >>> configuration = UnimerSwinConfig()
+
+     >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
+     >>> model = UnimerSwinModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "unimer-swin"
+
+     attribute_map = {
+         "num_attention_heads": "num_heads",
+         "num_hidden_layers": "num_layers",
+     }
+
+     def __init__(
+         self,
+         image_size=224,
+         patch_size=4,
+         num_channels=3,
+         embed_dim=96,
+         depths=[2, 2, 6, 2],
+         num_heads=[3, 6, 12, 24],
+         window_size=7,
+         mlp_ratio=4.0,
+         qkv_bias=True,
+         hidden_dropout_prob=0.0,
+         attention_probs_dropout_prob=0.0,
+         drop_path_rate=0.1,
+         hidden_act="gelu",
+         use_absolute_embeddings=False,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+
+         self.image_size = image_size
+         self.patch_size = patch_size
+         self.num_channels = num_channels
+         self.embed_dim = embed_dim
+         self.depths = depths
+         self.num_layers = len(depths)
+         self.num_heads = num_heads
+         self.window_size = window_size
+         self.mlp_ratio = mlp_ratio
+         self.qkv_bias = qkv_bias
+         self.hidden_dropout_prob = hidden_dropout_prob
+         self.attention_probs_dropout_prob = attention_probs_dropout_prob
+         self.drop_path_rate = drop_path_rate
+         self.hidden_act = hidden_act
+         self.use_absolute_embeddings = use_absolute_embeddings
+         self.layer_norm_eps = layer_norm_eps
+         self.initializer_range = initializer_range
+         # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+         # this indicates the channel dimension after the last stage of the model
+         self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
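Note: `hidden_size` is derived rather than passed in: the channel width doubles at each of the `len(depths) - 1` downsampling stages. A quick check with the defaults (import path taken from the file list; the docstring's `from transformers import ...` example reflects the upstream Donut docs, not this vendored location):

```python
# Assumes magic-pdf 1.3.1 is installed; the vendored import path is from the file list above.
from magic_pdf.model.sub_modules.mfr.unimernet.unimernet_hf.unimer_swin import UnimerSwinConfig

config = UnimerSwinConfig()  # embed_dim=96, depths=[2, 2, 6, 2]

# 96 * 2 ** 3 == 768, the channel dimension after the last stage,
# which VisionEncoderDecoderModel reads as the encoder's hidden_size.
assert config.num_layers == 4
assert config.hidden_size == 96 * 2 ** (4 - 1) == 768
```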
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py
@@ -0,0 +1,132 @@
+ from transformers.image_processing_utils import BaseImageProcessor
+ import numpy as np
+ import cv2
+ import albumentations as alb
+ from albumentations.pytorch import ToTensorV2
+
+
+ # TODO: dereference cv2 if possible
+ class UnimerSwinImageProcessor(BaseImageProcessor):
+     def __init__(
+         self,
+         image_size=(192, 672),
+     ):
+         self.input_size = [int(_) for _ in image_size]
+         assert len(self.input_size) == 2
+
+         self.transform = alb.Compose(
+             [
+                 alb.ToGray(),
+                 alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
+                 # alb.Sharpen()
+                 ToTensorV2(),
+             ]
+         )
+
+     def __call__(self, item):
+         image = self.prepare_input(item)
+         return self.transform(image=image)['image'][:1]
+
+     @staticmethod
+     def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
+         """Crop margins of image using NumPy operations"""
+         # Convert to grayscale if it's a color image
+         if len(img.shape) == 3 and img.shape[2] == 3:
+             gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+         else:
+             gray = img.copy()
+
+         # Normalize and threshold
+         if gray.max() == gray.min():
+             return img
+
+         normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8)
+         binary = 255 * (normalized < 200).astype(np.uint8)
+
+         # Find bounding box
+         coords = cv2.findNonZero(binary)  # Find all non-zero points (text)
+         x, y, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+
+         # Return cropped image
+         return img[y:y + h, x:x + w]
+
+     def prepare_input(self, img, random_padding: bool = False):
+         """
+         Convert PIL Image or numpy array to properly sized and padded image after:
+         - crop margins
+         - resize while maintaining aspect ratio
+         - pad to target size
+         """
+         if img is None:
+             return None
+
+         # try:
+         #     img = self.crop_margin_numpy(img)
+         # except Exception:
+         #     # might throw an error for broken files
+         #     return None
+
+         if img.shape[0] == 0 or img.shape[1] == 0:
+             return None
+
+         # Get current dimensions
+         h, w = img.shape[:2]
+         target_h, target_w = self.input_size
+
+         # Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail)
+         scale = min(target_h / h, target_w / w)
+
+         # Calculate new dimensions
+         new_h, new_w = int(h * scale), int(w * scale)
+
+         # Resize the image while preserving aspect ratio
+         resized_img = cv2.resize(img, (new_w, new_h))
+
+         # Calculate padding values using the existing method
+         delta_width = target_w - new_w
+         delta_height = target_h - new_h
+
+         pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+         # Apply padding (convert PIL padding format to OpenCV format)
+         padding_color = [0, 0, 0] if len(img.shape) == 3 else [0]
+
+         padded_img = cv2.copyMakeBorder(
+             resized_img,
+             pad_height,                 # top
+             delta_height - pad_height,  # bottom
+             pad_width,                  # left
+             delta_width - pad_width,    # right
+             cv2.BORDER_CONSTANT,
+             value=padding_color
+         )
+
+         return padded_img
+
+     def _calculate_padding(self, new_w, new_h, random_padding):
+         """Calculate padding values for PIL images"""
+         delta_width = self.input_size[1] - new_w
+         delta_height = self.input_size[0] - new_h
+
+         pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+         return (
+             pad_width,
+             pad_height,
+             delta_width - pad_width,
+             delta_height - pad_height,
+         )
+
+     def _get_padding_values(self, new_w, new_h, random_padding):
+         """Get padding values based on image dimensions and padding strategy"""
+         delta_width = self.input_size[1] - new_w
+         delta_height = self.input_size[0] - new_h
+
+         if random_padding:
+             pad_width = np.random.randint(low=0, high=delta_width + 1)
+             pad_height = np.random.randint(low=0, high=delta_height + 1)
+         else:
+             pad_width = delta_width // 2
+             pad_height = delta_height // 2
+
+         return pad_width, pad_height
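Note: to make the resize-then-center-pad arithmetic in `prepare_input` concrete, a small worked example with the default 192x672 target; the 100x400 input shape is chosen arbitrarily for illustration:

```python
target_h, target_w = 192, 672  # self.input_size default
h, w = 100, 400                # arbitrary example crop

scale = min(target_h / h, target_w / w)        # min(1.92, 1.68) -> 1.68
new_h, new_w = int(h * scale), int(w * scale)  # (168, 672)

# Deterministic path (random_padding=False): split the slack evenly,
# with any odd pixel going to the bottom/right border.
delta_h, delta_w = target_h - new_h, target_w - new_w          # (24, 0)
pad_top, pad_left = delta_h // 2, delta_w // 2                 # (12, 0)
pad_bottom, pad_right = delta_h - pad_top, delta_w - pad_left  # (12, 0)

# The padded image always lands exactly on the target size
assert (new_h + pad_top + pad_bottom, new_w + pad_left + pad_right) == (192, 672)
```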