magic_pdf-1.2.2-py3-none-any.whl → magic_pdf-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- magic_pdf/data/batch_build_dataset.py +156 -0
- magic_pdf/data/dataset.py +44 -24
- magic_pdf/data/utils.py +108 -9
- magic_pdf/dict2md/ocr_mkcontent.py +4 -3
- magic_pdf/libs/pdf_image_tools.py +11 -6
- magic_pdf/libs/performance_stats.py +12 -1
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/batch_analyze.py +175 -201
- magic_pdf/model/doc_analyze_by_custom_model.py +137 -92
- magic_pdf/model/pdf_extract_kit.py +5 -38
- magic_pdf/model/sub_modules/language_detection/utils.py +2 -4
- magic_pdf/model/sub_modules/language_detection/yolov11/YOLOv11.py +24 -19
- magic_pdf/model/sub_modules/layout/doclayout_yolo/DocLayoutYOLO.py +3 -1
- magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py +3 -1
- magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py +31 -102
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py +13 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py +189 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py +8 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py +163 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py +2351 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py +9 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py +132 -0
- magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/modeling_unimer_swin.py +1084 -0
- magic_pdf/model/sub_modules/model_init.py +50 -37
- magic_pdf/model/sub_modules/model_utils.py +17 -11
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/{paddleocr → paddleocr2pytorch}/ocr_utils.py +102 -97
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorch_paddle.py +193 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/base_ocr_v20.py +39 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/__init__.py +8 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/__init__.py +48 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/data/imaug/operators.py +418 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/__init__.py +25 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/architectures/base_model.py +105 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py +62 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/det_mobilenet_v3.py +269 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_hgnet.py +290 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_lcnetv3.py +516 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mobilenet_v3.py +136 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_mv1_enhance.py +234 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/rec_svtrnet.py +638 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/common.py +76 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/__init__.py +43 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/cls_head.py +23 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/det_db_head.py +109 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_ctc_head.py +54 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/heads/rec_multi_head.py +58 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/__init__.py +29 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/db_fpn.py +456 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/intracl.py +117 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/necks/rnn.py +228 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/__init__.py +33 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/cls_postprocess.py +20 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/db_postprocess.py +179 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/postprocess/rec_postprocess.py +690 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/__init__.py +0 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/arch_config.yaml +383 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/arabic_dict.txt +162 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/chinese_cht_dict.txt +8421 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/cyrillic_dict.txt +163 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/devanagari_dict.txt +167 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/en_dict.txt +95 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/japan_dict.txt +4399 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ka_dict.txt +153 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/korean_dict.txt +3688 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/latin_dict.txt +185 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ppocr_keys_v1.txt +6623 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/ta_dict.txt +128 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/dict/te_dict.txt +151 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/utils/resources/models_config.yml +49 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/__init__.py +1 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_cls.py +106 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_det.py +217 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_rec.py +440 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/predict_system.py +104 -0
- magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py +227 -0
- magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +10 -18
- magic_pdf/pdf_parse_union_core_v2.py +112 -74
- magic_pdf/pre_proc/ocr_dict_merge.py +9 -1
- magic_pdf/pre_proc/ocr_span_list_modify.py +51 -0
- magic_pdf/resources/model_config/model_configs.yaml +1 -1
- magic_pdf/tools/cli.py +30 -12
- magic_pdf/tools/common.py +90 -12
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/METADATA +50 -40
- magic_pdf-1.3.0.dist-info/RECORD +202 -0
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +0 -204
- magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_291_mod.py +0 -213
- magic_pdf/model/sub_modules/table/structeqtable/struct_eqtable.py +0 -37
- magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +0 -71
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +0 -46
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +0 -351
- magic_pdf-1.2.2.dist-info/RECORD +0 -147
- /magic_pdf/model/sub_modules/{ocr/paddleocr/__init__.py → mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py} +0 -0
- /magic_pdf/model/sub_modules/{table/structeqtable → ocr/paddleocr2pytorch/pytorchocr}/__init__.py +0 -0
- /magic_pdf/model/sub_modules/{table/tablemaster → ocr/paddleocr2pytorch/pytorchocr/modeling}/__init__.py +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/WHEEL +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-1.2.2.dist-info → magic_pdf-1.3.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
ADDED
@@ -0,0 +1,9 @@
+from .configuration_unimer_swin import UnimerSwinConfig
+from .modeling_unimer_swin import UnimerSwinModel
+from .image_processing_unimer_swin import UnimerSwinImageProcessor
+
+__all__ = [
+    "UnimerSwinConfig",
+    "UnimerSwinModel",
+    "UnimerSwinImageProcessor",
+]
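The new `unimer_swin` package exposes a Hugging Face-style import surface. A minimal usage sketch, assuming the module path implied by the file list above (the `UnimerSwinModel(configuration)` constructor follows the example in the configuration file's own docstring below):

```python
# Usage sketch, not part of the diff; import path assumed from the file list.
from magic_pdf.model.sub_modules.mfr.unimernet.unimernet_hf.unimer_swin import (
    UnimerSwinConfig,
    UnimerSwinImageProcessor,
    UnimerSwinModel,
)

configuration = UnimerSwinConfig()       # default Donut-style Swin config
model = UnimerSwinModel(configuration)   # randomly initialized encoder
processor = UnimerSwinImageProcessor()   # default input_size of (192, 672)
```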
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/configuration_unimer_swin.py
ADDED
@@ -0,0 +1,132 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Donut Swin Transformer model configuration"""
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class UnimerSwinConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`UnimerSwinModel`]. It is used to instantiate a
+    Donut model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Donut
+    [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        depths (`list(int)`, *optional*, defaults to `[2, 2, 6, 2]`):
+            Depth of each layer in the Transformer encoder.
+        num_heads (`list(int)`, *optional*, defaults to `[3, 6, 12, 24]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        window_size (`int`, *optional*, defaults to 7):
+            Size of windows.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether or not to add absolute position embeddings to the patch embeddings.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import UnimerSwinConfig, UnimerSwinModel
+
+    >>> # Initializing a Donut naver-clova-ix/donut-base style configuration
+    >>> configuration = UnimerSwinConfig()
+
+    >>> # Randomly initializing a model from the naver-clova-ix/donut-base style configuration
+    >>> model = UnimerSwinModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "unimer-swin"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        image_size=224,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        use_absolute_embeddings=False,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.use_absolute_embeddings = use_absolute_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
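Worth noting in the class above: `num_layers` and `hidden_size` are derived rather than passed in, and `attribute_map` lets generic Transformers code read the Swin-specific attribute names. A small behavior sketch, based only on the `__init__` shown above (the import path is assumed from this diff's file list):

```python
# Behavior sketch, not part of the diff; import path assumed.
from magic_pdf.model.sub_modules.mfr.unimernet.unimernet_hf.unimer_swin import UnimerSwinConfig

config = UnimerSwinConfig(embed_dim=96, depths=[2, 2, 6, 2])

# num_layers is derived from the depths list.
assert config.num_layers == 4

# hidden_size is the channel dimension after the last stage:
# int(96 * 2 ** (4 - 1)) == 768, the value VisionEncoderDecoderModel reads.
assert config.hidden_size == 768

# attribute_map aliases the generic HF names onto the Swin-specific ones.
assert config.num_attention_heads == config.num_heads
```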
magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/image_processing_unimer_swin.py
ADDED
@@ -0,0 +1,132 @@
+from transformers.image_processing_utils import BaseImageProcessor
+import numpy as np
+import cv2
+import albumentations as alb
+from albumentations.pytorch import ToTensorV2
+
+
+# TODO: dereference cv2 if possible
+class UnimerSwinImageProcessor(BaseImageProcessor):
+    def __init__(
+        self,
+        image_size = (192, 672),
+    ):
+        self.input_size = [int(_) for _ in image_size]
+        assert len(self.input_size) == 2
+
+        self.transform = alb.Compose(
+            [
+                alb.ToGray(),
+                alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
+                # alb.Sharpen()
+                ToTensorV2(),
+            ]
+        )
+
+    def __call__(self, item):
+        image = self.prepare_input(item)
+        return self.transform(image=image)['image'][:1]
+
+    @staticmethod
+    def crop_margin_numpy(img: np.ndarray) -> np.ndarray:
+        """Crop margins of image using NumPy operations"""
+        # Convert to grayscale if it's a color image
+        if len(img.shape) == 3 and img.shape[2] == 3:
+            gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+        else:
+            gray = img.copy()
+
+        # Normalize and threshold
+        if gray.max() == gray.min():
+            return img
+
+        normalized = (((gray - gray.min()) / (gray.max() - gray.min())) * 255).astype(np.uint8)
+        binary = 255 * (normalized < 200).astype(np.uint8)
+
+        # Find bounding box
+        coords = cv2.findNonZero(binary)  # Find all non-zero points (text)
+        x, y, w, h = cv2.boundingRect(coords)  # Find minimum spanning bounding box
+
+        # Return cropped image
+        return img[y:y + h, x:x + w]
+
+    def prepare_input(self, img, random_padding: bool = False):
+        """
+        Convert PIL Image or numpy array to properly sized and padded image after:
+        - crop margins
+        - resize while maintaining aspect ratio
+        - pad to target size
+        """
+        if img is None:
+            return None
+
+        # try:
+        #     img = self.crop_margin_numpy(img)
+        # except Exception:
+        #     # might throw an error for broken files
+        #     return None
+
+        if img.shape[0] == 0 or img.shape[1] == 0:
+            return None
+
+        # Get current dimensions
+        h, w = img.shape[:2]
+        target_h, target_w = self.input_size
+
+        # Calculate scale to preserve aspect ratio (equivalent to resize + thumbnail)
+        scale = min(target_h / h, target_w / w)
+
+        # Calculate new dimensions
+        new_h, new_w = int(h * scale), int(w * scale)
+
+        # Resize the image while preserving aspect ratio
+        resized_img = cv2.resize(img, (new_w, new_h))
+
+        # Calculate padding values using the existing method
+        delta_width = target_w - new_w
+        delta_height = target_h - new_h
+
+        pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+        # Apply padding (convert PIL padding format to OpenCV format)
+        padding_color = [0, 0, 0] if len(img.shape) == 3 else [0]
+
+        padded_img = cv2.copyMakeBorder(
+            resized_img,
+            pad_height,  # top
+            delta_height - pad_height,  # bottom
+            pad_width,  # left
+            delta_width - pad_width,  # right
+            cv2.BORDER_CONSTANT,
+            value=padding_color
+        )
+
+        return padded_img
+
+    def _calculate_padding(self, new_w, new_h, random_padding):
+        """Calculate padding values for PIL images"""
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h
+
+        pad_width, pad_height = self._get_padding_values(new_w, new_h, random_padding)
+
+        return (
+            pad_width,
+            pad_height,
+            delta_width - pad_width,
+            delta_height - pad_height,
+        )
+
+    def _get_padding_values(self, new_w, new_h, random_padding):
+        """Get padding values based on image dimensions and padding strategy"""
+        delta_width = self.input_size[1] - new_w
+        delta_height = self.input_size[0] - new_h
+
+        if random_padding:
+            pad_width = np.random.randint(low=0, high=delta_width + 1)
+            pad_height = np.random.randint(low=0, high=delta_height + 1)
+        else:
+            pad_width = delta_width // 2
+            pad_height = delta_height // 2
+
+        return pad_width, pad_height
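The processor's contract, as the code above shows, is: resize while preserving aspect ratio, pad to `input_size`, normalize via the albumentations pipeline, and keep a single channel. A quick sketch of calling it on a synthetic crop (shapes follow from the defaults above; the import path is assumed from the file list):

```python
# Usage sketch, not part of the diff. With the default input_size of
# (192, 672), any H x W x 3 uint8 array comes back as a normalized
# single-channel tensor of shape (1, 192, 672).
import numpy as np
from magic_pdf.model.sub_modules.mfr.unimernet.unimernet_hf.unimer_swin import (
    UnimerSwinImageProcessor,
)

processor = UnimerSwinImageProcessor()

img = (np.random.rand(100, 400, 3) * 255).astype(np.uint8)  # mock formula crop
tensor = processor(img)  # resize -> pad -> ToGray/Normalize -> ToTensorV2, first channel kept

assert tuple(tensor.shape) == (1, 192, 672)
```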