magic-pdf 0.5.13__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- magic_pdf/cli/magicpdf.py +18 -7
- magic_pdf/libs/config_reader.py +10 -0
- magic_pdf/libs/version.py +1 -1
- magic_pdf/model/__init__.py +1 -0
- magic_pdf/model/doc_analyze_by_custom_model.py +38 -15
- magic_pdf/model/model_list.py +1 -0
- magic_pdf/model/pdf_extract_kit.py +196 -0
- magic_pdf/model/pek_sub_modules/__init__.py +0 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/__init__.py +0 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/backbone.py +179 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/beit.py +671 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/deit.py +476 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/__init__.py +7 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/__init__.py +2 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/cord.py +171 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py +124 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py +136 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py +284 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/xfund.py +213 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/__init__.py +7 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py +24 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py +60 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py +1282 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py +32 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py +34 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +150 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/rcnn_vl.py +163 -0
- magic_pdf/model/pek_sub_modules/layoutlmv3/visualizer.py +1236 -0
- magic_pdf/model/pek_sub_modules/post_process.py +36 -0
- magic_pdf/model/pek_sub_modules/self_modify.py +260 -0
- magic_pdf/model/pp_structure_v2.py +7 -0
- magic_pdf/pipe/AbsPipe.py +8 -14
- magic_pdf/pipe/OCRPipe.py +12 -8
- magic_pdf/pipe/TXTPipe.py +12 -8
- magic_pdf/pipe/UNIPipe.py +9 -7
- magic_pdf/resources/model_config/UniMERNet/demo.yaml +46 -0
- magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +351 -0
- magic_pdf/resources/model_config/model_configs.yaml +9 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/METADATA +18 -8
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/RECORD +44 -18
- magic_pdf/model/360_layout_analysis.py +0 -8
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/LICENSE.md +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/WHEEL +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/entry_points.txt +0 -0
- {magic_pdf-0.5.13.dist-info → magic_pdf-0.6.0.dist-info}/top_level.txt +0 -0
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/data_collator.py
@@ -0,0 +1,124 @@
+import torch
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from transformers import BatchEncoding, PreTrainedTokenizerBase
+from transformers.data.data_collator import (
+    DataCollatorMixin,
+    _torch_collate_batch,
+)
+from transformers.file_utils import PaddingStrategy
+
+from typing import NewType
+InputDataClass = NewType("InputDataClass", Any)
+
+def pre_calc_rel_mat(segment_ids):
+    valid_span = torch.zeros((segment_ids.shape[0], segment_ids.shape[1], segment_ids.shape[1]),
+                             device=segment_ids.device, dtype=torch.bool)
+    for i in range(segment_ids.shape[0]):
+        for j in range(segment_ids.shape[1]):
+            valid_span[i, j, :] = segment_ids[i, :] == segment_ids[i, j]
+
+    return valid_span
+
+@dataclass
+class DataCollatorForKeyValueExtraction(DataCollatorMixin):
+    """
+    Data collator that will dynamically pad the inputs received, as well as the labels.
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
+            among:
+            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+              sequence if provided).
+            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+              maximum acceptable input length for the model if that argument is not provided.
+            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+              different lengths).
+        max_length (:obj:`int`, `optional`):
+            Maximum length of the returned list and optionally padding length (see above).
+        pad_to_multiple_of (:obj:`int`, `optional`):
+            If set will pad the sequence to a multiple of the provided value.
+            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
+            7.5 (Volta).
+        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+            The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    padding: Union[bool, str, PaddingStrategy] = True
+    max_length: Optional[int] = None
+    pad_to_multiple_of: Optional[int] = None
+    label_pad_token_id: int = -100
+
+    def __call__(self, features):
+        label_name = "label" if "label" in features[0].keys() else "labels"
+        labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
+
+        images = None
+        if "images" in features[0]:
+            images = torch.stack([torch.tensor(d.pop("images")) for d in features])
+            IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1
+
+        batch = self.tokenizer.pad(
+            features,
+            padding=self.padding,
+            max_length=self.max_length,
+            pad_to_multiple_of=self.pad_to_multiple_of,
+            # Conversion to tensors will fail if we have labels as they are not of the same length yet.
+            return_tensors="pt" if labels is None else None,
+        )
+
+        if images is not None:
+            batch["images"] = images
+            batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v
+                     for k, v in batch.items()}
+            visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long)
+            batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1)
+
+        if labels is None:
+            return batch
+
+        has_bbox_input = "bbox" in features[0]
+        has_position_input = "position_ids" in features[0]
+        padding_idx=self.tokenizer.pad_token_id
+        sequence_length = torch.tensor(batch["input_ids"]).shape[1]
+        padding_side = self.tokenizer.padding_side
+        if padding_side == "right":
+            batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
+            if has_bbox_input:
+                batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
+            if has_position_input:
+                batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id))
+                                         for position_id in batch["position_ids"]]
+
+        else:
+            batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
+            if has_bbox_input:
+                batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
+            if has_position_input:
+                batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id))
+                                         + position_id for position_id in batch["position_ids"]]
+
+        if 'segment_ids' in batch:
+            assert 'position_ids' in batch
+            for i in range(len(batch['segment_ids'])):
+                batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [
+                    batch['segment_ids'][i][-1] + 2] * IMAGE_LEN
+
+        batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}
+
+        if 'segment_ids' in batch:
+            valid_span = pre_calc_rel_mat(
+                segment_ids=batch['segment_ids']
+            )
+            batch['valid_span'] = valid_span
+            del batch['segment_ids']
+
+        if images is not None:
+            visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100
+            batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1)
+
+        return batch
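The collator added above is the LayoutLMv3-style key/value-extraction collator: it delegates token padding to the tokenizer, pads labels, bbox and position_ids itself, and, when patch images are present, appends a visual attention mask of length IMAGE_LEN = (side/16)^2 + 1. The following is a minimal usage sketch; the tokenizer checkpoint name and the toy feature dicts are illustrative assumptions, not part of this package.

from transformers import AutoTokenizer
from magic_pdf.model.pek_sub_modules.layoutlmv3.layoutlmft.data.data_collator import (
    DataCollatorForKeyValueExtraction,
)

# Assumed checkpoint; any tokenizer whose pad() carries the extra keys through behaves the same.
tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutlmv3-base")
collator = DataCollatorForKeyValueExtraction(tokenizer=tokenizer, padding="longest")

# Toy pre-tokenized features (placeholder ids, labels and boxes).
features = [
    {"input_ids": [0, 100, 2], "labels": [0, 1, 0],
     "bbox": [[0, 0, 0, 0], [10, 10, 40, 20], [0, 0, 0, 0]]},
    {"input_ids": [0, 200, 300, 2], "labels": [0, 2, 2, 0],
     "bbox": [[0, 0, 0, 0], [5, 5, 9, 9], [5, 5, 9, 9], [0, 0, 0, 0]]},
]
batch = collator(features)
print(batch["input_ids"].shape, batch["labels"].shape)  # both padded to the longest sequence; labels padded with -100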
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/funsd.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+'''
+Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py
+'''
+import json
+import os
+
+import datasets
+
+from .image_utils import load_image, normalize_bbox
+
+
+logger = datasets.logging.get_logger(__name__)
+
+
+_CITATION = """\
+@article{Jaume2019FUNSDAD,
+  title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents},
+  author={Guillaume Jaume and H. K. Ekenel and J. Thiran},
+  journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)},
+  year={2019},
+  volume={2},
+  pages={1-6}
+}
+"""
+
+_DESCRIPTION = """\
+https://guillaumejaume.github.io/FUNSD/
+"""
+
+
+class FunsdConfig(datasets.BuilderConfig):
+    """BuilderConfig for FUNSD"""
+
+    def __init__(self, **kwargs):
+        """BuilderConfig for FUNSD.
+
+        Args:
+            **kwargs: keyword arguments forwarded to super.
+        """
+        super(FunsdConfig, self).__init__(**kwargs)
+
+
+class Funsd(datasets.GeneratorBasedBuilder):
+    """Conll2003 dataset."""
+
+    BUILDER_CONFIGS = [
+        FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"),
+    ]
+
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "tokens": datasets.Sequence(datasets.Value("string")),
+                    "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))),
+                    "ner_tags": datasets.Sequence(
+                        datasets.features.ClassLabel(
+                            names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"]
+                        )
+                    ),
+                    "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"),
+                    "image_path": datasets.Value("string"),
+                }
+            ),
+            supervised_keys=None,
+            homepage="https://guillaumejaume.github.io/FUNSD/",
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip")
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"}
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"}
+            ),
+        ]
+
+    def get_line_bbox(self, bboxs):
+        x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)]
+        y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)]
+
+        x0, y0, x1, y1 = min(x), min(y), max(x), max(y)
+
+        assert x1 >= x0 and y1 >= y0
+        bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))]
+        return bbox
+
+    def _generate_examples(self, filepath):
+        logger.info("⏳ Generating examples from = %s", filepath)
+        ann_dir = os.path.join(filepath, "annotations")
+        img_dir = os.path.join(filepath, "images")
+        for guid, file in enumerate(sorted(os.listdir(ann_dir))):
+            tokens = []
+            bboxes = []
+            ner_tags = []
+
+            file_path = os.path.join(ann_dir, file)
+            with open(file_path, "r", encoding="utf8") as f:
+                data = json.load(f)
+            image_path = os.path.join(img_dir, file)
+            image_path = image_path.replace("json", "png")
+            image, size = load_image(image_path)
+            for item in data["form"]:
+                cur_line_bboxes = []
+                words, label = item["words"], item["label"]
+                words = [w for w in words if w["text"].strip() != ""]
+                if len(words) == 0:
+                    continue
+                if label == "other":
+                    for w in words:
+                        tokens.append(w["text"])
+                        ner_tags.append("O")
+                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
+                else:
+                    tokens.append(words[0]["text"])
+                    ner_tags.append("B-" + label.upper())
+                    cur_line_bboxes.append(normalize_bbox(words[0]["box"], size))
+                    for w in words[1:]:
+                        tokens.append(w["text"])
+                        ner_tags.append("I-" + label.upper())
+                        cur_line_bboxes.append(normalize_bbox(w["box"], size))
+                # by default: --segment_level_layout 1
+                # if do not want to use segment_level_layout, comment the following line
+                cur_line_bboxes = self.get_line_bbox(cur_line_bboxes)
+                # box = normalize_bbox(item["box"], size)
+                # cur_line_bboxes = [box for _ in range(len(words))]
+                bboxes.extend(cur_line_bboxes)
+            yield guid, {"id": str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
+                         "image": image, "image_path": image_path}
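funsd.py above vendors the Hugging Face FUNSD loading script so the LayoutLMv3 fine-tuning code can build the dataset locally. A minimal sketch of driving the builder directly follows; it assumes the `datasets` library (and detectron2, which image_utils imports) is installed and that the FUNSD zip is reachable at the URL hard-coded in _split_generators.

from magic_pdf.model.pek_sub_modules.layoutlmv3.layoutlmft.data.funsd import Funsd

builder = Funsd()               # uses the default "funsd" BuilderConfig
builder.download_and_prepare()  # downloads and extracts dataset.zip into the datasets cache
funsd = builder.as_dataset()    # DatasetDict with "train" and "test" splits

example = funsd["train"][0]
print(example["tokens"][:5], example["ner_tags"][:5], len(example["bboxes"]))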
magic_pdf/model/pek_sub_modules/layoutlmv3/layoutlmft/data/image_utils.py
@@ -0,0 +1,284 @@
+import torchvision.transforms.functional as F
+import warnings
+import math
+import random
+import numpy as np
+from PIL import Image
+import torch
+
+from detectron2.data.detection_utils import read_image
+from detectron2.data.transforms import ResizeTransform, TransformList
+
+def normalize_bbox(bbox, size):
+    return [
+        int(1000 * bbox[0] / size[0]),
+        int(1000 * bbox[1] / size[1]),
+        int(1000 * bbox[2] / size[0]),
+        int(1000 * bbox[3] / size[1]),
+    ]
+
+
+def load_image(image_path):
+    image = read_image(image_path, format="BGR")
+    h = image.shape[0]
+    w = image.shape[1]
+    img_trans = TransformList([ResizeTransform(h=h, w=w, new_h=224, new_w=224)])
+    image = torch.tensor(img_trans.apply_image(image).copy()).permute(2, 0, 1)  # copy to make it writeable
+    return image, (w, h)
+
+
+def crop(image, i, j, h, w, boxes=None):
+    cropped_image = F.crop(image, i, j, h, w)
+
+    if boxes is not None:
+        # Currently we cannot use this case since when some boxes is out of the cropped image,
+        # it may be better to drop out these boxes along with their text input (instead of min or clamp)
+        # which haven't been implemented here
+        max_size = torch.as_tensor([w, h], dtype=torch.float32)
+        cropped_boxes = torch.as_tensor(boxes) - torch.as_tensor([j, i, j, i])
+        cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
+        cropped_boxes = cropped_boxes.clamp(min=0)
+        boxes = cropped_boxes.reshape(-1, 4)
+
+    return cropped_image, boxes
+
+
+def resize(image, size, interpolation, boxes=None):
+    # It seems that we do not need to resize boxes here, since the boxes will be resized to 1000x1000 finally,
+    # which is compatible with a square image size of 224x224
+    rescaled_image = F.resize(image, size, interpolation)
+
+    if boxes is None:
+        return rescaled_image, None
+
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size))
+    ratio_width, ratio_height = ratios
+
+    # boxes = boxes.copy()
+    scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height])
+
+    return rescaled_image, scaled_boxes
+
+
+def clamp(num, min_value, max_value):
+    return max(min(num, max_value), min_value)
+
+
+def get_bb(bb, page_size):
+    bbs = [float(j) for j in bb]
+    xs, ys = [], []
+    for i, b in enumerate(bbs):
+        if i % 2 == 0:
+            xs.append(b)
+        else:
+            ys.append(b)
+    (width, height) = page_size
+    return_bb = [
+        clamp(min(xs), 0, width - 1),
+        clamp(min(ys), 0, height - 1),
+        clamp(max(xs), 0, width - 1),
+        clamp(max(ys), 0, height - 1),
+    ]
+    return_bb = [
+        int(1000 * return_bb[0] / width),
+        int(1000 * return_bb[1] / height),
+        int(1000 * return_bb[2] / width),
+        int(1000 * return_bb[3] / height),
+    ]
+    return return_bb
+
+
+class ToNumpy:
+
+    def __call__(self, pil_img):
+        np_img = np.array(pil_img, dtype=np.uint8)
+        if np_img.ndim < 3:
+            np_img = np.expand_dims(np_img, axis=-1)
+        np_img = np.rollaxis(np_img, 2)  # HWC to CHW
+        return np_img
+
+
+class ToTensor:
+
+    def __init__(self, dtype=torch.float32):
+        self.dtype = dtype
+
+    def __call__(self, pil_img):
+        np_img = np.array(pil_img, dtype=np.uint8)
+        if np_img.ndim < 3:
+            np_img = np.expand_dims(np_img, axis=-1)
+        np_img = np.rollaxis(np_img, 2)  # HWC to CHW
+        return torch.from_numpy(np_img).to(dtype=self.dtype)
+
+
+_pil_interpolation_to_str = {
+    F.InterpolationMode.NEAREST: 'F.InterpolationMode.NEAREST',
+    F.InterpolationMode.BILINEAR: 'F.InterpolationMode.BILINEAR',
+    F.InterpolationMode.BICUBIC: 'F.InterpolationMode.BICUBIC',
+    F.InterpolationMode.LANCZOS: 'F.InterpolationMode.LANCZOS',
+    F.InterpolationMode.HAMMING: 'F.InterpolationMode.HAMMING',
+    F.InterpolationMode.BOX: 'F.InterpolationMode.BOX',
+}
+
+
+def _pil_interp(method):
+    if method == 'bicubic':
+        return F.InterpolationMode.BICUBIC
+    elif method == 'lanczos':
+        return F.InterpolationMode.LANCZOS
+    elif method == 'hamming':
+        return F.InterpolationMode.HAMMING
+    else:
+        # default bilinear, do we want to allow nearest?
+        return F.InterpolationMode.BILINEAR
+
+
+class Compose:
+    """Composes several transforms together. This transform does not support torchscript.
+    Please, see the note below.
+
+    Args:
+        transforms (list of ``Transform`` objects): list of transforms to compose.
+
+    Example:
+        >>> transforms.Compose([
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.PILToTensor(),
+        >>>     transforms.ConvertImageDtype(torch.float),
+        >>> ])
+
+    .. note::
+        In order to script the transformations, please use ``torch.nn.Sequential`` as below.
+
+        >>> transforms = torch.nn.Sequential(
+        >>>     transforms.CenterCrop(10),
+        >>>     transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
+        >>> )
+        >>> scripted_transforms = torch.jit.script(transforms)
+
+        Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor``, does not require
+        `lambda` functions or ``PIL.Image``.
+
+    """
+
+    def __init__(self, transforms):
+        self.transforms = transforms
+
+    def __call__(self, img, augmentation=False, box=None):
+        for t in self.transforms:
+            img = t(img, augmentation, box)
+        return img
+
+
+class RandomResizedCropAndInterpolationWithTwoPic:
+    """Crop the given PIL Image to random size and aspect ratio with random interpolation.
+    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
+    aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop
+    is finally resized to given size.
+    This is popularly used to train the Inception networks.
+    Args:
+        size: expected output size of each edge
+        scale: range of size of the origin size cropped
+        ratio: range of aspect ratio of the origin aspect ratio cropped
+        interpolation: Default: PIL.Image.BILINEAR
+    """
+
+    def __init__(self, size, second_size=None, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.),
+                 interpolation='bilinear', second_interpolation='lanczos'):
+        if isinstance(size, tuple):
+            self.size = size
+        else:
+            self.size = (size, size)
+        if second_size is not None:
+            if isinstance(second_size, tuple):
+                self.second_size = second_size
+            else:
+                self.second_size = (second_size, second_size)
+        else:
+            self.second_size = None
+        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
+            warnings.warn("range should be of kind (min, max)")
+
+        self.interpolation = _pil_interp(interpolation)
+        self.second_interpolation = _pil_interp(second_interpolation)
+        self.scale = scale
+        self.ratio = ratio
+
+    @staticmethod
+    def get_params(img, scale, ratio):
+        """Get parameters for ``crop`` for a random sized crop.
+        Args:
+            img (PIL Image): Image to be cropped.
+            scale (tuple): range of size of the origin size cropped
+            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped
+        Returns:
+            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
+                sized crop.
+        """
+        area = img.size[0] * img.size[1]
+
+        for attempt in range(10):
+            target_area = random.uniform(*scale) * area
+            log_ratio = (math.log(ratio[0]), math.log(ratio[1]))
+            aspect_ratio = math.exp(random.uniform(*log_ratio))
+
+            w = int(round(math.sqrt(target_area * aspect_ratio)))
+            h = int(round(math.sqrt(target_area / aspect_ratio)))
+
+            if w <= img.size[0] and h <= img.size[1]:
+                i = random.randint(0, img.size[1] - h)
+                j = random.randint(0, img.size[0] - w)
+                return i, j, h, w
+
+        # Fallback to central crop
+        in_ratio = img.size[0] / img.size[1]
+        if in_ratio < min(ratio):
+            w = img.size[0]
+            h = int(round(w / min(ratio)))
+        elif in_ratio > max(ratio):
+            h = img.size[1]
+            w = int(round(h * max(ratio)))
+        else:  # whole image
+            w = img.size[0]
+            h = img.size[1]
+        i = (img.size[1] - h) // 2
+        j = (img.size[0] - w) // 2
+        return i, j, h, w
+
+    def __call__(self, img, augmentation=False, box=None):
+        """
+        Args:
+            img (PIL Image): Image to be cropped and resized.
+        Returns:
+            PIL Image: Randomly cropped and resized image.
+        """
+        if augmentation:
+            i, j, h, w = self.get_params(img, self.scale, self.ratio)
+            img = F.crop(img, i, j, h, w)
+            # img, box = crop(img, i, j, h, w, box)
+        img = F.resize(img, self.size, self.interpolation)
+        second_img = F.resize(img, self.second_size, self.second_interpolation) \
+            if self.second_size is not None else None
+        return img, second_img
+
+    def __repr__(self):
+        if isinstance(self.interpolation, (tuple, list)):
+            interpolate_str = ' '.join([_pil_interpolation_to_str[x] for x in self.interpolation])
+        else:
+            interpolate_str = _pil_interpolation_to_str[self.interpolation]
+        format_string = self.__class__.__name__ + '(size={0}'.format(self.size)
+        format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale))
+        format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio))
+        format_string += ', interpolation={0}'.format(interpolate_str)
+        if self.second_size is not None:
+            format_string += ', second_size={0}'.format(self.second_size)
+            format_string += ', second_interpolation={0}'.format(_pil_interpolation_to_str[self.second_interpolation])
+        format_string += ')'
+        return format_string
+
+
+def pil_loader(path: str) -> Image.Image:
+    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
+    with open(path, 'rb') as f:
+        img = Image.open(f)
+        return img.convert('RGB')
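image_utils.py above provides the bounding-box normalization and image transforms shared by the FUNSD loader and the LayoutLMv3 visual branch. A short sketch of the main entry points follows; "page.png" and the word-box coordinates are placeholders, and detectron2 plus torchvision must be installed for the module to import.

from PIL import Image
from magic_pdf.model.pek_sub_modules.layoutlmv3.layoutlmft.data.image_utils import (
    load_image, normalize_bbox, RandomResizedCropAndInterpolationWithTwoPic,
)

image, (w, h) = load_image("page.png")                   # CHW tensor resized to 224x224, plus original (w, h)
bbox_1000 = normalize_bbox([50, 40, 320, 80], (w, h))    # word box rescaled to the 0-1000 layout grid

transform = RandomResizedCropAndInterpolationWithTwoPic(size=224, second_size=112)
pil_page = Image.open("page.png").convert("RGB")
view, second_view = transform(pil_page, augmentation=False)  # 224x224 and 112x112 resized views of the page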