PyPI - magic-pdf - Versions diffs - 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl - Mend

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (76)

magic_pdf/data/data_reader_writer/filebase.py +3 -0
magic_pdf/filter/pdf_meta_scan.py +3 -17
magic_pdf/libs/commons.py +0 -161
magic_pdf/libs/draw_bbox.py +2 -3
magic_pdf/libs/markdown_utils.py +0 -21
magic_pdf/libs/pdf_image_tools.py +2 -1
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -2
magic_pdf/model/magic_model.py +0 -30
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +3 -28
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +3 -3
magic_pdf/para/para_split_v3.py +7 -2
magic_pdf/pdf_parse_union_core_v2.py +97 -124
magic_pdf/pre_proc/construct_page_dict.py +0 -55
magic_pdf/pre_proc/cut_image.py +0 -37
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +5 -178
magic_pdf/pre_proc/ocr_dict_merge.py +1 -224
magic_pdf/pre_proc/ocr_span_list_modify.py +2 -252
magic_pdf/rw/S3ReaderWriter.py +1 -1
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/METADATA +3 -77
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/RECORD +25 -76
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/WHEEL +1 -1
magic_pdf/dict2md/mkcontent.py +0 -438
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +0 -681
magic_pdf/layout/layout_det_utils.py +0 -182
magic_pdf/layout/layout_sort.py +0 -921
magic_pdf/layout/layout_spiler_recog.py +0 -101
magic_pdf/layout/mcol_sort.py +0 -336
magic_pdf/libs/calc_span_stats.py +0 -239
magic_pdf/libs/detect_language_from_model.py +0 -21
magic_pdf/libs/nlp_utils.py +0 -203
magic_pdf/libs/textbase.py +0 -33
magic_pdf/libs/vis_utils.py +0 -308
magic_pdf/para/block_continuation_processor.py +0 -562
magic_pdf/para/block_termination_processor.py +0 -480
magic_pdf/para/commons.py +0 -222
magic_pdf/para/denoise.py +0 -246
magic_pdf/para/draw.py +0 -121
magic_pdf/para/exceptions.py +0 -198
magic_pdf/para/layout_match_processor.py +0 -40
magic_pdf/para/para_split.py +0 -807
magic_pdf/para/para_split_v2.py +0 -959
magic_pdf/para/raw_processor.py +0 -207
magic_pdf/para/stats.py +0 -268
magic_pdf/para/title_processor.py +0 -1014
magic_pdf/pdf_parse_union_core.py +0 -345
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +0 -3472
magic_pdf/post_proc/pdf_post_filter.py +0 -60
magic_pdf/post_proc/remove_footnote.py +0 -153
magic_pdf/pre_proc/citationmarker_remove.py +0 -161
magic_pdf/pre_proc/detect_equation.py +0 -134
magic_pdf/pre_proc/detect_footer_by_model.py +0 -64
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +0 -284
magic_pdf/pre_proc/detect_footnote.py +0 -170
magic_pdf/pre_proc/detect_header.py +0 -64
magic_pdf/pre_proc/detect_images.py +0 -647
magic_pdf/pre_proc/detect_page_number.py +0 -64
magic_pdf/pre_proc/detect_tables.py +0 -62
magic_pdf/pre_proc/equations_replace.py +0 -550
magic_pdf/pre_proc/fix_image.py +0 -244
magic_pdf/pre_proc/fix_table.py +0 -270
magic_pdf/pre_proc/main_text_font.py +0 -23
magic_pdf/pre_proc/ocr_detect_layout.py +0 -133
magic_pdf/pre_proc/pdf_pre_filter.py +0 -78
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +0 -101
magic_pdf/pre_proc/remove_footer_header.py +0 -114
magic_pdf/pre_proc/remove_rotate_bbox.py +0 -236
magic_pdf/pre_proc/resolve_bbox_conflict.py +0 -184
magic_pdf/pre_proc/solve_line_alien.py +0 -29
magic_pdf/pre_proc/statistics.py +0 -12
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.10.0.dist-info → magic_pdf-0.10.2.dist-info}/top_level.txt +0 -0

magic_pdf/para/raw_processor.py DELETED

@@ -1,207 +0,0 @@
-class RawBlockProcessor:
-    def __init__(self) -> None:
-        self.y_tolerance = 2
-        self.pdf_dic = {}
-    def __span_flags_decomposer(self, span_flags):
-        """
-        Make font flags human readable.
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        span_flags : int
-            span flags
-        Returns
-        -------
-        l : dict
-            decomposed flags
-        """
-        l = {
-            "is_superscript": False,
-            "is_italic": False,
-            "is_serifed": False,
-            "is_sans_serifed": False,
-            "is_monospaced": False,
-            "is_proportional": False,
-            "is_bold": False,
-        }
-        if span_flags & 2**0:
-            l["is_superscript"] = True  # 表示上标
-        if span_flags & 2**1:
-            l["is_italic"] = True  # 表示斜体
-        if span_flags & 2**2:
-            l["is_serifed"] = True  # 表示衬线字体
-        else:
-            l["is_sans_serifed"] = True  # 表示非衬线字体
-        if span_flags & 2**3:
-            l["is_monospaced"] = True  # 表示等宽字体
-        else:
-            l["is_proportional"] = True  # 表示比例字体
-        if span_flags & 2**4:
-            l["is_bold"] = True  # 表示粗体
-        return l
-    def __make_new_lines(self, raw_lines):
-        """
-        This function makes new lines.
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        raw_lines : list
-            raw lines
-        Returns
-        -------
-        new_lines : list
-            new lines
-        """
-        new_lines = []
-        new_line = None
-        for raw_line in raw_lines:
-            raw_line_bbox = raw_line["bbox"]
-            raw_line_spans = raw_line["spans"]
-            raw_line_text = "".join([span["text"] for span in raw_line_spans])
-            raw_line_dir = raw_line.get("dir", None)
-            decomposed_line_spans = []
-            for span in raw_line_spans:
-                raw_flags = span["flags"]
-                decomposed_flags = self.__span_flags_decomposer(raw_flags)
-                span["decomposed_flags"] = decomposed_flags
-                decomposed_line_spans.append(span)
-            if new_line is None:
-                new_line = {
-                    "bbox": raw_line_bbox,
-                    "text": raw_line_text,
-                    "dir": raw_line_dir if raw_line_dir else (0, 0),
-                    "spans": decomposed_line_spans,
-                }
-            else:
-                if (
-                    abs(raw_line_bbox[1] - new_line["bbox"][1]) <= self.y_tolerance
-                    and abs(raw_line_bbox[3] - new_line["bbox"][3]) <= self.y_tolerance
-                ):
-                    new_line["bbox"] = (
-                        min(new_line["bbox"][0], raw_line_bbox[0]),  # left
-                        new_line["bbox"][1],  # top
-                        max(new_line["bbox"][2], raw_line_bbox[2]),  # right
-                        raw_line_bbox[3],  # bottom
-                    )
-                    new_line["text"] += " " + raw_line_text
-                    new_line["spans"].extend(raw_line_spans)
-                    new_line["dir"] = (
-                        new_line["dir"][0] + raw_line_dir[0],
-                        new_line["dir"][1] + raw_line_dir[1],
-                    )
-                else:
-                    new_lines.append(new_line)
-                    new_line = {
-                        "bbox": raw_line_bbox,
-                        "text": raw_line_text,
-                        "dir": raw_line_dir if raw_line_dir else (0, 0),
-                        "spans": raw_line_spans,
-                    }
-        if new_line:
-            new_lines.append(new_line)
-        return new_lines
-    def __make_new_block(self, raw_block):
-        """
-        This function makes a new block.
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        raw_block : dict
-            a raw block
-        Returns
-        -------
-        new_block : dict
-        Schema of new_block:
-        {
-            "block_id": "block_1",
-            "bbox": [0, 0, 100, 100],
-            "text": "This is a block.",
-            "lines": [
-                {
-                    "bbox": [0, 0, 100, 100],
-                    "text": "This is a line.",
-                    "spans": [
-                        {
-                            "text": "This is a span.",
-                            "font": "Times New Roman",
-                            "size": 12,
-                            "color": "#000000",
-                        }
-                    ],
-                }
-            ],
-        }
-        """
-        new_block = {}
-        block_id = raw_block["number"]
-        block_bbox = raw_block["bbox"]
-        block_text = " ".join(span["text"] for line in raw_block["lines"] for span in line["spans"])
-        raw_lines = raw_block["lines"]
-        block_lines = self.__make_new_lines(raw_lines)
-        new_block["block_id"] = block_id
-        new_block["bbox"] = block_bbox
-        new_block["text"] = block_text
-        new_block["lines"] = block_lines
-        return new_block
-    def batch_process_blocks(self, pdf_dic):
-        """
-        This function processes the blocks in batch.
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json.
-        Returns
-        -------
-        result_dict : dict
-            result dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "preproc_blocks" in blocks.keys():
-                    input_blocks = blocks["preproc_blocks"]
-                    for raw_block in input_blocks:
-                        new_block = self.__make_new_block(raw_block)
-                        para_blocks.append(new_block)
-                blocks["para_blocks"] = para_blocks
-        return pdf_dic

magic_pdf/para/stats.py DELETED

@@ -1,268 +0,0 @@
-from collections import Counter
-import numpy as np
-from magic_pdf.para.commons import *
-if sys.version_info[0] >= 3:
-    sys.stdout.reconfigure(encoding="utf-8")  # type: ignore
-class BlockStatisticsCalculator:
-    def __init__(self) -> None:
-        pass
-    def __calc_stats_of_new_lines(self, new_lines):
-        """
-        This function calculates the paragraph metrics
-        Parameters
-        ----------
-        combined_lines : list
-            combined lines
-        Returns
-        -------
-        X0 : float
-            Median of x0 values, which represents the left average boundary of the block
-        X1 : float
-            Median of x1 values, which represents the right average boundary of the block
-        avg_char_width : float
-            Average of char widths, which represents the average char width of the block
-        avg_char_height : float
-            Average of line heights, which represents the average line height of the block
-        """
-        x0_values = []
-        x1_values = []
-        char_widths = []
-        char_heights = []
-        block_font_types = []
-        block_font_sizes = []
-        block_directions = []
-        if len(new_lines) > 0:
-            for i, line in enumerate(new_lines):
-                line_bbox = line["bbox"]
-                line_text = line["text"]
-                line_spans = line["spans"]
-                num_chars = len([ch for ch in line_text if not ch.isspace()])
-                x0_values.append(line_bbox[0])
-                x1_values.append(line_bbox[2])
-                if num_chars > 0:
-                    char_width = (line_bbox[2] - line_bbox[0]) / num_chars
-                    char_widths.append(char_width)
-                for span in line_spans:
-                    block_font_types.append(span["font"])
-                    block_font_sizes.append(span["size"])
-                if "dir" in line:
-                    block_directions.append(line["dir"])
-                # line_font_types = [span["font"] for span in line_spans]
-                char_heights = [span["size"] for span in line_spans]
-        X0 = np.median(x0_values) if x0_values else 0
-        X1 = np.median(x1_values) if x1_values else 0
-        avg_char_width = sum(char_widths) / len(char_widths) if char_widths else 0
-        avg_char_height = sum(char_heights) / len(char_heights) if char_heights else 0
-        # max_freq_font_type = max(set(block_font_types), key=block_font_types.count) if block_font_types else None
-        max_span_length = 0
-        max_span_font_type = None
-        for line in new_lines:
-            line_spans = line["spans"]
-            for span in line_spans:
-                span_length = span["bbox"][2] - span["bbox"][0]
-                if span_length > max_span_length:
-                    max_span_length = span_length
-                    max_span_font_type = span["font"]
-        max_freq_font_type = max_span_font_type
-        avg_font_size = sum(block_font_sizes) / len(block_font_sizes) if block_font_sizes else None
-        avg_dir_horizontal = sum([dir[0] for dir in block_directions]) / len(block_directions) if block_directions else 0
-        avg_dir_vertical = sum([dir[1] for dir in block_directions]) / len(block_directions) if block_directions else 0
-        median_font_size = float(np.median(block_font_sizes)) if block_font_sizes else None
-        return (
-            X0,
-            X1,
-            avg_char_width,
-            avg_char_height,
-            max_freq_font_type,
-            avg_font_size,
-            (avg_dir_horizontal, avg_dir_vertical),
-            median_font_size,
-        )
-    def __make_new_block(self, input_block):
-        new_block = {}
-        raw_lines = input_block["lines"]
-        stats = self.__calc_stats_of_new_lines(raw_lines)
-        block_id = input_block["block_id"]
-        block_bbox = input_block["bbox"]
-        block_text = input_block["text"]
-        block_lines = raw_lines
-        block_avg_left_boundary = stats[0]
-        block_avg_right_boundary = stats[1]
-        block_avg_char_width = stats[2]
-        block_avg_char_height = stats[3]
-        block_font_type = stats[4]
-        block_font_size = stats[5]
-        block_direction = stats[6]
-        block_median_font_size = stats[7]
-        new_block["block_id"] = block_id
-        new_block["bbox"] = block_bbox
-        new_block["text"] = block_text
-        new_block["dir"] = block_direction
-        new_block["X0"] = block_avg_left_boundary
-        new_block["X1"] = block_avg_right_boundary
-        new_block["avg_char_width"] = block_avg_char_width
-        new_block["avg_char_height"] = block_avg_char_height
-        new_block["block_font_type"] = block_font_type
-        new_block["block_font_size"] = block_font_size
-        new_block["lines"] = block_lines
-        new_block["median_font_size"] = block_median_font_size
-        return new_block
-    def batch_process_blocks(self, pdf_dic):
-        """
-        This function processes the blocks in batch.
-        Parameters
-        ----------
-        self : object
-            The instance of the class.
-        ----------
-        blocks : list
-            Input block is a list of raw blocks. Schema can refer to the value of key ""preproc_blocks", demo file is app/pdf_toolbox/tests/preproc_2_parasplit_example.json
-        Returns
-        -------
-        result_dict : dict
-            result dictionary
-        """
-        for page_id, blocks in pdf_dic.items():
-            if page_id.startswith("page_"):
-                para_blocks = []
-                if "para_blocks" in blocks.keys():
-                    input_blocks = blocks["para_blocks"]
-                    for input_block in input_blocks:
-                        new_block = self.__make_new_block(input_block)
-                        para_blocks.append(new_block)
-                blocks["para_blocks"] = para_blocks
-        return pdf_dic
-class DocStatisticsCalculator:
-    def __init__(self) -> None:
-        pass
-    def calc_stats_of_doc(self, pdf_dict):
-        """
-        This function computes the statistics of the document
-        Parameters
-        ----------
-        result_dict : dict
-            result dictionary
-        Returns
-        -------
-        statistics : dict
-            statistics of the document
-        """
-        total_text_length = 0
-        total_num_blocks = 0
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    para_blocks = blocks["para_blocks"]
-                    for para_block in para_blocks:
-                        total_text_length += len(para_block["text"])
-                        total_num_blocks += 1
-        avg_text_length = total_text_length / total_num_blocks if total_num_blocks else 0
-        font_list = []
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                if "para_blocks" in blocks.keys():
-                    input_blocks = blocks["para_blocks"]
-                    for input_block in input_blocks:
-                        block_text_length = len(input_block.get("text", ""))
-                        if block_text_length < avg_text_length * 0.5:
-                            continue
-                        block_font_type = safe_get(input_block, "block_font_type", "")
-                        block_font_size = safe_get(input_block, "block_font_size", 0)
-                        font_list.append((block_font_type, block_font_size))
-        font_counter = Counter(font_list)
-        most_common_font = font_counter.most_common(1)[0] if font_list else (("", 0), 0)
-        second_most_common_font = font_counter.most_common(2)[1] if len(font_counter) > 1 else (("", 0), 0)
-        statistics = {
-            "num_pages": 0,
-            "num_blocks": 0,
-            "num_paras": 0,
-            "num_titles": 0,
-            "num_header_blocks": 0,
-            "num_footer_blocks": 0,
-            "num_watermark_blocks": 0,
-            "num_vertical_margin_note_blocks": 0,
-            "most_common_font_type": most_common_font[0][0],
-            "most_common_font_size": most_common_font[0][1],
-            "number_of_most_common_font": most_common_font[1],
-            "second_most_common_font_type": second_most_common_font[0][0],
-            "second_most_common_font_size": second_most_common_font[0][1],
-            "number_of_second_most_common_font": second_most_common_font[1],
-            "avg_text_length": avg_text_length,
-        }
-        for page_id, blocks in pdf_dict.items():
-            if page_id.startswith("page_"):
-                blocks = pdf_dict[page_id]["para_blocks"]
-                statistics["num_pages"] += 1
-                for block_id, block_data in enumerate(blocks):
-                    statistics["num_blocks"] += 1
-                    if "paras" in block_data.keys():
-                        statistics["num_paras"] += len(block_data["paras"])
-                    for line in block_data["lines"]:
-                        if line.get("is_title", 0):
-                            statistics["num_titles"] += 1
-                    if block_data.get("is_header", 0):
-                        statistics["num_header_blocks"] += 1
-                    if block_data.get("is_footer", 0):
-                        statistics["num_footer_blocks"] += 1
-                    if block_data.get("is_watermark", 0):
-                        statistics["num_watermark_blocks"] += 1
-                    if block_data.get("is_vertical_margin_note", 0):
-                        statistics["num_vertical_margin_note_blocks"] += 1
-        pdf_dict["statistics"] = statistics
-        return pdf_dict

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

magic-pdf 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl