PyPI - magic-pdf - Versions diffs - 0.5.4__py3-none-any.whl - Mend

magic-pdf 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (121) hide show

magic_pdf/__init__.py +0 -0
magic_pdf/cli/__init__.py +0 -0
magic_pdf/cli/magicpdf.py +294 -0
magic_pdf/dict2md/__init__.py +0 -0
magic_pdf/dict2md/mkcontent.py +397 -0
magic_pdf/dict2md/ocr_mkcontent.py +356 -0
magic_pdf/filter/__init__.py +0 -0
magic_pdf/filter/pdf_classify_by_type.py +381 -0
magic_pdf/filter/pdf_meta_scan.py +368 -0
magic_pdf/layout/__init__.py +0 -0
magic_pdf/layout/bbox_sort.py +681 -0
magic_pdf/layout/layout_det_utils.py +182 -0
magic_pdf/layout/layout_sort.py +732 -0
magic_pdf/layout/layout_spiler_recog.py +101 -0
magic_pdf/layout/mcol_sort.py +336 -0
magic_pdf/libs/Constants.py +11 -0
magic_pdf/libs/MakeContentConfig.py +10 -0
magic_pdf/libs/ModelBlockTypeEnum.py +9 -0
magic_pdf/libs/__init__.py +0 -0
magic_pdf/libs/boxbase.py +408 -0
magic_pdf/libs/calc_span_stats.py +239 -0
magic_pdf/libs/commons.py +204 -0
magic_pdf/libs/config_reader.py +63 -0
magic_pdf/libs/convert_utils.py +5 -0
magic_pdf/libs/coordinate_transform.py +9 -0
magic_pdf/libs/detect_language_from_model.py +21 -0
magic_pdf/libs/draw_bbox.py +227 -0
magic_pdf/libs/drop_reason.py +27 -0
magic_pdf/libs/drop_tag.py +19 -0
magic_pdf/libs/hash_utils.py +15 -0
magic_pdf/libs/json_compressor.py +27 -0
magic_pdf/libs/language.py +31 -0
magic_pdf/libs/markdown_utils.py +31 -0
magic_pdf/libs/math.py +9 -0
magic_pdf/libs/nlp_utils.py +203 -0
magic_pdf/libs/ocr_content_type.py +21 -0
magic_pdf/libs/path_utils.py +23 -0
magic_pdf/libs/pdf_image_tools.py +33 -0
magic_pdf/libs/safe_filename.py +11 -0
magic_pdf/libs/textbase.py +33 -0
magic_pdf/libs/version.py +1 -0
magic_pdf/libs/vis_utils.py +308 -0
magic_pdf/model/__init__.py +0 -0
magic_pdf/model/doc_analyze_by_360layout.py +8 -0
magic_pdf/model/doc_analyze_by_pp_structurev2.py +125 -0
magic_pdf/model/magic_model.py +632 -0
magic_pdf/para/__init__.py +0 -0
magic_pdf/para/block_continuation_processor.py +562 -0
magic_pdf/para/block_termination_processor.py +480 -0
magic_pdf/para/commons.py +222 -0
magic_pdf/para/denoise.py +246 -0
magic_pdf/para/draw.py +121 -0
magic_pdf/para/exceptions.py +198 -0
magic_pdf/para/layout_match_processor.py +40 -0
magic_pdf/para/para_pipeline.py +297 -0
magic_pdf/para/para_split.py +644 -0
magic_pdf/para/para_split_v2.py +772 -0
magic_pdf/para/raw_processor.py +207 -0
magic_pdf/para/stats.py +268 -0
magic_pdf/para/title_processor.py +1014 -0
magic_pdf/pdf_parse_by_ocr.py +219 -0
magic_pdf/pdf_parse_by_ocr_v2.py +17 -0
magic_pdf/pdf_parse_by_txt.py +410 -0
magic_pdf/pdf_parse_by_txt_v2.py +56 -0
magic_pdf/pdf_parse_for_train.py +685 -0
magic_pdf/pdf_parse_union_core.py +241 -0
magic_pdf/pipe/AbsPipe.py +112 -0
magic_pdf/pipe/OCRPipe.py +28 -0
magic_pdf/pipe/TXTPipe.py +29 -0
magic_pdf/pipe/UNIPipe.py +83 -0
magic_pdf/pipe/__init__.py +0 -0
magic_pdf/post_proc/__init__.py +0 -0
magic_pdf/post_proc/detect_para.py +3472 -0
magic_pdf/post_proc/pdf_post_filter.py +67 -0
magic_pdf/post_proc/remove_footnote.py +153 -0
magic_pdf/pre_proc/__init__.py +0 -0
magic_pdf/pre_proc/citationmarker_remove.py +157 -0
magic_pdf/pre_proc/construct_page_dict.py +72 -0
magic_pdf/pre_proc/cut_image.py +71 -0
magic_pdf/pre_proc/detect_equation.py +134 -0
magic_pdf/pre_proc/detect_footer_by_model.py +64 -0
magic_pdf/pre_proc/detect_footer_header_by_statistics.py +284 -0
magic_pdf/pre_proc/detect_footnote.py +170 -0
magic_pdf/pre_proc/detect_header.py +64 -0
magic_pdf/pre_proc/detect_images.py +647 -0
magic_pdf/pre_proc/detect_page_number.py +64 -0
magic_pdf/pre_proc/detect_tables.py +62 -0
magic_pdf/pre_proc/equations_replace.py +559 -0
magic_pdf/pre_proc/fix_image.py +244 -0
magic_pdf/pre_proc/fix_table.py +270 -0
magic_pdf/pre_proc/main_text_font.py +23 -0
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +115 -0
magic_pdf/pre_proc/ocr_detect_layout.py +133 -0
magic_pdf/pre_proc/ocr_dict_merge.py +336 -0
magic_pdf/pre_proc/ocr_span_list_modify.py +258 -0
magic_pdf/pre_proc/pdf_pre_filter.py +74 -0
magic_pdf/pre_proc/post_layout_split.py +0 -0
magic_pdf/pre_proc/remove_bbox_overlap.py +98 -0
magic_pdf/pre_proc/remove_colored_strip_bbox.py +79 -0
magic_pdf/pre_proc/remove_footer_header.py +117 -0
magic_pdf/pre_proc/remove_rotate_bbox.py +188 -0
magic_pdf/pre_proc/resolve_bbox_conflict.py +191 -0
magic_pdf/pre_proc/solve_line_alien.py +29 -0
magic_pdf/pre_proc/statistics.py +12 -0
magic_pdf/rw/AbsReaderWriter.py +34 -0
magic_pdf/rw/DiskReaderWriter.py +66 -0
magic_pdf/rw/S3ReaderWriter.py +107 -0
magic_pdf/rw/__init__.py +0 -0
magic_pdf/spark/__init__.py +0 -0
magic_pdf/spark/spark_api.py +51 -0
magic_pdf/train_utils/__init__.py +0 -0
magic_pdf/train_utils/convert_to_train_format.py +65 -0
magic_pdf/train_utils/extract_caption.py +59 -0
magic_pdf/train_utils/remove_footer_header.py +159 -0
magic_pdf/train_utils/vis_utils.py +327 -0
magic_pdf/user_api.py +136 -0
magic_pdf-0.5.4.dist-info/LICENSE.md +661 -0
magic_pdf-0.5.4.dist-info/METADATA +24 -0
magic_pdf-0.5.4.dist-info/RECORD +121 -0
magic_pdf-0.5.4.dist-info/WHEEL +5 -0
magic_pdf-0.5.4.dist-info/top_level.txt +1 -0

magic_pdf/layout/layout_spiler_recog.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""
+Find the horizontal rules and filled color blocks that can split a page layout.
+"""
+import os
+from magic_pdf.libs.commons import fitz
+from magic_pdf.libs.boxbase import _is_in_or_part_overlap
+def __rect_filter_by_width(rect, page_w, page_h):
+    mid_x = page_w/2
+    if rect[0]< mid_x < rect[2]:
+        return True
+    return False
+def __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
+    """
+    不能出现在table和image的位置
+    """
+    for box in image_bboxes:
+        if _is_in_or_part_overlap(rect, box):
+            return False
+    for box in table_bboxes:
+        if _is_in_or_part_overlap(rect, box):
+            return False
+    return True
+def __debug_show_page(page, bboxes1: list,bboxes2: list,bboxes3: list,):
+    save_path = "./tmp/debug.pdf"
+    if os.path.exists(save_path):
+        # 删除已经存在的文件
+        os.remove(save_path)
+    # 创建一个新的空白 PDF 文件
+    doc = fitz.open('')
+    width = page.rect.width
+    height = page.rect.height
+    new_page = doc.new_page(width=width, height=height)
+    shape = new_page.new_shape()
+    for bbox in bboxes1:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=fitz.pdfcolor['blue'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+    for bbox in bboxes2:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=None, fill=fitz.pdfcolor['yellow'], fill_opacity=0.2)
+        shape.finish()
+        shape.commit()
+    for bbox in bboxes3:
+        # 原始box画上去
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor['red'], fill=None)
+        shape.finish()
+        shape.commit()
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+    doc.save(save_path)
+    doc.close()
+def get_spilter_of_page(page, image_bboxes, table_bboxes):
+    """
+    获取到色块和横线
+    """
+    cdrawings = page.get_cdrawings()
+    spilter_bbox = []
+    for block in cdrawings:
+        if 'fill' in block:
+            fill = block['fill']
+        if 'fill' in block and block['fill'] and block['fill']!=(1.0,1.0,1.0):
+            rect = block['rect']
+            if __rect_filter_by_width(rect, page.rect.width, page.rect.height) and __rect_filter_by_pos(rect, image_bboxes, table_bboxes):
+                spilter_bbox.append(list(rect))
+    """过滤、修正一下这些box。因为有时候会有一些矩形，高度为0或者为负数，造成layout计算无限循环。如果是负高度或者0高度，统一修正为高度为1"""
+    for box in spilter_bbox:
+        if box[3]-box[1] <= 0:
+            box[3] = box[1] + 1
+    #__debug_show_page(page, spilter_bbox, [], [])
+    return spilter_bbox

magic_pdf/layout/mcol_sort.py ADDED Viewed

@@ -0,0 +1,336 @@
+"""
+This is an advanced PyMuPDF utility for detecting multi-column pages.
+It can be used in a shell script, or its main function can be imported and
+invoked as described below.
+Features
+---------
+- Identify text belonging to (a variable number of) columns on the page.
+- Text with different background color is handled separately, allowing for
+  easier treatment of side remarks, comment boxes, etc.
+- Uses text block detection capability to identify text blocks and
+  uses the block bboxes as primary structuring principle.
+- Supports ignoring footers via a footer margin parameter.
+- Returns re-created text boundary boxes (integer coordinates), sorted ascending
+  by the top, then by the left coordinates.
+Restrictions
+-------------
+- Only supporting horizontal, left-to-right text
+- Returns a list of text boundary boxes - not the text itself. The caller is
+  expected to extract text from within the returned boxes.
+- Text written above images is ignored altogether (option).
+- This utility works as expected in most cases. The following situation cannot
+  be handled correctly:
+    * overlapping (non-disjoint) text blocks
+    * image captions are not recognized and are handled like normal text
+Usage
+------
+- As a CLI shell command use
+  python multi_column.py input.pdf footer_margin
+  Where footer margin is the height of the bottom stripe to ignore on each page.
+  This code is intended to be modified according to your need.
+- Use in a Python script as follows:
+  ----------------------------------------------------------------------------------
+  from multi_column import column_boxes
+  # for each page execute
+  bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
+  # bboxes is a list of fitz.IRect objects, that are sorted ascending by their y0,
+  # then x0 coordinates. Their text content can be extracted by all PyMuPDF
+  # get_text() variants, like for instance the following:
+  for rect in bboxes:
+      print(page.get_text(clip=rect, sort=True))
+  ----------------------------------------------------------------------------------
+"""
+import sys
+from magic_pdf.libs.commons import fitz
+def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
+    """Determine bboxes which wrap a column."""
+    paths = page.get_drawings()
+    bboxes = []
+    # path rectangles
+    path_rects = []
+    # image bboxes
+    img_bboxes = []
+    # bboxes of non-horizontal text
+    # avoid when expanding horizontal text boxes
+    vert_bboxes = []
+    # compute relevant page area
+    clip = +page.rect
+    clip.y1 -= footer_margin  # Remove footer area
+    clip.y0 += header_margin  # Remove header area
+    def can_extend(temp, bb, bboxlist):
+        """Determines whether rectangle 'temp' can be extended by 'bb'
+        without intersecting any of the rectangles contained in 'bboxlist'.
+        Items of bboxlist may be None if they have been removed.
+        Returns:
+            True if 'temp' has no intersections with items of 'bboxlist'.
+        """
+        for b in bboxlist:
+            if not intersects_bboxes(temp, vert_bboxes) and (
+                b == None or b == bb or (temp & b).is_empty
+            ):
+                continue
+            return False
+        return True
+    def in_bbox(bb, bboxes):
+        """Return 1-based number if a bbox contains bb, else return 0."""
+        for i, bbox in enumerate(bboxes):
+            if bb in bbox:
+                return i + 1
+        return 0
+    def intersects_bboxes(bb, bboxes):
+        """Return True if a bbox intersects bb, else return False."""
+        for bbox in bboxes:
+            if not (bb & bbox).is_empty:
+                return True
+        return False
+    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
+        """Extend a bbox to the right page border.
+        Whenever there is no text to the right of a bbox, enlarge it up
+        to the right page border.
+        Args:
+            bboxes: (list[IRect]) bboxes to check
+            width: (int) page width
+            path_bboxes: (list[IRect]) bboxes with a background color
+            vert_bboxes: (list[IRect]) bboxes with vertical text
+            img_bboxes: (list[IRect]) bboxes of images
+        Returns:
+            Potentially modified bboxes.
+        """
+        for i, bb in enumerate(bboxes):
+            # do not extend text with background color
+            if in_bbox(bb, path_bboxes):
+                continue
+            # do not extend text in images
+            if in_bbox(bb, img_bboxes):
+                continue
+            # temp extends bb to the right page border
+            temp = +bb
+            temp.x1 = width
+            # do not cut through colored background or images
+            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
+                continue
+            # also, do not intersect other text bboxes
+            check = can_extend(temp, bb, bboxes)
+            if check:
+                bboxes[i] = temp  # replace with enlarged bbox
+        return [b for b in bboxes if b != None]
+    def clean_nblocks(nblocks):
+        """Do some elementary cleaning."""
+        # 1. remove any duplicate blocks.
+        blen = len(nblocks)
+        if blen < 2:
+            return nblocks
+        start = blen - 1
+        for i in range(start, -1, -1):
+            bb1 = nblocks[i]
+            bb0 = nblocks[i - 1]
+            if bb0 == bb1:
+                del nblocks[i]
+        # 2. repair sequence in special cases:
+        # consecutive bboxes with almost same bottom value are sorted ascending
+        # by x-coordinate.
+        y1 = nblocks[0].y1  # first bottom coordinate
+        i0 = 0  # its index
+        i1 = -1  # index of last bbox with same bottom
+        # Iterate over bboxes, identifying segments with approx. same bottom value.
+        # Replace every segment by its sorted version.
+        for i in range(1, len(nblocks)):
+            b1 = nblocks[i]
+            if abs(b1.y1 - y1) > 10:  # different bottom
+                if i1 > i0:  # segment length > 1? Sort it!
+                    nblocks[i0 : i1 + 1] = sorted(
+                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
+                    )
+                y1 = b1.y1  # store new bottom value
+                i0 = i  # store its start index
+            i1 = i  # store current index
+        if i1 > i0:  # segment waiting to be sorted
+            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
+        return nblocks
+    # extract vector graphics
+    for p in paths:
+        path_rects.append(p["rect"].irect)
+    path_bboxes = path_rects
+    # sort path bboxes by ascending top, then left coordinates
+    path_bboxes.sort(key=lambda b: (b.y0, b.x0))
+    # bboxes of images on page, no need to sort them
+    for item in page.get_images():
+        img_bboxes.extend(page.get_image_rects(item[0]))
+    # blocks of text on page
+    blocks = page.get_text(
+        "dict",
+        flags=fitz.TEXTFLAGS_TEXT,
+        clip=clip,
+    )["blocks"]
+    # Make block rectangles, ignoring non-horizontal text
+    for b in blocks:
+        bbox = fitz.IRect(b["bbox"])  # bbox of the block
+        # ignore text written upon images
+        if no_image_text and in_bbox(bbox, img_bboxes):
+            continue
+        # confirm first line to be horizontal
+        line0 = b["lines"][0]  # get first line
+        if line0["dir"] != (1, 0):  # only accept horizontal text
+            vert_bboxes.append(bbox)
+            continue
+        srect = fitz.EMPTY_IRECT()
+        for line in b["lines"]:
+            lbbox = fitz.IRect(line["bbox"])
+            text = "".join([s["text"].strip() for s in line["spans"]])
+            if len(text) > 1:
+                srect |= lbbox
+        bbox = +srect
+        if not bbox.is_empty:
+            bboxes.append(bbox)
+    # Sort text bboxes by ascending background, top, then left coordinates
+    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))
+    # Extend bboxes to the right where possible
+    bboxes = extend_right(
+        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
+    )
+    # immediately return of no text found
+    if bboxes == []:
+        return []
+    # --------------------------------------------------------------------
+    # Join bboxes to establish some column structure
+    # --------------------------------------------------------------------
+    # the final block bboxes on page
+    nblocks = [bboxes[0]]  # pre-fill with first bbox
+    bboxes = bboxes[1:]  # remaining old bboxes
+    for i, bb in enumerate(bboxes):  # iterate old bboxes
+        check = False  # indicates unwanted joins
+        # check if bb can extend one of the new blocks
+        for j in range(len(nblocks)):
+            nbb = nblocks[j]  # a new block
+            # never join across columns
+            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
+                continue
+            # never join across different background colors
+            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
+                continue
+            temp = bb | nbb  # temporary extension of new block
+            check = can_extend(temp, nbb, nblocks)
+            if check == True:
+                break
+        if not check:  # bb cannot be used to extend any of the new bboxes
+            nblocks.append(bb)  # so add it to the list
+            j = len(nblocks) - 1  # index of it
+            temp = nblocks[j]  # new bbox added
+        # check if some remaining bbox is contained in temp
+        check = can_extend(temp, bb, bboxes)
+        if check == False:
+            nblocks.append(bb)
+        else:
+            nblocks[j] = temp
+        bboxes[i] = None
+    # do some elementary cleaning
+    nblocks = clean_nblocks(nblocks)
+    # return identified text bboxes
+    return nblocks
+if __name__ == "__main__":
+    """Only for debugging purposes, currently.
+    Draw red borders around the returned text bboxes and insert
+    the bbox number.
+    Then save the file under the name "input-blocks.pdf".
+    """
+    # get the file name
+    filename = sys.argv[1]
+    # check if footer margin is given
+    if len(sys.argv) > 2:
+        footer_margin = int(sys.argv[2])
+    else:  # use default vaue
+        footer_margin = 50
+    # check if header margin is given
+    if len(sys.argv) > 3:
+        header_margin = int(sys.argv[3])
+    else:  # use default vaue
+        header_margin = 50
+    # open document
+    doc = fitz.open(filename)
+    # iterate over the pages
+    for page in doc:
+        # remove any geometry issues
+        page.wrap_contents()
+        # get the text bboxes
+        bboxes = column_boxes(page, footer_margin=footer_margin, header_margin=header_margin)
+        # prepare a canvas to draw rectangles and text
+        shape = page.new_shape()
+        # iterate over the bboxes
+        for i, rect in enumerate(bboxes):
+            shape.draw_rect(rect)  # draw a border
+            # write sequence number
+            shape.insert_text(rect.tl + (5, 15), str(i), color=fitz.pdfcolor["red"])
+        # finish drawing / text with color red
+        shape.finish(color=fitz.pdfcolor["red"])
+        shape.commit()  # store to the page
+    # save document with text bboxes
+    doc.ez_save(filename.replace(".pdf", "-blocks.pdf"))

magic_pdf/libs/Constants.py ADDED Viewed

@@ -0,0 +1,11 @@
+"""
+Custom fields at the span level
+"""
+# whether the span was merged across pages
+CROSS_PAGE = "cross_page"
+"""
+block维度自定义字段
+"""
+# (the string literal above reads: "custom fields at the block level")
+# whether the lines of the block have been deleted
+LINES_DELETED = "lines_deleted"

magic_pdf/libs/MakeContentConfig.py ADDED Viewed

@@ -0,0 +1,10 @@
+class MakeMode:
+    """String constants naming the available make modes."""
+    # NOTE(review): presumably these select the kind of output generated from
+    # a parsed PDF (two markdown flavours and a "standard format") — confirm
+    # against the callers.
+    MM_MD = "mm_markdown"
+    NLP_MD = "nlp_markdown"
+    STANDARD_FORMAT = "standard_format"
+class DropMode:
+    """String constants naming the available drop modes."""
+    # NOTE(review): presumably the scope at which content is dropped (whole
+    # PDF, single page, or nothing) — confirm against the callers.
+    WHOLE_PDF = "whole_pdf"
+    SINGLE_PAGE = "single_page"
+    NONE = "none"

magic_pdf/libs/ModelBlockTypeEnum.py ADDED Viewed

@@ -0,0 +1,9 @@
+from enum import Enum
+class ModelBlockTypeEnum(Enum):
+    """Enumeration of block types with their integer ids.
+
+    The ids are not contiguous (0, 1, 2, 8, 13, 14).
+    """
+    # NOTE(review): presumably these ids mirror the category ids emitted by
+    # the layout model — TODO confirm against the model output.
+    TITLE = 0
+    PLAIN_TEXT = 1
+    ABANDON = 2
+    ISOLATE_FORMULA = 8
+    EMBEDDING = 13
+    ISOLATED = 14

magic_pdf/libs/__init__.py ADDED Viewed

File without changes