PyPI - magic-pdf - Versions diffs - 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl - Mend

magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62)

magic_pdf/config/constants.py +53 -0
magic_pdf/config/drop_reason.py +35 -0
magic_pdf/config/drop_tag.py +19 -0
magic_pdf/config/make_content_config.py +11 -0
magic_pdf/{libs/ModelBlockTypeEnum.py → config/model_block_type.py} +2 -1
magic_pdf/data/read_api.py +1 -1
magic_pdf/dict2md/mkcontent.py +226 -185
magic_pdf/dict2md/ocr_mkcontent.py +11 -11
magic_pdf/filter/pdf_meta_scan.py +101 -79
magic_pdf/integrations/rag/utils.py +4 -5
magic_pdf/libs/config_reader.py +5 -5
magic_pdf/libs/draw_bbox.py +3 -2
magic_pdf/libs/pdf_image_tools.py +36 -12
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +2 -0
magic_pdf/model/magic_model.py +13 -13
magic_pdf/model/pdf_extract_kit.py +122 -76
magic_pdf/model/sub_modules/model_init.py +40 -35
magic_pdf/model/sub_modules/ocr/paddleocr/ocr_utils.py +33 -7
magic_pdf/model/sub_modules/ocr/paddleocr/ppocr_273_mod.py +12 -4
magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py +2 -0
magic_pdf/model/sub_modules/table/tablemaster/tablemaster_paddle.py +30 -28
magic_pdf/para/para_split.py +411 -248
magic_pdf/para/para_split_v2.py +352 -182
magic_pdf/para/para_split_v3.py +110 -53
magic_pdf/pdf_parse_by_ocr.py +2 -0
magic_pdf/pdf_parse_by_txt.py +2 -0
magic_pdf/pdf_parse_union_core.py +174 -100
magic_pdf/pdf_parse_union_core_v2.py +202 -36
magic_pdf/pipe/AbsPipe.py +28 -44
magic_pdf/pipe/OCRPipe.py +5 -5
magic_pdf/pipe/TXTPipe.py +5 -6
magic_pdf/pipe/UNIPipe.py +24 -25
magic_pdf/post_proc/pdf_post_filter.py +7 -14
magic_pdf/pre_proc/cut_image.py +9 -11
magic_pdf/pre_proc/equations_replace.py +203 -212
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +235 -49
magic_pdf/pre_proc/ocr_dict_merge.py +5 -5
magic_pdf/pre_proc/ocr_span_list_modify.py +122 -63
magic_pdf/pre_proc/pdf_pre_filter.py +37 -33
magic_pdf/pre_proc/remove_bbox_overlap.py +20 -18
magic_pdf/pre_proc/remove_colored_strip_bbox.py +36 -14
magic_pdf/pre_proc/remove_footer_header.py +2 -5
magic_pdf/pre_proc/remove_rotate_bbox.py +111 -63
magic_pdf/pre_proc/resolve_bbox_conflict.py +10 -17
magic_pdf/spark/spark_api.py +15 -17
magic_pdf/tools/cli.py +3 -4
magic_pdf/tools/cli_dev.py +6 -9
magic_pdf/tools/common.py +26 -36
magic_pdf/user_api.py +29 -38
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/METADATA +11 -12
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/RECORD +57 -58
magic_pdf/libs/Constants.py +0 -55
magic_pdf/libs/MakeContentConfig.py +0 -11
magic_pdf/libs/drop_reason.py +0 -27
magic_pdf/libs/drop_tag.py +0 -19
magic_pdf/para/para_pipeline.py +0 -297
magic_pdf/{libs → config}/ocr_content_type.py +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.9.3.dist-info → magic_pdf-0.10.0.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/mkcontent.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import math
 from loguru import logger
-from magic_pdf.libs.boxbase import find_bottom_nearest_text_bbox, find_top_nearest_text_bbox
+from magic_pdf.config.ocr_content_type import ContentType
+from magic_pdf.libs.boxbase import (find_bottom_nearest_text_bbox,
+                                    find_top_nearest_text_bbox)
 from magic_pdf.libs.commons import join_path
-from magic_pdf.libs.ocr_content_type import ContentType
 TYPE_INLINE_EQUATION = ContentType.InlineEquation
 TYPE_INTERLINE_EQUATION = ContentType.InterlineEquation
@@ -12,33 +14,30 @@ UNI_FORMAT_TEXT_TYPE = ['text', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 @DeprecationWarning
 def mk_nlp_markdown_1(para_dict: dict):
-    """
-    对排序后的bboxes拼接内容
-    """
+    """对排序后的bboxes拼接内容."""
     content_lst = []
     for _, page_info in para_dict.items():
-        para_blocks = page_info.get("para_blocks")
+        para_blocks = page_info.get('para_blocks')
         if not para_blocks:
             continue
         for block in para_blocks:
-            item = block["paras"]
+            item = block['paras']
             for _, p in item.items():
-                para_text = p["para_text"]
-                is_title = p["is_para_title"]
+                para_text = p['para_text']
+                is_title = p['is_para_title']
                 title_level = p['para_title_level']
-                md_title_prefix = "#"*title_level
+                md_title_prefix = '#' * title_level
                 if is_title:
-                    content_lst.append(f"{md_title_prefix} {para_text}")
+                    content_lst.append(f'{md_title_prefix} {para_text}')
                 else:
                     content_lst.append(para_text)
-    content_text = "\n\n".join(content_lst)
+    content_text = '\n\n'.join(content_lst)
     return content_text
 # 找到目标字符串在段落中的索引
 def __find_index(paragraph, target):
     index = paragraph.find(target)
@@ -48,69 +47,76 @@ def __find_index(paragraph, target):
         return None
-def __insert_string(paragraph, target, postion):
-    new_paragraph = paragraph[:postion] + target + paragraph[postion:]
+def __insert_string(paragraph, target, position):
+    new_paragraph = paragraph[:position] + target + paragraph[position:]
     return new_paragraph
 def __insert_after(content, image_content, target):
-    """
-    在content中找到target，将image_content插入到target后面
-    """
+    """在content中找到target，将image_content插入到target后面."""
     index = content.find(target)
     if index != -1:
-        content = content[:index+len(target)] + "\n\n" + image_content + "\n\n" + content[index+len(target):]
+        content = (
+            content[: index + len(target)]
+            + '\n\n'
+            + image_content
+            + '\n\n'
+            + content[index + len(target) :]
+        )
     else:
-        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
+        logger.error(
+            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
+        )
     return content
 def __insert_before(content, image_content, target):
-    """
-    在content中找到target，将image_content插入到target前面
-    """
+    """在content中找到target，将image_content插入到target前面."""
     index = content.find(target)
     if index != -1:
-        content = content[:index] + "\n\n" + image_content + "\n\n" + content[index:]
+        content = content[:index] + '\n\n' + image_content + '\n\n' + content[index:]
     else:
-        logger.error(f"Can't find the location of image {image_content} in the markdown file, search target is {target}")
+        logger.error(
+            f"Can't find the location of image {image_content} in the markdown file, search target is {target}"
+        )
     return content
 @DeprecationWarning
 def mk_mm_markdown_1(para_dict: dict):
-    """拼装多模态markdown"""
+    """拼装多模态markdown."""
     content_lst = []
     for _, page_info in para_dict.items():
-        page_lst = [] # 一个page内的段落列表
-        para_blocks = page_info.get("para_blocks")
-        pymu_raw_blocks = page_info.get("preproc_blocks")
+        page_lst = []  # 一个page内的段落列表
+        para_blocks = page_info.get('para_blocks')
+        pymu_raw_blocks = page_info.get('preproc_blocks')
         all_page_images = []
-        all_page_images.extend(page_info.get("images",[]))
-        all_page_images.extend(page_info.get("image_backup", []) )
-        all_page_images.extend(page_info.get("tables",[]))
-        all_page_images.extend(page_info.get("table_backup",[]) )
-        if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
+        all_page_images.extend(page_info.get('images', []))
+        all_page_images.extend(page_info.get('image_backup', []))
+        all_page_images.extend(page_info.get('tables', []))
+        all_page_images.extend(page_info.get('table_backup', []))
+        if not para_blocks or not pymu_raw_blocks:  # 只有图片的拼接的场景
             for img in all_page_images:
-                page_lst.append(f"![]({img['image_path']})") # TODO 图片顺序
-            page_md = "\n\n".join(page_lst)
+                page_lst.append(f"![]({img['image_path']})")  # TODO 图片顺序
+            page_md = '\n\n'.join(page_lst)
         else:
             for block in para_blocks:
-                item = block["paras"]
+                item = block['paras']
                 for _, p in item.items():
-                    para_text = p["para_text"]
-                    is_title = p["is_para_title"]
+                    para_text = p['para_text']
+                    is_title = p['is_para_title']
                     title_level = p['para_title_level']
-                    md_title_prefix = "#"*title_level
+                    md_title_prefix = '#' * title_level
                     if is_title:
-                        page_lst.append(f"{md_title_prefix} {para_text}")
+                        page_lst.append(f'{md_title_prefix} {para_text}')
                     else:
                         page_lst.append(para_text)
             """拼装成一个页面的文本"""
-            page_md = "\n\n".join(page_lst)
+            page_md = '\n\n'.join(page_lst)
             """插入图片"""
             for img in all_page_images:
                 imgbox = img['bbox']
@@ -118,192 +124,215 @@ def mk_mm_markdown_1(para_dict: dict):
                 # 先看在哪个block内
                 for block in pymu_raw_blocks:
                     bbox = block['bbox']
-                    if bbox[0]-1 <= imgbox[0] < bbox[2]+1 and bbox[1]-1 <= imgbox[1] < bbox[3]+1:# 确定在block内
-                        for l in block['lines']:
+                    if (
+                        bbox[0] - 1 <= imgbox[0] < bbox[2] + 1
+                        and bbox[1] - 1 <= imgbox[1] < bbox[3] + 1
+                    ):  # 确定在block内
+                        for l in block['lines']:  # noqa: E741
                             line_box = l['bbox']
-                            if line_box[0]-1 <= imgbox[0] < line_box[2]+1 and line_box[1]-1 <= imgbox[1] < line_box[3]+1: # 在line内的，插入line前面
-                                line_txt = "".join([s['text'] for s in l['spans']])
-                                page_md = __insert_before(page_md, img_content, line_txt)
+                            if (
+                                line_box[0] - 1 <= imgbox[0] < line_box[2] + 1
+                                and line_box[1] - 1 <= imgbox[1] < line_box[3] + 1
+                            ):  # 在line内的，插入line前面
+                                line_txt = ''.join([s['text'] for s in l['spans']])
+                                page_md = __insert_before(
+                                    page_md, img_content, line_txt
+                                )
                                 break
                             break
-                        else:# 在行与行之间
+                        else:  # 在行与行之间
                             # 找到图片x0,y0与line的x0,y0最近的line
                             min_distance = 100000
                             min_line = None
-                            for l in block['lines']:
+                            for l in block['lines']:  # noqa: E741
                                 line_box = l['bbox']
-                                distance = math.sqrt((line_box[0] - imgbox[0])**2 + (line_box[1] - imgbox[1])**2)
+                                distance = math.sqrt(
+                                    (line_box[0] - imgbox[0]) ** 2
+                                    + (line_box[1] - imgbox[1]) ** 2
+                                )
                                 if distance < min_distance:
                                     min_distance = distance
                                     min_line = l
                             if min_line:
-                                line_txt = "".join([s['text'] for s in min_line['spans']])
+                                line_txt = ''.join(
+                                    [s['text'] for s in min_line['spans']]
+                                )
                                 img_h = imgbox[3] - imgbox[1]
-                                if min_distance<img_h: # 文字在图片前面
-                                    page_md = __insert_after(page_md, img_content, line_txt)
+                                if min_distance < img_h:  # 文字在图片前面
+                                    page_md = __insert_after(
+                                        page_md, img_content, line_txt
+                                    )
                                 else:
-                                    page_md = __insert_before(page_md, img_content, line_txt)
+                                    page_md = __insert_before(
+                                        page_md, img_content, line_txt
+                                    )
                             else:
-                                logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #1")
-                else:# 应当在两个block之间
+                                logger.error(
+                                    f"Can't find the location of image {img['image_path']} in the markdown file  #1"
+                                )
+                else:  # 应当在两个block之间
                     # 找到上方最近的block，如果上方没有就找大下方最近的block
                     top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, imgbox)
                     if top_txt_block:
-                        line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
+                        line_txt = ''.join(
+                            [s['text'] for s in top_txt_block['lines'][-1]['spans']]
+                        )
                         page_md = __insert_after(page_md, img_content, line_txt)
                     else:
-                        bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, imgbox)
+                        bottom_txt_block = find_bottom_nearest_text_bbox(
+                            pymu_raw_blocks, imgbox
+                        )
                         if bottom_txt_block:
-                            line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
+                            line_txt = ''.join(
+                                [
+                                    s['text']
+                                    for s in bottom_txt_block['lines'][0]['spans']
+                                ]
+                            )
                             page_md = __insert_before(page_md, img_content, line_txt)
                         else:
-                            logger.error(f"Can't find the location of image {img['image_path']} in the markdown file #2")
+                            logger.error(
+                                f"Can't find the location of image {img['image_path']} in the markdown file  #2"
+                            )
         content_lst.append(page_md)
     """拼装成全部页面的文本"""
-    content_text = "\n\n".join(content_lst)
+    content_text = '\n\n'.join(content_lst)
     return content_text
 def __insert_after_para(text, type, element, content_list):
-    """
-    在content_list中找到text，将image_path作为一个新的node插入到text后面
-    """
+    """在content_list中找到text，将image_path作为一个新的node插入到text后面."""
     for i, c in enumerate(content_list):
-        content_type = c.get("type")
-        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
-            if type == "image":
+        content_type = c.get('type')
+        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
+            if type == 'image':
                 content_node = {
-                    "type": "image",
-                    "img_path": element.get("image_path"),
-                    "img_alt": "",
-                    "img_title": "",
-                    "img_caption": "",
+                    'type': 'image',
+                    'img_path': element.get('image_path'),
+                    'img_alt': '',
+                    'img_title': '',
+                    'img_caption': '',
                 }
-            elif type == "table":
+            elif type == 'table':
                 content_node = {
-                    "type": "table",
-                    "img_path": element.get("image_path"),
-                    "table_latex": element.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": element.get("quality"),
+                    'type': 'table',
+                    'img_path': element.get('image_path'),
+                    'table_latex': element.get('text'),
+                    'table_title': '',
+                    'table_caption': '',
+                    'table_quality': element.get('quality'),
                 }
-            content_list.insert(i+1, content_node)
+            content_list.insert(i + 1, content_node)
             break
     else:
-        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
+        logger.error(
+            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
+        )
 def __insert_before_para(text, type, element, content_list):
-    """
-    在content_list中找到text，将image_path作为一个新的node插入到text前面
-    """
+    """在content_list中找到text，将image_path作为一个新的node插入到text前面."""
     for i, c in enumerate(content_list):
-        content_type = c.get("type")
-        if content_type in  UNI_FORMAT_TEXT_TYPE and text in c.get("text", ''):
-            if type == "image":
+        content_type = c.get('type')
+        if content_type in UNI_FORMAT_TEXT_TYPE and text in c.get('text', ''):
+            if type == 'image':
                 content_node = {
-                    "type": "image",
-                    "img_path": element.get("image_path"),
-                    "img_alt": "",
-                    "img_title": "",
-                    "img_caption": "",
+                    'type': 'image',
+                    'img_path': element.get('image_path'),
+                    'img_alt': '',
+                    'img_title': '',
+                    'img_caption': '',
                 }
-            elif type == "table":
+            elif type == 'table':
                 content_node = {
-                    "type": "table",
-                    "img_path": element.get("image_path"),
-                    "table_latex": element.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": element.get("quality"),
+                    'type': 'table',
+                    'img_path': element.get('image_path'),
+                    'table_latex': element.get('text'),
+                    'table_title': '',
+                    'table_caption': '',
+                    'table_quality': element.get('quality'),
                 }
             content_list.insert(i, content_node)
             break
     else:
-        logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}")
+        logger.error(
+            f"Can't find the location of image {element.get('image_path')} in the markdown file, search target is {text}"
+        )
 def mk_universal_format(pdf_info_list: list, img_buket_path):
-    """
-    构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY
-    """
+    """构造统一格式 https://aicarrier.feishu.cn/wiki/FqmMwcH69iIdCWkkyjvcDwNUnTY."""
     content_lst = []
     for page_info in pdf_info_list:
-        page_lst = [] # 一个page内的段落列表
-        para_blocks = page_info.get("para_blocks")
-        pymu_raw_blocks = page_info.get("preproc_blocks")
+        page_lst = []  # 一个page内的段落列表
+        para_blocks = page_info.get('para_blocks')
+        pymu_raw_blocks = page_info.get('preproc_blocks')
         all_page_images = []
-        all_page_images.extend(page_info.get("images",[]))
-        all_page_images.extend(page_info.get("image_backup", []) )
+        all_page_images.extend(page_info.get('images', []))
+        all_page_images.extend(page_info.get('image_backup', []))
         # all_page_images.extend(page_info.get("tables",[]))
         # all_page_images.extend(page_info.get("table_backup",[]) )
         all_page_tables = []
-        all_page_tables.extend(page_info.get("tables", []))
+        all_page_tables.extend(page_info.get('tables', []))
-        if not para_blocks or not pymu_raw_blocks: # 只有图片的拼接的场景
+        if not para_blocks or not pymu_raw_blocks:  # 只有图片的拼接的场景
             for img in all_page_images:
                 content_node = {
-                    "type": "image",
-                    "img_path": join_path(img_buket_path, img['image_path']),
-                    "img_alt":"",
-                    "img_title":"",
-                    "img_caption":""
+                    'type': 'image',
+                    'img_path': join_path(img_buket_path, img['image_path']),
+                    'img_alt': '',
+                    'img_title': '',
+                    'img_caption': '',
                 }
-                page_lst.append(content_node) # TODO 图片顺序
+                page_lst.append(content_node)  # TODO 图片顺序
             for table in all_page_tables:
                 content_node = {
-                    "type": "table",
-                    "img_path": join_path(img_buket_path, table['image_path']),
-                    "table_latex": table.get("text"),
-                    "table_title": "",
-                    "table_caption": "",
-                    "table_quality": table.get("quality"),
+                    'type': 'table',
+                    'img_path': join_path(img_buket_path, table['image_path']),
+                    'table_latex': table.get('text'),
+                    'table_title': '',
+                    'table_caption': '',
+                    'table_quality': table.get('quality'),
                 }
-                page_lst.append(content_node) # TODO 图片顺序
+                page_lst.append(content_node)  # TODO 图片顺序
         else:
             for block in para_blocks:
-                item = block["paras"]
+                item = block['paras']
                 for _, p in item.items():
-                    font_type = p['para_font_type']# 对于文本来说，要么是普通文本，要么是个行间公式
+                    font_type = p[
+                        'para_font_type'
+                    ]  # 对于文本来说，要么是普通文本，要么是个行间公式
                     if font_type == TYPE_INTERLINE_EQUATION:
-                        content_node = {
-                            "type": "equation",
-                            "latex": p["para_text"]
-                        }
+                        content_node = {'type': 'equation', 'latex': p['para_text']}
                         page_lst.append(content_node)
                     else:
-                        para_text = p["para_text"]
-                        is_title = p["is_para_title"]
+                        para_text = p['para_text']
+                        is_title = p['is_para_title']
                         title_level = p['para_title_level']
                         if is_title:
                             content_node = {
-                                "type": f"h{title_level}",
-                                "text": para_text
+                                'type': f'h{title_level}',
+                                'text': para_text,
                             }
                             page_lst.append(content_node)
                         else:
-                            content_node = {
-                                "type": "text",
-                                "text": para_text
-                            }
+                            content_node = {'type': 'text', 'text': para_text}
                             page_lst.append(content_node)
         content_lst.extend(page_lst)
         """插入图片"""
         for img in all_page_images:
-            insert_img_or_table("image", img, pymu_raw_blocks, content_lst)
+            insert_img_or_table('image', img, pymu_raw_blocks, content_lst)
         """插入表格"""
         for table in all_page_tables:
-            insert_img_or_table("table", table, pymu_raw_blocks, content_lst)
+            insert_img_or_table('table', table, pymu_raw_blocks, content_lst)
     # end for
     return content_lst
@@ -313,13 +342,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
     # 先看在哪个block内
     for block in pymu_raw_blocks:
         bbox = block['bbox']
-        if bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1 and bbox[1] - 1 <= element_bbox[1] < bbox[
-            3] + 1:  # 确定在这个大的block内，然后进入逐行比较距离
-            for l in block['lines']:
+        if (
+            bbox[0] - 1 <= element_bbox[0] < bbox[2] + 1
+            and bbox[1] - 1 <= element_bbox[1] < bbox[3] + 1
+        ):  # 确定在这个大的block内，然后进入逐行比较距离
+            for l in block['lines']:  # noqa: E741
                 line_box = l['bbox']
-                if line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1 and line_box[1] - 1 <= element_bbox[1] < line_box[
-                    3] + 1:  # 在line内的，插入line前面
-                    line_txt = "".join([s['text'] for s in l['spans']])
+                if (
+                    line_box[0] - 1 <= element_bbox[0] < line_box[2] + 1
+                    and line_box[1] - 1 <= element_bbox[1] < line_box[3] + 1
+                ):  # 在line内的，插入line前面
+                    line_txt = ''.join([s['text'] for s in l['spans']])
                     __insert_before_para(line_txt, type, element, content_lst)
                     break
                 break
@@ -327,14 +360,17 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
                 # 找到图片x0,y0与line的x0,y0最近的line
                 min_distance = 100000
                 min_line = None
-                for l in block['lines']:
+                for l in block['lines']:  # noqa: E741
                     line_box = l['bbox']
-                    distance = math.sqrt((line_box[0] - element_bbox[0]) ** 2 + (line_box[1] - element_bbox[1]) ** 2)
+                    distance = math.sqrt(
+                        (line_box[0] - element_bbox[0]) ** 2
+                        + (line_box[1] - element_bbox[1]) ** 2
+                    )
                     if distance < min_distance:
                         min_distance = distance
                         min_line = l
                 if min_line:
-                    line_txt = "".join([s['text'] for s in min_line['spans']])
+                    line_txt = ''.join([s['text'] for s in min_line['spans']])
                     img_h = element_bbox[3] - element_bbox[1]
                     if min_distance < img_h:  # 文字在图片前面
                         __insert_after_para(line_txt, type, element, content_lst)
@@ -342,56 +378,61 @@ def insert_img_or_table(type, element, pymu_raw_blocks, content_lst):
                         __insert_before_para(line_txt, type, element, content_lst)
                     break
                 else:
-                    logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #1")
+                    logger.error(
+                        f"Can't find the location of image {element.get('image_path')} in the markdown file  #1"
+                    )
     else:  # 应当在两个block之间
         # 找到上方最近的block，如果上方没有就找大下方最近的block
         top_txt_block = find_top_nearest_text_bbox(pymu_raw_blocks, element_bbox)
         if top_txt_block:
-            line_txt = "".join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
+            line_txt = ''.join([s['text'] for s in top_txt_block['lines'][-1]['spans']])
             __insert_after_para(line_txt, type, element, content_lst)
         else:
-            bottom_txt_block = find_bottom_nearest_text_bbox(pymu_raw_blocks, element_bbox)
+            bottom_txt_block = find_bottom_nearest_text_bbox(
+                pymu_raw_blocks, element_bbox
+            )
             if bottom_txt_block:
-                line_txt = "".join([s['text'] for s in bottom_txt_block['lines'][0]['spans']])
+                line_txt = ''.join(
+                    [s['text'] for s in bottom_txt_block['lines'][0]['spans']]
+                )
                 __insert_before_para(line_txt, type, element, content_lst)
             else:  # TODO ，图片可能独占一列，这种情况上下是没有图片的
-                logger.error(f"Can't find the location of image {element.get('image_path')} in the markdown file #2")
+                logger.error(
+                    f"Can't find the location of image {element.get('image_path')} in the markdown file  #2"
+                )
 def mk_mm_markdown(content_list):
-    """
-    基于同一格式的内容列表，构造markdown，含图片
-    """
+    """基于同一格式的内容列表，构造markdown，含图片."""
     content_md = []
     for c in content_list:
-        content_type = c.get("type")
-        if content_type == "text":
-            content_md.append(c.get("text"))
-        elif content_type == "equation":
-            content = c.get("latex")
-            if content.startswith("$$") and content.endswith("$$"):
+        content_type = c.get('type')
+        if content_type == 'text':
+            content_md.append(c.get('text'))
+        elif content_type == 'equation':
+            content = c.get('latex')
+            if content.startswith('$$') and content.endswith('$$'):
                 content_md.append(content)
             else:
                 content_md.append(f"\n$$\n{c.get('latex')}\n$$\n")
         elif content_type in UNI_FORMAT_TEXT_TYPE:
             content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-        elif content_type == "image":
+        elif content_type == 'image':
             content_md.append(f"![]({c.get('img_path')})")
-    return "\n\n".join(content_md)
+    return '\n\n'.join(content_md)
 def mk_nlp_markdown(content_list):
-    """
-    基于同一格式的内容列表，构造markdown，不含图片
-    """
+    """基于同一格式的内容列表，构造markdown，不含图片."""
     content_md = []
     for c in content_list:
-        content_type = c.get("type")
-        if content_type == "text":
-            content_md.append(c.get("text"))
-        elif content_type == "equation":
+        content_type = c.get('type')
+        if content_type == 'text':
+            content_md.append(c.get('text'))
+        elif content_type == 'equation':
             content_md.append(f"$$\n{c.get('latex')}\n$$")
-        elif content_type == "table":
+        elif content_type == 'table':
             content_md.append(f"$$$\n{c.get('table_latex')}\n$$$")
         elif content_type in UNI_FORMAT_TEXT_TYPE:
             content_md.append(f"{'#'*int(content_type[1])} {c.get('text')}")
-    return "\n\n".join(content_md)
+    return '\n\n'.join(content_md)

magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl

magic-pdf 0.9.3__py3-none-any.whl → 0.10.0__py3-none-any.whl