PyPI - magic-pdf - Versions diffs - 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl - Mend

magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

magic_pdf/data/utils.py CHANGED Viewed

@@ -10,22 +10,22 @@ from loguru import logger
-def fitz_doc_to_image(doc, dpi=200) -> dict:
+def fitz_doc_to_image(page, dpi=200) -> dict:
     """Convert fitz.Document to image, Then convert the image to numpy array.
     Args:
-        doc (_type_): pymudoc page
+        page (_type_): pymudoc page
         dpi (int, optional): reset the dpi of dpi. Defaults to 200.
     Returns:
         dict:  {'img': numpy array, 'width': width, 'height': height }
     """
     mat = fitz.Matrix(dpi / 72, dpi / 72)
-    pm = doc.get_pixmap(matrix=mat, alpha=False)
+    pm = page.get_pixmap(matrix=mat, alpha=False)
     # If the width or height exceeds 4500 after scaling, do not scale further.
     if pm.width > 4500 or pm.height > 4500:
-        pm = doc.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
+        pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
     # Convert pixmap samples directly to numpy array
     img = np.frombuffer(pm.samples, dtype=np.uint8).reshape(pm.height, pm.width, 3)

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -70,19 +70,34 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
-                for block in para_block['blocks']:  # 1st.拼image_body
-                    if block['type'] == BlockType.ImageBody:
-                        for line in block['lines']:
-                            for span in line['spans']:
-                                if span['type'] == ContentType.Image:
-                                    if span.get('image_path', ''):
-                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
-                for block in para_block['blocks']:  # 2nd.拼image_caption
-                    if block['type'] == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(block) + '  \n'
-                for block in para_block['blocks']:  # 3rd.拼image_footnote
-                    if block['type'] == BlockType.ImageFootnote:
-                        para_text += merge_para_with_text(block) + '  \n'
+                # 检测是否存在图片脚注
+                has_image_footnote = any(block['type'] == BlockType.ImageFootnote for block in para_block['blocks'])
+                # 如果存在图片脚注，则将图片脚注拼接到图片正文后面
+                if has_image_footnote:
+                    for block in para_block['blocks']:  # 1st.拼image_caption
+                        if block['type'] == BlockType.ImageCaption:
+                            para_text += merge_para_with_text(block) + '  \n'
+                    for block in para_block['blocks']:  # 2nd.拼image_body
+                        if block['type'] == BlockType.ImageBody:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.Image:
+                                        if span.get('image_path', ''):
+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
+                    for block in para_block['blocks']:  # 3rd.拼image_footnote
+                        if block['type'] == BlockType.ImageFootnote:
+                            para_text += '  \n' + merge_para_with_text(block)
+                else:
+                    for block in para_block['blocks']:  # 1st.拼image_body
+                        if block['type'] == BlockType.ImageBody:
+                            for line in block['lines']:
+                                for span in line['spans']:
+                                    if span['type'] == ContentType.Image:
+                                        if span.get('image_path', ''):
+                                            para_text += f"![]({img_buket_path}/{span['image_path']})"
+                    for block in para_block['blocks']:  # 2nd.拼image_caption
+                        if block['type'] == BlockType.ImageCaption:
+                            para_text += '  \n' + merge_para_with_text(block)
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
@@ -96,20 +111,19 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                             for span in line['spans']:
                                 if span['type'] == ContentType.Table:
                                     # if processed by table model
-                                    if span.get('latex', ''):
-                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
-                                    elif span.get('html', ''):
-                                        para_text += f"\n\n{span['html']}\n\n"
+                                    if span.get('html', ''):
+                                        para_text += f"\n{span['html']}\n"
                                     elif span.get('image_path', ''):
-                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                                        para_text += f"![]({img_buket_path}/{span['image_path']})"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block) + '  \n'
+                        para_text += '\n' + merge_para_with_text(block) + '  '
         if para_text.strip() == '':
             continue
         else:
-            page_markdown.append(para_text.strip() + '  ')
+            # page_markdown.append(para_text.strip() + '  ')
+            page_markdown.append(para_text.strip())
     return page_markdown
@@ -257,9 +271,9 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason
                         if span['type'] == ContentType.Table:
                             if span.get('latex', ''):
-                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
+                                para_content['table_body'] = f"{span['latex']}"
                             elif span.get('html', ''):
-                                para_content['table_body'] = f"\n\n{span['html']}\n\n"
+                                para_content['table_body'] = f"{span['html']}"
                             if span.get('image_path', ''):
                                 para_content['img_path'] = join_path(img_buket_path, span['image_path'])

magic_pdf/libs/version.py CHANGED Viewed

@@ -1 +1 @@
-__version__ = "1.3.10"
+__version__ = "1.3.12"

magic_pdf/model/batch_analyze.py CHANGED Viewed

@@ -6,7 +6,7 @@ from tqdm import tqdm
 from magic_pdf.config.constants import MODEL_NAME
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.model.sub_modules.model_utils import (
-    clean_vram, crop_img, get_res_list_from_layout_res)
+    clean_vram, crop_img, get_res_list_from_layout_res, get_coords_and_area)
 from magic_pdf.model.sub_modules.ocr.paddleocr2pytorch.ocr_utils import (
     get_adjusted_mfdetrec_res, get_ocr_result_list)
@@ -148,6 +148,19 @@ class BatchAnalyze:
                 # Integration results
                 if ocr_res:
                     ocr_result_list = get_ocr_result_list(ocr_res, useful_list, ocr_res_list_dict['ocr_enable'], new_image, _lang)
+                    if res["category_id"] == 3:
+                        # ocr_result_list中所有bbox的面积之和
+                        ocr_res_area = sum(get_coords_and_area(ocr_res_item)[4] for ocr_res_item in ocr_result_list if 'poly' in ocr_res_item)
+                        # 求ocr_res_area和res的面积的比值
+                        res_area = get_coords_and_area(res)[4]
+                        if res_area > 0:
+                            ratio = ocr_res_area / res_area
+                            if ratio > 0.25:
+                                res["category_id"] = 1
+                            else:
+                                continue
                     ocr_res_list_dict['layout_res'].extend(ocr_result_list)
             # det_count += len(ocr_res_list_dict['ocr_res_list'])

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -156,7 +156,10 @@ def doc_analyze(
         batch_images = [images_with_extra_info]
     results = []
-    for batch_image in batch_images:
+    processed_images_count = 0
+    for index, batch_image in enumerate(batch_images):
+        processed_images_count += len(batch_image)
+        logger.info(f'Batch {index + 1}/{len(batch_images)}: {processed_images_count} pages/{len(images_with_extra_info)} pages')
         result = may_batch_image_analyze(batch_image, ocr, show_log,layout_model, formula_enable, table_enable)
         results.extend(result)
@@ -186,7 +189,7 @@ def batch_doc_analyze(
     formula_enable=None,
     table_enable=None,
 ):
-    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 200))
+    MIN_BATCH_INFERENCE_SIZE = int(os.environ.get('MINERU_MIN_BATCH_INFERENCE_SIZE', 100))
     batch_size = MIN_BATCH_INFERENCE_SIZE
     page_wh_list = []

magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py CHANGED Viewed

@@ -66,9 +66,9 @@ LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
 def fix_latex_left_right(s):
     """
-    修复LaTeX中的\left和\right命令
+    修复LaTeX中的\\left和\\right命令
     1. 确保它们后面跟有效分隔符
-    2. 平衡\left和\right的数量
+    2. 平衡\\left和\\right的数量
     """
     # 白名单分隔符
     valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
@@ -106,7 +106,7 @@ def fix_latex_left_right(s):
 def fix_left_right_pairs(latex_formula):
     """
-    检测并修复LaTeX公式中\left和\right不在同一组的情况
+    检测并修复LaTeX公式中\\left和\\right不在同一组的情况
     Args:
         latex_formula (str): 输入的LaTeX公式
@@ -308,9 +308,9 @@ ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') fo
 def fix_latex_environments(s):
     """
-    检测LaTeX中环境（如array）的\begin和\end是否匹配
-    1. 如果缺少\begin标签则在开头添加
-    2. 如果缺少\end标签则在末尾添加
+    检测LaTeX中环境（如array）的\\begin和\\end是否匹配
+    1. 如果缺少\\begin标签则在开头添加
+    2. 如果缺少\\end标签则在末尾添加
     """
     for env in ENV_TYPES:
         begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
@@ -334,7 +334,7 @@ def fix_latex_environments(s):
 UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
 COMMANDS_TO_REMOVE_PATTERN = re.compile(
-    r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph)')
+    r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
 REPLACEMENTS_PATTERNS = {
     re.compile(r'\\underbar'): r'\\underline',
     re.compile(r'\\Bar'): r'\\hat',
@@ -346,6 +346,9 @@ REPLACEMENTS_PATTERNS = {
     re.compile(r'\\textunderscore'): r'\\_',
     re.compile(r'\\fint'): r'⨏',
     re.compile(r'\\up '): r'\\ ',
+    re.compile(r'\\vline = '): r'\\models ',
+    re.compile(r'\\vDash '): r'\\models ',
+    re.compile(r'\\sq \\sqcup '): r'\\square ',
 }
 QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')

magic_pdf/model/sub_modules/model_utils.py CHANGED Viewed

@@ -31,10 +31,10 @@ def crop_img(input_res, input_np_img, crop_paste_x=0, crop_paste_y=0):
     return return_image, return_list
-def get_coords_and_area(table):
+def get_coords_and_area(block_with_poly):
     """Extract coordinates and area from a table."""
-    xmin, ymin = int(table['poly'][0]), int(table['poly'][1])
-    xmax, ymax = int(table['poly'][4]), int(table['poly'][5])
+    xmin, ymin = int(block_with_poly['poly'][0]), int(block_with_poly['poly'][1])
+    xmax, ymax = int(block_with_poly['poly'][4]), int(block_with_poly['poly'][5])
     area = (xmax - xmin) * (ymax - ymin)
     return xmin, ymin, xmax, ymax, area
@@ -243,7 +243,7 @@ def get_res_list_from_layout_res(layout_res, iou_threshold=0.7, overlap_threshol
                 "bbox": [int(res['poly'][0]), int(res['poly'][1]),
                          int(res['poly'][4]), int(res['poly'][5])],
             })
-        elif category_id in [0, 2, 4, 6, 7]:  # OCR regions
+        elif category_id in [0, 2, 4, 6, 7, 3]:  # OCR regions
             ocr_res_list.append(res)
         elif category_id == 5:  # Table regions
             table_res_list.append(res)

magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/pytorchocr/modeling/backbones/__init__.py CHANGED Viewed

@@ -35,7 +35,7 @@ def build_backbone(config, model_type):
         from .rec_mobilenet_v3 import MobileNetV3
         from .rec_svtrnet import SVTRNet
         from .rec_mv1_enhance import MobileNetV1Enhance
+        from .rec_pphgnetv2 import PPHGNetV2_B4
         support_dict = [
             "MobileNetV1Enhance",
             "MobileNetV3",
@@ -48,6 +48,7 @@ def build_backbone(config, model_type):
             "DenseNet",
             "PPLCNetV3",
             "PPHGNet_small",
+            "PPHGNetV2_B4",
         ]
     else:
         raise NotImplementedError

magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl

magic-pdf 1.3.10__py3-none-any.whl → 1.3.12__py3-none-any.whl