PyPI - magic-pdf - Versions diffs - 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl - Mend

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (102) hide show

magic_pdf/model/sub_modules/ocr/paddleocr2pytorch/tools/infer/pytorchocr_utility.py ADDED Viewed

@@ -0,0 +1,227 @@
+import os
+import math
+from pathlib import Path
+import numpy as np
+import cv2
+import argparse
+root_dir = Path(__file__).resolve().parent.parent.parent
+DEFAULT_CFG_PATH = root_dir / "pytorchocr" / "utils" / "resources" / "arch_config.yaml"
+def init_args():
+    def str2bool(v):
+        return v.lower() in ("true", "t", "1")
+    parser = argparse.ArgumentParser()
+    # params for prediction engine
+    parser.add_argument("--use_gpu", type=str2bool, default=False)
+    parser.add_argument("--det", type=str2bool, default=True)
+    parser.add_argument("--rec", type=str2bool, default=True)
+    parser.add_argument("--device", type=str, default='cpu')
+    # parser.add_argument("--ir_optim", type=str2bool, default=True)
+    # parser.add_argument("--use_tensorrt", type=str2bool, default=False)
+    # parser.add_argument("--use_fp16", type=str2bool, default=False)
+    parser.add_argument("--gpu_mem", type=int, default=500)
+    parser.add_argument("--warmup", type=str2bool, default=False)
+    # params for text detector
+    parser.add_argument("--image_dir", type=str)
+    parser.add_argument("--det_algorithm", type=str, default='DB')
+    parser.add_argument("--det_model_path", type=str)
+    parser.add_argument("--det_limit_side_len", type=float, default=960)
+    parser.add_argument("--det_limit_type", type=str, default='max')
+    # DB parmas
+    parser.add_argument("--det_db_thresh", type=float, default=0.3)
+    parser.add_argument("--det_db_box_thresh", type=float, default=0.6)
+    parser.add_argument("--det_db_unclip_ratio", type=float, default=1.5)
+    parser.add_argument("--max_batch_size", type=int, default=10)
+    parser.add_argument("--use_dilation", type=str2bool, default=False)
+    parser.add_argument("--det_db_score_mode", type=str, default="fast")
+    # EAST parmas
+    parser.add_argument("--det_east_score_thresh", type=float, default=0.8)
+    parser.add_argument("--det_east_cover_thresh", type=float, default=0.1)
+    parser.add_argument("--det_east_nms_thresh", type=float, default=0.2)
+    # SAST parmas
+    parser.add_argument("--det_sast_score_thresh", type=float, default=0.5)
+    parser.add_argument("--det_sast_nms_thresh", type=float, default=0.2)
+    parser.add_argument("--det_sast_polygon", type=str2bool, default=False)
+    # PSE parmas
+    parser.add_argument("--det_pse_thresh", type=float, default=0)
+    parser.add_argument("--det_pse_box_thresh", type=float, default=0.85)
+    parser.add_argument("--det_pse_min_area", type=float, default=16)
+    parser.add_argument("--det_pse_box_type", type=str, default='box')
+    parser.add_argument("--det_pse_scale", type=int, default=1)
+    # FCE parmas
+    parser.add_argument("--scales", type=list, default=[8, 16, 32])
+    parser.add_argument("--alpha", type=float, default=1.0)
+    parser.add_argument("--beta", type=float, default=1.0)
+    parser.add_argument("--fourier_degree", type=int, default=5)
+    parser.add_argument("--det_fce_box_type", type=str, default='poly')
+    # params for text recognizer
+    parser.add_argument("--rec_algorithm", type=str, default='CRNN')
+    parser.add_argument("--rec_model_path", type=str)
+    parser.add_argument("--rec_image_inverse", type=str2bool, default=True)
+    parser.add_argument("--rec_image_shape", type=str, default="3, 48, 320")
+    parser.add_argument("--rec_char_type", type=str, default='ch')
+    parser.add_argument("--rec_batch_num", type=int, default=6)
+    parser.add_argument("--max_text_length", type=int, default=25)
+    parser.add_argument("--use_space_char", type=str2bool, default=True)
+    parser.add_argument("--drop_score", type=float, default=0.5)
+    parser.add_argument("--limited_max_width", type=int, default=1280)
+    parser.add_argument("--limited_min_width", type=int, default=16)
+    parser.add_argument(
+        "--vis_font_path", type=str,
+        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))), 'doc/fonts/simfang.ttf'))
+    parser.add_argument(
+        "--rec_char_dict_path",
+        type=str,
+        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                             'pytorchocr/utils/ppocr_keys_v1.txt'))
+    # params for text classifier
+    parser.add_argument("--use_angle_cls", type=str2bool, default=False)
+    parser.add_argument("--cls_model_path", type=str)
+    parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
+    parser.add_argument("--label_list", type=list, default=['0', '180'])
+    parser.add_argument("--cls_batch_num", type=int, default=6)
+    parser.add_argument("--cls_thresh", type=float, default=0.9)
+    parser.add_argument("--enable_mkldnn", type=str2bool, default=False)
+    parser.add_argument("--use_pdserving", type=str2bool, default=False)
+    # params for e2e
+    parser.add_argument("--e2e_algorithm", type=str, default='PGNet')
+    parser.add_argument("--e2e_model_path", type=str)
+    parser.add_argument("--e2e_limit_side_len", type=float, default=768)
+    parser.add_argument("--e2e_limit_type", type=str, default='max')
+    # PGNet parmas
+    parser.add_argument("--e2e_pgnet_score_thresh", type=float, default=0.5)
+    parser.add_argument(
+        "--e2e_char_dict_path", type=str,
+        default=os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
+                             'pytorchocr/utils/ic15_dict.txt'))
+    parser.add_argument("--e2e_pgnet_valid_set", type=str, default='totaltext')
+    parser.add_argument("--e2e_pgnet_polygon", type=bool, default=True)
+    parser.add_argument("--e2e_pgnet_mode", type=str, default='fast')
+    # SR parmas
+    parser.add_argument("--sr_model_path", type=str)
+    parser.add_argument("--sr_image_shape", type=str, default="3, 32, 128")
+    parser.add_argument("--sr_batch_num", type=int, default=1)
+    # params .yaml
+    parser.add_argument("--det_yaml_path", type=str, default=None)
+    parser.add_argument("--rec_yaml_path", type=str, default=None)
+    parser.add_argument("--cls_yaml_path", type=str, default=None)
+    parser.add_argument("--e2e_yaml_path", type=str, default=None)
+    parser.add_argument("--sr_yaml_path", type=str, default=None)
+    # multi-process
+    parser.add_argument("--use_mp", type=str2bool, default=False)
+    parser.add_argument("--total_process_num", type=int, default=1)
+    parser.add_argument("--process_id", type=int, default=0)
+    parser.add_argument("--benchmark", type=str2bool, default=False)
+    parser.add_argument("--save_log_path", type=str, default="./log_output/")
+    parser.add_argument("--show_log", type=str2bool, default=True)
+    return parser
+def parse_args():
+    parser = init_args()
+    return parser.parse_args()
+def get_default_config(args):
+    return vars(args)
+def read_network_config_from_yaml(yaml_path, char_num=None):
+    if not os.path.exists(yaml_path):
+        raise FileNotFoundError('{} is not existed.'.format(yaml_path))
+    import yaml
+    with open(yaml_path, encoding='utf-8') as f:
+        res = yaml.safe_load(f)
+    if res.get('Architecture') is None:
+        raise ValueError('{} has no Architecture'.format(yaml_path))
+    if res['Architecture']['Head']['name'] == 'MultiHead' and char_num is not None:
+        res['Architecture']['Head']['out_channels_list'] = {
+            'CTCLabelDecode': char_num,
+            'SARLabelDecode': char_num + 2,
+            'NRTRLabelDecode': char_num + 3
+        }
+    return res['Architecture']
+def AnalysisConfig(weights_path, yaml_path=None, char_num=None):
+    if not os.path.exists(os.path.abspath(weights_path)):
+        raise FileNotFoundError('{} is not found.'.format(weights_path))
+    if yaml_path is not None:
+        return read_network_config_from_yaml(yaml_path, char_num=char_num)
+def resize_img(img, input_size=600):
+    """
+    resize img and limit the longest side of the image to input_size
+    """
+    img = np.array(img)
+    im_shape = img.shape
+    im_size_max = np.max(im_shape[0:2])
+    im_scale = float(input_size) / float(im_size_max)
+    img = cv2.resize(img, None, None, fx=im_scale, fy=im_scale)
+    return img
+def str_count(s):
+    """
+    Count the number of Chinese characters,
+    a single English character and a single number
+    equal to half the length of Chinese characters.
+    args:
+        s(string): the input of string
+    return(int):
+        the number of Chinese characters
+    """
+    import string
+    count_zh = count_pu = 0
+    s_len = len(s)
+    en_dg_count = 0
+    for c in s:
+        if c in string.ascii_letters or c.isdigit() or c.isspace():
+            en_dg_count += 1
+        elif c.isalpha():
+            count_zh += 1
+        else:
+            count_pu += 1
+    return s_len - math.ceil(en_dg_count / 2)
+def base64_to_cv2(b64str):
+    import base64
+    data = base64.b64decode(b64str.encode('utf8'))
+    data = np.fromstring(data, np.uint8)
+    data = cv2.imdecode(data, cv2.IMREAD_COLOR)
+    return data
+def get_arch_config(model_path):
+    from omegaconf import OmegaConf
+    all_arch_config = OmegaConf.load(DEFAULT_CFG_PATH)
+    path = Path(model_path)
+    file_name = path.stem
+    if file_name not in all_arch_config:
+        raise ValueError(f"architecture {file_name} is not in arch_config.yaml")
+    arch_config = all_arch_config[file_name]
+    return arch_config

magic_pdf/model/sub_modules/table/rapidtable/rapid_table.py CHANGED Viewed

@@ -9,7 +9,7 @@ from magic_pdf.libs.config_reader import get_device
 class RapidTableModel(object):
-    def __init__(self, ocr_engine, table_sub_model_name):
+    def __init__(self, ocr_engine, table_sub_model_name='slanet_plus'):
         sub_model_list = [model.value for model in ModelType]
         if table_sub_model_name is None:
             input_args = RapidTableInput()
@@ -23,25 +23,17 @@ class RapidTableModel(object):
         self.table_model = RapidTable(input_args)
-        # if ocr_engine is None:
-        #     self.ocr_model_name = "RapidOCR"
-        #     if torch.cuda.is_available():
-        #         from rapidocr_paddle import RapidOCR
-        #         self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
-        #     else:
-        #         from rapidocr_onnxruntime import RapidOCR
-        #         self.ocr_engine = RapidOCR()
+        # self.ocr_model_name = "RapidOCR"
+        # if torch.cuda.is_available():
+        #     from rapidocr_paddle import RapidOCR
+        #     self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
         # else:
-        #     self.ocr_model_name = "PaddleOCR"
-        #     self.ocr_engine = ocr_engine
+        #     from rapidocr_onnxruntime import RapidOCR
+        #     self.ocr_engine = RapidOCR()
+        self.ocr_model_name = "PaddleOCR"
+        self.ocr_engine = ocr_engine
-        self.ocr_model_name = "RapidOCR"
-        if torch.cuda.is_available():
-            from rapidocr_paddle import RapidOCR
-            self.ocr_engine = RapidOCR(det_use_cuda=True, cls_use_cuda=True, rec_use_cuda=True)
-        else:
-            from rapidocr_onnxruntime import RapidOCR
-            self.ocr_engine = RapidOCR()
     def predict(self, image):

magic_pdf/pdf_parse_union_core_v2.py CHANGED Viewed

@@ -4,6 +4,7 @@ import os
 import re
 import statistics
 import time
+import warnings
 from typing import List
 import cv2
@@ -11,6 +12,7 @@ import fitz
 import torch
 import numpy as np
 from loguru import logger
+from tqdm import tqdm
 from magic_pdf.config.enums import SupportedPdfParseMethod
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
@@ -21,20 +23,9 @@ from magic_pdf.libs.config_reader import get_local_layoutreader_model_dir, get_l
 from magic_pdf.libs.convert_utils import dict_to_list
 from magic_pdf.libs.hash_utils import compute_md5
 from magic_pdf.libs.pdf_image_tools import cut_image_to_pil_image
-from magic_pdf.libs.performance_stats import measure_time, PerformanceStats
 from magic_pdf.model.magic_model import MagicModel
 from magic_pdf.post_proc.llm_aided import llm_aided_formula, llm_aided_text, llm_aided_title
-from concurrent.futures import ThreadPoolExecutor
-try:
-    import torchtext
-    if torchtext.__version__ >= '0.18.0':
-        torchtext.disable_torchtext_deprecation_warning()
-except ImportError:
-    pass
 from magic_pdf.model.sub_modules.model_init import AtomModelSingleton
 from magic_pdf.post_proc.para_split_v3 import para_split
 from magic_pdf.pre_proc.construct_page_dict import ocr_construct_page_component_v2
@@ -42,7 +33,7 @@ from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
 from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
 from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
 from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
-    remove_overlaps_min_spans, check_chars_is_overlap_in_span
+    remove_overlaps_min_spans, remove_x_overlapping_chars
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
@@ -64,14 +55,6 @@ def __replace_STX_ETX(text_str: str):
     return text_str
-def __replace_0xfffd(text_str: str):
-    """Replace \ufffd, as these characters become garbled when extracted using pymupdf."""
-    if text_str:
-        s = text_str.replace('\ufffd', " ")
-        return s
-    return text_str
 # 连写字符拆分
 def __replace_ligatures(text: str):
     ligatures = {
@@ -84,16 +67,17 @@ def chars_to_content(span):
     # 检查span中的char是否为空
     if len(span['chars']) == 0:
         pass
-        # span['content'] = ''
-    elif check_chars_is_overlap_in_span(span['chars']):
-        pass
     else:
         # 先给chars按char['bbox']的中心点的x坐标排序
         span['chars'] = sorted(span['chars'], key=lambda x: (x['bbox'][0] + x['bbox'][2]) / 2)
-        # 求char的平均宽度
-        char_width_sum = sum([char['bbox'][2] - char['bbox'][0] for char in span['chars']])
-        char_avg_width = char_width_sum / len(span['chars'])
+        # Calculate the width of each character
+        char_widths = [char['bbox'][2] - char['bbox'][0] for char in span['chars']]
+        # Calculate the median width
+        median_width = statistics.median(char_widths)
+        # 通过x轴重叠比率移除一部分char
+        span = remove_x_overlapping_chars(span, median_width)
         content = ''
         for char in span['chars']:
@@ -101,13 +85,12 @@ def chars_to_content(span):
             # 如果下一个char的x0和上一个char的x1距离超过0.25个字符宽度，则需要在中间插入一个空格
             char1 = char
             char2 = span['chars'][span['chars'].index(char) + 1] if span['chars'].index(char) + 1 < len(span['chars']) else None
-            if char2 and char2['bbox'][0] - char1['bbox'][2] > char_avg_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
+            if char2 and char2['bbox'][0] - char1['bbox'][2] > median_width * 0.25 and char['c'] != ' ' and char2['c'] != ' ':
                 content += f"{char['c']} "
             else:
                 content += char['c']
-        content = __replace_ligatures(content)
-        span['content'] = __replace_0xfffd(content)
+        span['content'] = __replace_ligatures(content)
     del span['chars']
@@ -122,10 +105,6 @@ def fill_char_in_spans(spans, all_chars):
     spans = sorted(spans, key=lambda x: x['bbox'][1])
     for char in all_chars:
-        # 跳过非法bbox的char
-        # x1, y1, x2, y2 = char['bbox']
-        # if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01:
-        #     continue
         for span in spans:
             if calculate_char_in_span(char['bbox'], span['bbox'], char['c']):
@@ -215,7 +194,7 @@ def calculate_contrast(img, img_mode) -> float:
     std_dev = np.std(gray_img)
     # 对比度定义为标准差除以平均值（加上小常数避免除零错误）
     contrast = std_dev / (mean_value + 1e-6)
-    # logger.info(f"contrast: {contrast}")
+    # logger.debug(f"contrast: {contrast}")
     return round(contrast, 2)
 # @measure_time
@@ -308,41 +287,53 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
     if len(need_ocr_spans) > 0:
         # 初始化ocr模型
-        atom_model_manager = AtomModelSingleton()
-        ocr_model = atom_model_manager.get_atom_model(
-            atom_model_name='ocr',
-            ocr_show_log=False,
-            det_db_box_thresh=0.3,
-            lang=lang
-        )
+        # atom_model_manager = AtomModelSingleton()
+        # ocr_model = atom_model_manager.get_atom_model(
+        #     atom_model_name='ocr',
+        #     ocr_show_log=False,
+        #     det_db_box_thresh=0.3,
+        #     lang=lang
+        # )
         for span in need_ocr_spans:
             # 对span的bbox截图再ocr
             span_img = cut_image_to_pil_image(span['bbox'], pdf_page, mode='cv2')
             # 计算span的对比度，低于0.20的span不进行ocr
-            if calculate_contrast(span_img, img_mode='bgr') <= 0.20:
+            if calculate_contrast(span_img, img_mode='bgr') <= 0.17:
                 spans.remove(span)
                 continue
+                # pass
+            span['content'] = ''
+            span['score'] = 1
+            span['np_img'] = span_img
-            ocr_res = ocr_model.ocr(span_img, det=False)
-            if ocr_res and len(ocr_res) > 0:
-                if len(ocr_res[0]) > 0:
-                    ocr_text, ocr_score = ocr_res[0][0]
-                    # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
-                    if ocr_score > 0.5 and len(ocr_text) > 0:
-                        span['content'] = ocr_text
-                        span['score'] = ocr_score
-                    else:
-                        spans.remove(span)
+            # ocr_res = ocr_model.ocr(span_img, det=False)
+            # if ocr_res and len(ocr_res) > 0:
+            #     if len(ocr_res[0]) > 0:
+            #         ocr_text, ocr_score = ocr_res[0][0]
+            #         # logger.info(f"ocr_text: {ocr_text}, ocr_score: {ocr_score}")
+            #         if ocr_score > 0.5 and len(ocr_text) > 0:
+            #             span['content'] = ocr_text
+            #             span['score'] = float(round(ocr_score, 2))
+            #         else:
+            #             spans.remove(span)
     return spans
 def model_init(model_name: str):
     from transformers import LayoutLMv3ForTokenClassification
-    device = torch.device(get_device())
+    device_name = get_device()
+    bf_16_support = False
+    if device_name.startswith("cuda"):
+        bf_16_support = torch.cuda.is_bf16_supported()
+    elif device_name.startswith("mps"):
+        bf_16_support = True
+    device = torch.device(device_name)
     if model_name == 'layoutreader':
         # 检测modelscope的缓存目录是否存在
         layoutreader_model_dir = get_local_layoutreader_model_dir()
@@ -357,7 +348,10 @@ def model_init(model_name: str):
             model = LayoutLMv3ForTokenClassification.from_pretrained(
                 'hantian/layoutreader'
             )
-        model.to(device).eval()
+        if bf_16_support:
+            model.to(device).eval().bfloat16()
+        else:
+            model.to(device).eval()
     else:
         logger.error('model name not allow')
         exit(1)
@@ -383,9 +377,12 @@ def do_predict(boxes: List[List[int]], model) -> List[int]:
     from magic_pdf.model.sub_modules.reading_oreder.layoutreader.helpers import (
         boxes2inputs, parse_logits, prepare_inputs)
-    inputs = boxes2inputs(boxes)
-    inputs = prepare_inputs(inputs, model)
-    logits = model(**inputs).logits.cpu().squeeze(0)
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning, module="transformers")
+        inputs = boxes2inputs(boxes)
+        inputs = prepare_inputs(inputs, model)
+        logits = model(**inputs).logits.cpu().squeeze(0)
     return parse_logits(logits, len(boxes))
@@ -463,20 +460,20 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
         if (
             block_height > page_h * 0.25 and page_w * 0.5 > block_weight > page_w * 0.25
         ):  # 可能是双列结构，可以切细点
-            lines = int(block_height / line_height) + 1
+            lines = int(block_height / line_height)
         else:
             # 如果block的宽度超过0.4页面宽度，则将block分成3行(是一种复杂布局，图不能切的太细)
             if block_weight > page_w * 0.4:
                 lines = 3
-                line_height = (y1 - y0) / lines
             elif block_weight > page_w * 0.25:  # （可能是三列结构，也切细点）
-                lines = int(block_height / line_height) + 1
+                lines = int(block_height / line_height)
             else:  # 判断长宽比
                 if block_height / block_weight > 1.2:  # 细长的不分
                     return [[x0, y0, x1, y1]]
                 else:  # 不细长的还是分成两行
                     lines = 2
-                    line_height = (y1 - y0) / lines
+        line_height = (y1 - y0) / lines
         # 确定从哪个y位置开始绘制线条
         current_y = y0
@@ -492,7 +489,7 @@ def insert_lines_into_block(block_bbox, line_height, page_w, page_h):
     else:
         return [[x0, y0, x1, y1]]
-# @measure_time
 def sort_lines_by_model(fix_blocks, page_w, page_h, line_height):
     page_line_list = []
@@ -936,17 +933,18 @@ def pdf_parse_union(
         logger.warning('end_page_id is out of range, use pdf_docs length')
         end_page_id = len(dataset) - 1
-    """初始化启动时间"""
-    start_time = time.time()
+    # """初始化启动时间"""
+    # start_time = time.time()
-    for page_id, page in enumerate(dataset):
-        """debug时输出每页解析的耗时."""
-        if debug_mode:
-            time_now = time.time()
-            logger.info(
-                f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
-            )
-            start_time = time_now
+    # for page_id, page in enumerate(dataset):
+    for page_id, page in tqdm(enumerate(dataset), total=len(dataset), desc="Processing pages"):
+        # """debug时输出每页解析的耗时."""
+        # if debug_mode:
+            # time_now = time.time()
+            # logger.info(
+            #     f'page_id: {page_id}, last_page_cost_time: {round(time.time() - start_time, 2)}'
+            # )
+            # start_time = time_now
         """解析pdf中的每一页"""
         if start_page_id <= page_id <= end_page_id:
@@ -962,7 +960,47 @@ def pdf_parse_union(
             )
         pdf_info_dict[f'page_{page_id}'] = page_info
-    # PerformanceStats.print_stats()
+    need_ocr_list = []
+    img_crop_list = []
+    text_block_list = []
+    for pange_id, page_info in pdf_info_dict.items():
+        for block in page_info['preproc_blocks']:
+            if block['type'] in ['table', 'image']:
+                for sub_block in block['blocks']:
+                    if sub_block['type'] in ['image_caption', 'image_footnote', 'table_caption', 'table_footnote']:
+                        text_block_list.append(sub_block)
+            elif block['type'] in ['text', 'title']:
+                text_block_list.append(block)
+        for block in page_info['discarded_blocks']:
+            text_block_list.append(block)
+    for block in text_block_list:
+        for line in block['lines']:
+            for span in line['spans']:
+                if 'np_img' in span:
+                    need_ocr_list.append(span)
+                    img_crop_list.append(span['np_img'])
+                    span.pop('np_img')
+    if len(img_crop_list) > 0:
+        # Get OCR results for this language's images
+        atom_model_manager = AtomModelSingleton()
+        ocr_model = atom_model_manager.get_atom_model(
+            atom_model_name='ocr',
+            ocr_show_log=False,
+            det_db_box_thresh=0.3,
+            lang=lang
+        )
+        # rec_start = time.time()
+        ocr_res_list = ocr_model.ocr(img_crop_list, det=False, tqdm_enable=True)[0]
+        # Verify we have matching counts
+        assert len(ocr_res_list) == len(need_ocr_list), f'ocr_res_list: {len(ocr_res_list)}, need_ocr_list: {len(need_ocr_list)}'
+        # Process OCR results for this language
+        for index, span in enumerate(need_ocr_list):
+            ocr_text, ocr_score = ocr_res_list[index]
+            span['content'] = ocr_text
+            span['score'] = float(round(ocr_score, 2))
+        # rec_time = time.time() - rec_start
+        # logger.info(f'ocr-dynamic-rec time: {round(rec_time, 2)}, total images processed: {len(img_crop_list)}')
     """分段"""
     para_split(pdf_info_dict)

magic_pdf/post_proc/para_split_v3.py CHANGED Viewed

@@ -108,29 +108,32 @@ def __is_list_or_index_block(block):
         ):
             multiple_para_flag = True
-        for line in block['lines']:
-            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
-            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
-            if (
-                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
-                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
-            ):
-                external_sides_not_close_num += 1
-            if abs(line_mid_x - block_mid_x) < line_height / 2:
-                center_close_num += 1
+        block_text = ''
+        for line in block['lines']:
             line_text = ''
             for span in line['spans']:
                 span_type = span['type']
                 if span_type == ContentType.Text:
                     line_text += span['content'].strip()
             # 添加所有文本，包括空行，保持与block['lines']长度一致
             lines_text_list.append(line_text)
             block_text = ''.join(lines_text_list)
-            block_lang = detect_lang(block_text)
-            # logger.info(f"block_lang: {block_lang}")
+        block_lang = detect_lang(block_text)
+        # logger.info(f"block_lang: {block_lang}")
+        for line in block['lines']:
+            line_mid_x = (line['bbox'][0] + line['bbox'][2]) / 2
+            block_mid_x = (block['bbox_fs'][0] + block['bbox_fs'][2]) / 2
+            if (
+                line['bbox'][0] - block['bbox_fs'][0] > 0.7 * line_height
+                and block['bbox_fs'][2] - line['bbox'][2] > 0.7 * line_height
+            ):
+                external_sides_not_close_num += 1
+            if abs(line_mid_x - block_mid_x) < line_height / 2:
+                center_close_num += 1
             # 计算line左侧顶格数量是否大于2，是否顶格用abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height/2 来判断
             if abs(block['bbox_fs'][0] - line['bbox'][0]) < line_height / 2:

magic_pdf/pre_proc/ocr_dict_merge.py CHANGED Viewed

@@ -62,7 +62,15 @@ def merge_spans_to_line(spans, threshold=0.6):
 def span_block_type_compatible(span_type, block_type):
     if span_type in [ContentType.Text, ContentType.InlineEquation]:
-        return block_type in [BlockType.Text, BlockType.Title, BlockType.ImageCaption, BlockType.ImageFootnote, BlockType.TableCaption, BlockType.TableFootnote]
+        return block_type in [
+            BlockType.Text,
+            BlockType.Title,
+            BlockType.ImageCaption,
+            BlockType.ImageFootnote,
+            BlockType.TableCaption,
+            BlockType.TableFootnote,
+            BlockType.Discarded
+        ]
     elif span_type == ContentType.InterlineEquation:
         return block_type in [BlockType.InterlineEquation, BlockType.Text]
     elif span_type == ContentType.Image:

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl

magic-pdf 1.2.1__py3-none-any.whl → 1.3.0__py3-none-any.whl