PyPI - magic-pdf - Versions diffs - 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl - Mend

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57)

magic_pdf/config/__init__.py +0 -0
magic_pdf/config/enums.py +7 -0
magic_pdf/config/exceptions.py +32 -0
magic_pdf/data/__init__.py +0 -0
magic_pdf/data/data_reader_writer/__init__.py +12 -0
magic_pdf/data/data_reader_writer/base.py +51 -0
magic_pdf/data/data_reader_writer/filebase.py +59 -0
magic_pdf/data/data_reader_writer/multi_bucket_s3.py +143 -0
magic_pdf/data/data_reader_writer/s3.py +73 -0
magic_pdf/data/dataset.py +194 -0
magic_pdf/data/io/__init__.py +6 -0
magic_pdf/data/io/base.py +42 -0
magic_pdf/data/io/http.py +37 -0
magic_pdf/data/io/s3.py +114 -0
magic_pdf/data/read_api.py +95 -0
magic_pdf/data/schemas.py +19 -0
magic_pdf/data/utils.py +32 -0
magic_pdf/dict2md/ocr_mkcontent.py +106 -244
magic_pdf/libs/Constants.py +21 -8
magic_pdf/libs/MakeContentConfig.py +1 -0
magic_pdf/libs/boxbase.py +35 -0
magic_pdf/libs/clean_memory.py +10 -0
magic_pdf/libs/config_reader.py +53 -23
magic_pdf/libs/draw_bbox.py +150 -65
magic_pdf/libs/ocr_content_type.py +2 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +77 -32
magic_pdf/model/magic_model.py +331 -15
magic_pdf/model/pdf_extract_kit.py +170 -83
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +40 -16
magic_pdf/model/ppTableModel.py +8 -6
magic_pdf/model/pp_structure_v2.py +5 -2
magic_pdf/model/v3/__init__.py +0 -0
magic_pdf/model/v3/helpers.py +125 -0
magic_pdf/para/para_split_v3.py +322 -0
magic_pdf/pdf_parse_by_ocr.py +6 -3
magic_pdf/pdf_parse_by_txt.py +6 -3
magic_pdf/pdf_parse_union_core_v2.py +644 -0
magic_pdf/pipe/AbsPipe.py +5 -1
magic_pdf/pipe/OCRPipe.py +10 -4
magic_pdf/pipe/TXTPipe.py +10 -4
magic_pdf/pipe/UNIPipe.py +16 -7
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +83 -1
magic_pdf/pre_proc/ocr_dict_merge.py +27 -2
magic_pdf/resources/model_config/UniMERNet/demo.yaml +7 -7
magic_pdf/resources/model_config/model_configs.yaml +5 -13
magic_pdf/tools/cli.py +14 -1
magic_pdf/tools/common.py +18 -8
magic_pdf/user_api.py +25 -6
magic_pdf/utils/__init__.py +0 -0
magic_pdf/utils/annotations.py +11 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/LICENSE.md +1 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/METADATA +124 -78
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/RECORD +57 -33
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/WHEEL +0 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.8.1.dist-info → magic_pdf-0.9.1.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import re
-import wordninja
 from loguru import logger
 from magic_pdf.libs.commons import join_path
@@ -8,6 +7,7 @@ from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+from magic_pdf.para.para_split_v3 import ListLineTag
 def __is_hyphen_at_line_end(line):
@@ -24,37 +24,6 @@ def __is_hyphen_at_line_end(line):
     return bool(re.search(r'[A-Za-z]+-\s*$', line))
-def split_long_words(text):
-    segments = text.split(' ')
-    for i in range(len(segments)):
-        words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
-        for j in range(len(words)):
-            if len(words[j]) > 10:
-                words[j] = ' '.join(wordninja.split(words[j]))
-        segments[i] = ''.join(words)
-    return ' '.join(segments)
-def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
-    markdown = []
-    for page_info in pdf_info_list:
-        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(
-            paras_of_layout, 'mm', img_buket_path)
-        markdown.extend(page_markdown)
-    return '\n\n'.join(markdown)
-def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
-    markdown = []
-    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get('para_blocks')
-        page_markdown = ocr_mk_markdown_with_para_core_v2(
-            paras_of_layout, 'nlp')
-        markdown.extend(page_markdown)
-    return '\n\n'.join(markdown)
 def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
                                                 img_buket_path):
     markdown_with_para_and_pagination = []
@@ -67,61 +36,23 @@ def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
             paras_of_layout, 'mm', img_buket_path)
         markdown_with_para_and_pagination.append({
             'page_no':
-            page_no,
+                page_no,
             'md_content':
-            '\n\n'.join(page_markdown)
+                '\n\n'.join(page_markdown)
         })
         page_no += 1
     return markdown_with_para_and_pagination
-def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
-    page_markdown = []
-    for paras in paras_of_layout:
-        for para in paras:
-            para_text = ''
-            for line in para:
-                for span in line['spans']:
-                    span_type = span.get('type')
-                    content = ''
-                    language = ''
-                    if span_type == ContentType.Text:
-                        content = span['content']
-                        language = detect_lang(content)
-                        if (language == 'en'):  # 只对英文长词进行分词处理，中文分词会丢失文本
-                            content = ocr_escape_special_markdown_char(
-                                split_long_words(content))
-                        else:
-                            content = ocr_escape_special_markdown_char(content)
-                    elif span_type == ContentType.InlineEquation:
-                        content = f"${span['content']}$"
-                    elif span_type == ContentType.InterlineEquation:
-                        content = f"\n$$\n{span['content']}\n$$\n"
-                    elif span_type in [ContentType.Image, ContentType.Table]:
-                        if mode == 'mm':
-                            content = f"\n![]({join_path(img_buket_path, span['image_path'])})\n"
-                        elif mode == 'nlp':
-                            pass
-                    if content != '':
-                        if language == 'en':  # 英文语境下 content间需要空格分隔
-                            para_text += content + ' '
-                        else:  # 中文语境下，content间不需要空格分隔
-                            para_text += content
-            if para_text.strip() == '':
-                continue
-            else:
-                page_markdown.append(para_text.strip() + '  ')
-    return page_markdown
 def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                       mode,
-                                      img_buket_path=''):
+                                      img_buket_path='',
+                                      ):
     page_markdown = []
     for para_block in paras_of_layout:
         para_text = ''
         para_type = para_block['type']
-        if para_type == BlockType.Text:
+        if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
             para_text = f'# {merge_para_with_text(para_block)}'
@@ -136,20 +67,21 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                         for line in block['lines']:
                             for span in line['spans']:
                                 if span['type'] == ContentType.Image:
-                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                                    if span.get('image_path', ''):
+                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 2nd.拼image_caption
                     if block['type'] == BlockType.ImageCaption:
-                        para_text += merge_para_with_text(block)
-                for block in para_block['blocks']:  # 2nd.拼image_caption
+                        para_text += merge_para_with_text(block) + '  \n'
+                for block in para_block['blocks']:  # 3rd.拼image_footnote
                     if block['type'] == BlockType.ImageFootnote:
-                        para_text += merge_para_with_text(block)
+                        para_text += merge_para_with_text(block) + '  \n'
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block)
+                        para_text += merge_para_with_text(block) + '  \n'
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:
@@ -160,11 +92,11 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
                                         para_text += f"\n\n$\n {span['latex']}\n$\n\n"
                                     elif span.get('html', ''):
                                         para_text += f"\n\n{span['html']}\n\n"
-                                    else:
+                                    elif span.get('image_path', ''):
                                         para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
-                        para_text += merge_para_with_text(block)
+                        para_text += merge_para_with_text(block) + '  \n'
         if para_text.strip() == '':
             continue
@@ -174,22 +106,36 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
     return page_markdown
-def merge_para_with_text(para_block):
-    def detect_language(text):
-        en_pattern = r'[a-zA-Z]+'
-        en_matches = re.findall(en_pattern, text)
-        en_length = sum(len(match) for match in en_matches)
-        if len(text) > 0:
-            if en_length / len(text) >= 0.5:
-                return 'en'
-            else:
-                return 'unknown'
+def detect_language(text):
+    en_pattern = r'[a-zA-Z]+'
+    en_matches = re.findall(en_pattern, text)
+    en_length = sum(len(match) for match in en_matches)
+    if len(text) > 0:
+        if en_length / len(text) >= 0.5:
+            return 'en'
         else:
-            return 'empty'
+            return 'unknown'
+    else:
+        return 'empty'
+# 连写字符拆分
+def __replace_ligatures(text: str):
+    text = re.sub(r'ﬁ', 'fi', text)  # 替换 fi 连写符
+    text = re.sub(r'ﬂ', 'fl', text)  # 替换 fl 连写符
+    text = re.sub(r'ﬀ', 'ff', text)  # 替换 ff 连写符
+    text = re.sub(r'ﬃ', 'ffi', text)  # 替换 ffi 连写符
+    text = re.sub(r'ﬄ', 'ffl', text)  # 替换 ffl 连写符
+    return text
+def merge_para_with_text(para_block):
     para_text = ''
-    for line in para_block['lines']:
+    for i, line in enumerate(para_block['lines']):
+        if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
+            para_text += '  \n'
         line_text = ''
         line_lang = ''
         for span in line['spans']:
@@ -199,208 +145,120 @@ def merge_para_with_text(para_block):
         if line_text != '':
             line_lang = detect_lang(line_text)
         for span in line['spans']:
             span_type = span['type']
             content = ''
             if span_type == ContentType.Text:
-                content = span['content']
-                # language = detect_lang(content)
-                language = detect_language(content)
-                if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                    content = ocr_escape_special_markdown_char(
-                        split_long_words(content))
-                else:
-                    content = ocr_escape_special_markdown_char(content)
+                content = ocr_escape_special_markdown_char(span['content'])
             elif span_type == ContentType.InlineEquation:
-                content = f" ${span['content']}$ "
+                content = f"${span['content']}$"
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
+            content = content.strip()
             if content != '':
                 langs = ['zh', 'ja', 'ko']
                 if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
-                elif line_lang == 'en':
-                    # 如果是前一行带有-连字符，那么末尾不应该加空格
-                    if __is_hyphen_at_line_end(content):
-                        para_text += content[:-1]
-                    else:
-                        para_text += content + ' '
+                    if span_type in [ContentType.Text, ContentType.InterlineEquation]:
+                        para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
+                    elif span_type == ContentType.InlineEquation:
+                        para_text += f" {content} "
                 else:
-                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
-    return para_text
+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                        # 如果是前一行带有-连字符，那么末尾不应该加空格
+                        if __is_hyphen_at_line_end(content):
+                            para_text += content[:-1]
+                        elif len(content) == 1 and content not in ['A', 'I', 'a', 'i']:
+                            para_text += content
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f"{content} "
+                    elif span_type == ContentType.InterlineEquation:
+                        para_text += content
+            else:
+                continue
+    # 连写字符拆分
+    para_text = __replace_ligatures(para_text)
-def para_to_standard_format(para, img_buket_path):
-    para_content = {}
-    if len(para) == 1:
-        para_content = line_to_standard_format(para[0], img_buket_path)
-    elif len(para) > 1:
-        para_text = ''
-        inline_equation_num = 0
-        for line in para:
-            for span in line['spans']:
-                language = ''
-                span_type = span.get('type')
-                content = ''
-                if span_type == ContentType.Text:
-                    content = span['content']
-                    language = detect_lang(content)
-                    if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                        content = ocr_escape_special_markdown_char(
-                            split_long_words(content))
-                    else:
-                        content = ocr_escape_special_markdown_char(content)
-                elif span_type == ContentType.InlineEquation:
-                    content = f"${span['content']}$"
-                    inline_equation_num += 1
-                if language == 'en':  # 英文语境下 content间需要空格分隔
-                    para_text += content + ' '
-                else:  # 中文语境下，content间不需要空格分隔
-                    para_text += content
-        para_content = {
-            'type': 'text',
-            'text': para_text,
-            'inline_equation_num': inline_equation_num,
-        }
-    return para_content
+    return para_text
-def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
+def para_to_standard_format_v2(para_block, img_buket_path, page_idx, drop_reason=None):
     para_type = para_block['type']
-    if para_type == BlockType.Text:
+    para_content = {}
+    if para_type in [BlockType.Text, BlockType.List, BlockType.Index]:
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'page_idx': page_idx,
         }
     elif para_type == BlockType.Title:
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
             'text_level': 1,
-            'page_idx': page_idx,
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
             'type': 'equation',
             'text': merge_para_with_text(para_block),
             'text_format': 'latex',
-            'page_idx': page_idx,
         }
     elif para_type == BlockType.Image:
-        para_content = {'type': 'image', 'page_idx': page_idx}
+        para_content = {'type': 'image', 'img_path': '', 'img_caption': [], 'img_footnote': []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.ImageBody:
-                para_content['img_path'] = join_path(
-                    img_buket_path,
-                    block['lines'][0]['spans'][0]['image_path'])
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Image:
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
             if block['type'] == BlockType.ImageCaption:
-                para_content['img_caption'] = merge_para_with_text(block)
+                para_content['img_caption'].append(merge_para_with_text(block))
             if block['type'] == BlockType.ImageFootnote:
-                para_content['img_footnote'] = merge_para_with_text(block)
+                para_content['img_footnote'].append(merge_para_with_text(block))
     elif para_type == BlockType.Table:
-        para_content = {'type': 'table', 'page_idx': page_idx}
+        para_content = {'type': 'table', 'img_path': '', 'table_caption': [], 'table_footnote': []}
         for block in para_block['blocks']:
             if block['type'] == BlockType.TableBody:
-                if block["lines"][0]["spans"][0].get('latex', ''):
-                    para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
-                elif block["lines"][0]["spans"][0].get('html', ''):
-                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
-                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+                for line in block['lines']:
+                    for span in line['spans']:
+                        if span['type'] == ContentType.Table:
+                            if span.get('latex', ''):
+                                para_content['table_body'] = f"\n\n$\n {span['latex']}\n$\n\n"
+                            elif span.get('html', ''):
+                                para_content['table_body'] = f"\n\n{span['html']}\n\n"
+                            if span.get('image_path', ''):
+                                para_content['img_path'] = join_path(img_buket_path, span['image_path'])
             if block['type'] == BlockType.TableCaption:
-                para_content['table_caption'] = merge_para_with_text(block)
+                para_content['table_caption'].append(merge_para_with_text(block))
             if block['type'] == BlockType.TableFootnote:
-                para_content['table_footnote'] = merge_para_with_text(block)
+                para_content['table_footnote'].append(merge_para_with_text(block))
-    return para_content
+    para_content['page_idx'] = page_idx
+    if drop_reason is not None:
+        para_content['drop_reason'] = drop_reason
-def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
-    content_list = []
-    for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get('para_blocks')
-        if not paras_of_layout:
-            continue
-        for para_block in paras_of_layout:
-            para_content = para_to_standard_format_v2(para_block,
-                                                      img_buket_path)
-            content_list.append(para_content)
-    return content_list
-def line_to_standard_format(line, img_buket_path):
-    line_text = ''
-    inline_equation_num = 0
-    for span in line['spans']:
-        if not span.get('content'):
-            if not span.get('image_path'):
-                continue
-            else:
-                if span['type'] == ContentType.Image:
-                    content = {
-                        'type': 'image',
-                        'img_path': join_path(img_buket_path,
-                                              span['image_path']),
-                    }
-                    return content
-                elif span['type'] == ContentType.Table:
-                    content = {
-                        'type': 'table',
-                        'img_path': join_path(img_buket_path,
-                                              span['image_path']),
-                    }
-                    return content
-        else:
-            if span['type'] == ContentType.InterlineEquation:
-                interline_equation = span['content']
-                content = {
-                    'type': 'equation',
-                    'latex': f'$$\n{interline_equation}\n$$'
-                }
-                return content
-            elif span['type'] == ContentType.InlineEquation:
-                inline_equation = span['content']
-                line_text += f'${inline_equation}$'
-                inline_equation_num += 1
-            elif span['type'] == ContentType.Text:
-                text_content = ocr_escape_special_markdown_char(
-                    span['content'])  # 转义特殊符号
-                line_text += text_content
-    content = {
-        'type': 'text',
-        'text': line_text,
-        'inline_equation_num': inline_equation_num,
-    }
-    return content
-def ocr_mk_mm_standard_format(pdf_info_dict: list):
-    """content_list type         string
-    image/text/table/equation(行间的单独拿出来，行内的和text合并) latex        string
-    latex文本字段。 text         string      纯文本格式的文本数据。 md           string
-    markdown格式的文本数据。 img_path     string      s3://full/path/to/img.jpg."""
-    content_list = []
-    for page_info in pdf_info_dict:
-        blocks = page_info.get('preproc_blocks')
-        if not blocks:
-            continue
-        for block in blocks:
-            for line in block['lines']:
-                content = line_to_standard_format(line)
-                content_list.append(content)
-    return content_list
+    return para_content
 def union_make(pdf_info_dict: list,
                make_mode: str,
                drop_mode: str,
-               img_buket_path: str = ''):
+               img_buket_path: str = '',
+               ):
     output_content = []
     for page_info in pdf_info_dict:
+        drop_reason_flag = False
+        drop_reason = None
         if page_info.get('need_drop', False):
             drop_reason = page_info.get('drop_reason')
             if drop_mode == DropMode.NONE:
                 pass
+            elif drop_mode == DropMode.NONE_WITH_REASON:
+                drop_reason_flag = True
             elif drop_mode == DropMode.WHOLE_PDF:
                 raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
                                  f'drop_reason is {drop_reason}'))
@@ -425,8 +283,12 @@ def union_make(pdf_info_dict: list,
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.STANDARD_FORMAT:
             for para_block in paras_of_layout:
-                para_content = para_to_standard_format_v2(
-                    para_block, img_buket_path, page_idx)
+                if drop_reason_flag:
+                    para_content = para_to_standard_format_v2(
+                        para_block, img_buket_path, page_idx)
+                else:
+                    para_content = para_to_standard_format_v2(
+                        para_block, img_buket_path, page_idx)
                 output_content.append(para_content)
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
         return '\n\n'.join(output_content)

magic_pdf/libs/Constants.py CHANGED Viewed

@@ -10,18 +10,12 @@ block维度自定义字段
 # block中lines是否被删除
 LINES_DELETED = "lines_deleted"
-# struct eqtable
-STRUCT_EQTABLE = "struct_eqtable"
 # table recognition max time default value
 TABLE_MAX_TIME_VALUE = 400
 # pp_table_result_max_length
 TABLE_MAX_LEN = 480
-# pp table structure algorithm
-TABLE_MASTER = "TableMaster"
 # table master structure dict
 TABLE_MASTER_DICT = "table_master_structure_dict.txt"
@@ -29,12 +23,31 @@ TABLE_MASTER_DICT = "table_master_structure_dict.txt"
 TABLE_MASTER_DIR = "table_structure_tablemaster_infer/"
 # pp detect model dir
-DETECT_MODEL_DIR = "ch_PP-OCRv3_det_infer"
+DETECT_MODEL_DIR = "ch_PP-OCRv4_det_infer"
 # pp rec model dir
-REC_MODEL_DIR = "ch_PP-OCRv3_rec_infer"
+REC_MODEL_DIR = "ch_PP-OCRv4_rec_infer"
 # pp rec char dict path
 REC_CHAR_DICT = "ppocr_keys_v1.txt"
+# pp rec copy rec directory
+PP_REC_DIRECTORY = ".paddleocr/whl/rec/ch/ch_PP-OCRv4_rec_infer"
+# pp rec copy det directory
+PP_DET_DIRECTORY = ".paddleocr/whl/det/ch/ch_PP-OCRv4_det_infer"
+class MODEL_NAME:
+    # pp table structure algorithm
+    TABLE_MASTER = "tablemaster"
+    # struct eqtable
+    STRUCT_EQTABLE = "struct_eqtable"
+    DocLayout_YOLO = "doclayout_yolo"
+    LAYOUTLMv3 = "layoutlmv3"
+    YOLO_V8_MFD = "yolo_v8_mfd"
+    UniMerNet_v2_Small = "unimernet_small"

magic_pdf/libs/MakeContentConfig.py CHANGED Viewed

@@ -8,3 +8,4 @@ class DropMode:
     WHOLE_PDF = "whole_pdf"
     SINGLE_PAGE = "single_page"
     NONE = "none"
+    NONE_WITH_REASON = "none_with_reason"

magic_pdf/libs/boxbase.py CHANGED Viewed

@@ -445,3 +445,38 @@ def get_overlap_area(bbox1, bbox2):
     # The area of overlap area
     return (x_right - x_left) * (y_bottom - y_top)
+def calculate_vertical_projection_overlap_ratio(block1, block2):
+    """
+    Calculate the proportion of the x-axis covered by the vertical projection of two blocks.
+    Args:
+        block1 (tuple): Coordinates of the first block (x0, y0, x1, y1).
+        block2 (tuple): Coordinates of the second block (x0, y0, x1, y1).
+    Returns:
+        float: The proportion of the x-axis covered by the vertical projection of the two blocks.
+    """
+    x0_1, _, x1_1, _ = block1
+    x0_2, _, x1_2, _ = block2
+    # Calculate the intersection of the x-coordinates
+    x_left = max(x0_1, x0_2)
+    x_right = min(x1_1, x1_2)
+    if x_right < x_left:
+        return 0.0
+    # Length of the intersection
+    intersection_length = x_right - x_left
+    # Length of the x-axis projection of the first block
+    block1_length = x1_1 - x0_1
+    if block1_length == 0:
+        return 0.0
+    # Proportion of the x-axis covered by the intersection
+    # logger.info(f"intersection_length: {intersection_length}, block1_length: {block1_length}")
+    return intersection_length / block1_length

magic_pdf/libs/clean_memory.py ADDED Viewed

@@ -0,0 +1,10 @@
+# Copyright (c) Opendatalab. All rights reserved.
+import torch
+import gc
+def clean_memory():
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+    gc.collect()

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl

magic-pdf 0.8.1__py3-none-any.whl → 0.9.1__py3-none-any.whl