PyPI - magic-pdf - Versions diffs - 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

magic_pdf/dict2md/ocr_mkcontent.py +134 -76
magic_pdf/integrations/__init__.py +0 -0
magic_pdf/integrations/rag/__init__.py +0 -0
magic_pdf/integrations/rag/api.py +82 -0
magic_pdf/integrations/rag/type.py +82 -0
magic_pdf/integrations/rag/utils.py +285 -0
magic_pdf/layout/layout_sort.py +472 -283
magic_pdf/libs/Constants.py +27 -1
magic_pdf/libs/boxbase.py +169 -149
magic_pdf/libs/draw_bbox.py +113 -87
magic_pdf/libs/ocr_content_type.py +21 -18
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +14 -2
magic_pdf/model/magic_model.py +230 -161
magic_pdf/model/model_list.py +8 -0
magic_pdf/model/pdf_extract_kit.py +135 -22
magic_pdf/model/pek_sub_modules/self_modify.py +84 -0
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +0 -1
magic_pdf/model/ppTableModel.py +67 -0
magic_pdf/para/para_split_v2.py +76 -74
magic_pdf/pdf_parse_union_core.py +34 -6
magic_pdf/pipe/AbsPipe.py +4 -1
magic_pdf/pipe/OCRPipe.py +7 -4
magic_pdf/pipe/TXTPipe.py +7 -4
magic_pdf/pipe/UNIPipe.py +11 -6
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +12 -3
magic_pdf/pre_proc/ocr_dict_merge.py +60 -59
magic_pdf/resources/model_config/model_configs.yaml +3 -1
magic_pdf/tools/cli.py +56 -29
magic_pdf/tools/cli_dev.py +61 -64
magic_pdf/tools/common.py +57 -37
magic_pdf/user_api.py +17 -9
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/METADATA +71 -33
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/RECORD +38 -32
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/WHEEL +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/entry_points.txt +0 -0
{magic_pdf-0.7.0b1.dist-info → magic_pdf-0.8.0.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -1,12 +1,27 @@
+import re
+import wordninja
 from loguru import logger
-from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.commons import join_path
 from magic_pdf.libs.language import detect_lang
+from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
-import wordninja
-import re
+from magic_pdf.libs.ocr_content_type import BlockType, ContentType
+def __is_hyphen_at_line_end(line):
+    """
+    Check if a line ends with one or more letters followed by a hyphen.
+    Args:
+    line (str): The line of text to check.
+    Returns:
+    bool: True if the line ends with one or more letters followed by a hyphen, False otherwise.
+    """
+    # Use regex to check if the line ends with one or more letters followed by a hyphen
+    return bool(re.search(r'[A-Za-z]+-\s*$', line))
 def split_long_words(text):
@@ -14,7 +29,7 @@ def split_long_words(text):
     for i in range(len(segments)):
         words = re.findall(r'\w+|[^\w]', segments[i], re.UNICODE)
         for j in range(len(words)):
-            if len(words[j]) > 15:
+            if len(words[j]) > 10:
                 words[j] = ' '.join(wordninja.split(words[j]))
         segments[i] = ''.join(words)
     return ' '.join(segments)
@@ -23,8 +38,9 @@ def split_long_words(text):
 def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
     markdown = []
     for page_info in pdf_info_list:
-        paras_of_layout = page_info.get("para_blocks")
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+        paras_of_layout = page_info.get('para_blocks')
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'mm', img_buket_path)
         markdown.extend(page_markdown)
     return '\n\n'.join(markdown)
@@ -32,29 +48,34 @@ def ocr_mk_mm_markdown_with_para(pdf_info_list: list, img_buket_path):
 def ocr_mk_nlp_markdown_with_para(pdf_info_dict: list):
     markdown = []
     for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
+        paras_of_layout = page_info.get('para_blocks')
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'nlp')
         markdown.extend(page_markdown)
     return '\n\n'.join(markdown)
-def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list, img_buket_path):
+def ocr_mk_mm_markdown_with_para_and_pagination(pdf_info_dict: list,
+                                                img_buket_path):
     markdown_with_para_and_pagination = []
     page_no = 0
     for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
         if not paras_of_layout:
             continue
-        page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+        page_markdown = ocr_mk_markdown_with_para_core_v2(
+            paras_of_layout, 'mm', img_buket_path)
         markdown_with_para_and_pagination.append({
-            'page_no': page_no,
-            'md_content': '\n\n'.join(page_markdown)
+            'page_no':
+            page_no,
+            'md_content':
+            '\n\n'.join(page_markdown)
         })
         page_no += 1
     return markdown_with_para_and_pagination
-def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
+def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=''):
     page_markdown = []
     for paras in paras_of_layout:
         for para in paras:
@@ -67,8 +88,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
                     if span_type == ContentType.Text:
                         content = span['content']
                         language = detect_lang(content)
-                        if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                            content = ocr_escape_special_markdown_char(split_long_words(content))
+                        if (language == 'en'):  # 只对英文长词进行分词处理，中文分词会丢失文本
+                            content = ocr_escape_special_markdown_char(
+                                split_long_words(content))
                         else:
                             content = ocr_escape_special_markdown_char(content)
                     elif span_type == ContentType.InlineEquation:
@@ -92,7 +114,9 @@ def ocr_mk_markdown_with_para_core(paras_of_layout, mode, img_buket_path=""):
     return page_markdown
-def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
+def ocr_mk_markdown_with_para_core_v2(paras_of_layout,
+                                      mode,
+                                      img_buket_path=''):
     page_markdown = []
     for para_block in paras_of_layout:
         para_text = ''
@@ -100,7 +124,7 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
         if para_type == BlockType.Text:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Title:
-            para_text = f"# {merge_para_with_text(para_block)}"
+            para_text = f'# {merge_para_with_text(para_block)}'
         elif para_type == BlockType.InterlineEquation:
             para_text = merge_para_with_text(para_block)
         elif para_type == BlockType.Image:
@@ -116,14 +140,16 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                 for block in para_block['blocks']:  # 2nd.拼image_caption
                     if block['type'] == BlockType.ImageCaption:
                         para_text += merge_para_with_text(block)
+                for block in para_block['blocks']:  # 2nd.拼image_caption
+                    if block['type'] == BlockType.ImageFootnote:
+                        para_text += merge_para_with_text(block)
         elif para_type == BlockType.Table:
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
-                table_caption = ''
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
-                        table_caption = merge_para_with_text(block)
+                        para_text += merge_para_with_text(block)
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:
@@ -132,8 +158,10 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
                                     # if processed by table model
                                     if span.get('latex', ''):
                                         para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    elif span.get('html', ''):
+                                        para_text += f"\n\n{span['html']}\n\n"
                                     else:
-                                        para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
+                                        para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
                         para_text += merge_para_with_text(block)
@@ -147,24 +175,39 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
 def merge_para_with_text(para_block):
+    def detect_language(text):
+        en_pattern = r'[a-zA-Z]+'
+        en_matches = re.findall(en_pattern, text)
+        en_length = sum(len(match) for match in en_matches)
+        if len(text) > 0:
+            if en_length / len(text) >= 0.5:
+                return 'en'
+            else:
+                return 'unknown'
+        else:
+            return 'empty'
     para_text = ''
     for line in para_block['lines']:
-        line_text = ""
-        line_lang = ""
+        line_text = ''
+        line_lang = ''
         for span in line['spans']:
             span_type = span['type']
             if span_type == ContentType.Text:
                 line_text += span['content'].strip()
-        if line_text != "":
+        if line_text != '':
             line_lang = detect_lang(line_text)
         for span in line['spans']:
             span_type = span['type']
             content = ''
             if span_type == ContentType.Text:
                 content = span['content']
-                language = detect_lang(content)
+                # language = detect_lang(content)
+                language = detect_language(content)
                 if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                    content = ocr_escape_special_markdown_char(split_long_words(content))
+                    content = ocr_escape_special_markdown_char(
+                        split_long_words(content))
                 else:
                     content = ocr_escape_special_markdown_char(content)
             elif span_type == ContentType.InlineEquation:
@@ -173,10 +216,17 @@ def merge_para_with_text(para_block):
                 content = f"\n$$\n{span['content']}\n$$\n"
             if content != '':
-                if 'zh' in line_lang:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
-                    para_text += content  # 中文语境下，content间不需要空格分隔
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs:  # 遇到一些一个字一个span的文档，这种单字语言判断不准，需要用整行文本判断
+                    para_text += content  # 中文/日语/韩文语境下，content间不需要空格分隔
+                elif line_lang == 'en':
+                    # 如果是前一行带有-连字符，那么末尾不应该加空格
+                    if __is_hyphen_at_line_end(content):
+                        para_text += content[:-1]
+                    else:
+                        para_text += content + ' '
                 else:
-                    para_text += content + ' '  # 英文语境下 content间需要空格分隔
+                    para_text += content + ' '  # 西方文本语境下 content间需要空格分隔
     return para_text
@@ -191,18 +241,18 @@ def para_to_standard_format(para, img_buket_path):
             for span in line['spans']:
                 language = ''
                 span_type = span.get('type')
-                content = ""
+                content = ''
                 if span_type == ContentType.Text:
                     content = span['content']
                     language = detect_lang(content)
                     if language == 'en':  # 只对英文长词进行分词处理，中文分词会丢失文本
-                        content = ocr_escape_special_markdown_char(split_long_words(content))
+                        content = ocr_escape_special_markdown_char(
+                            split_long_words(content))
                     else:
                         content = ocr_escape_special_markdown_char(content)
                 elif span_type == ContentType.InlineEquation:
                     content = f"${span['content']}$"
                     inline_equation_num += 1
                 if language == 'en':  # 英文语境下 content间需要空格分隔
                     para_text += content + ' '
                 else:  # 中文语境下，content间不需要空格分隔
@@ -210,7 +260,7 @@ def para_to_standard_format(para, img_buket_path):
         para_content = {
             'type': 'text',
             'text': para_text,
-            'inline_equation_num': inline_equation_num
+            'inline_equation_num': inline_equation_num,
         }
     return para_content
@@ -221,41 +271,41 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
-            'page_idx': page_idx
+            'page_idx': page_idx,
         }
     elif para_type == BlockType.Title:
         para_content = {
             'type': 'text',
             'text': merge_para_with_text(para_block),
             'text_level': 1,
-            'page_idx': page_idx
+            'page_idx': page_idx,
         }
     elif para_type == BlockType.InterlineEquation:
         para_content = {
             'type': 'equation',
             'text': merge_para_with_text(para_block),
-            'text_format': "latex",
-            'page_idx': page_idx
+            'text_format': 'latex',
+            'page_idx': page_idx,
         }
     elif para_type == BlockType.Image:
-        para_content = {
-            'type': 'image',
-            'page_idx': page_idx
-        }
+        para_content = {'type': 'image', 'page_idx': page_idx}
         for block in para_block['blocks']:
             if block['type'] == BlockType.ImageBody:
-                para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
+                para_content['img_path'] = join_path(
+                    img_buket_path,
+                    block['lines'][0]['spans'][0]['image_path'])
             if block['type'] == BlockType.ImageCaption:
                 para_content['img_caption'] = merge_para_with_text(block)
+            if block['type'] == BlockType.ImageFootnote:
+                para_content['img_footnote'] = merge_para_with_text(block)
     elif para_type == BlockType.Table:
-        para_content = {
-            'type': 'table',
-            'page_idx': page_idx
-        }
+        para_content = {'type': 'table', 'page_idx': page_idx}
         for block in para_block['blocks']:
             if block['type'] == BlockType.TableBody:
                 if block["lines"][0]["spans"][0].get('latex', ''):
                     para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
+                elif block["lines"][0]["spans"][0].get('html', ''):
+                    para_content['table_body'] = f"\n\n{block['lines'][0]['spans'][0]['html']}\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)
@@ -268,17 +318,18 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
 def make_standard_format_with_para(pdf_info_dict: list, img_buket_path: str):
     content_list = []
     for page_info in pdf_info_dict:
-        paras_of_layout = page_info.get("para_blocks")
+        paras_of_layout = page_info.get('para_blocks')
         if not paras_of_layout:
             continue
         for para_block in paras_of_layout:
-            para_content = para_to_standard_format_v2(para_block, img_buket_path)
+            para_content = para_to_standard_format_v2(para_block,
+                                                      img_buket_path)
             content_list.append(para_content)
     return content_list
 def line_to_standard_format(line, img_buket_path):
-    line_text = ""
+    line_text = ''
     inline_equation_num = 0
     for span in line['spans']:
         if not span.get('content'):
@@ -288,13 +339,15 @@ def line_to_standard_format(line, img_buket_path):
                 if span['type'] == ContentType.Image:
                     content = {
                         'type': 'image',
-                        'img_path': join_path(img_buket_path, span['image_path'])
+                        'img_path': join_path(img_buket_path,
+                                              span['image_path']),
                     }
                     return content
                 elif span['type'] == ContentType.Table:
                     content = {
                         'type': 'table',
-                        'img_path': join_path(img_buket_path, span['image_path'])
+                        'img_path': join_path(img_buket_path,
+                                              span['image_path']),
                     }
                     return content
         else:
@@ -302,36 +355,33 @@ def line_to_standard_format(line, img_buket_path):
                 interline_equation = span['content']
                 content = {
                     'type': 'equation',
-                    'latex': f"$$\n{interline_equation}\n$$"
+                    'latex': f'$$\n{interline_equation}\n$$'
                 }
                 return content
             elif span['type'] == ContentType.InlineEquation:
                 inline_equation = span['content']
-                line_text += f"${inline_equation}$"
+                line_text += f'${inline_equation}$'
                 inline_equation_num += 1
             elif span['type'] == ContentType.Text:
-                text_content = ocr_escape_special_markdown_char(span['content'])  # 转义特殊符号
+                text_content = ocr_escape_special_markdown_char(
+                    span['content'])  # 转义特殊符号
                 line_text += text_content
     content = {
         'type': 'text',
         'text': line_text,
-        'inline_equation_num': inline_equation_num
+        'inline_equation_num': inline_equation_num,
     }
     return content
 def ocr_mk_mm_standard_format(pdf_info_dict: list):
-    """
-    content_list
-    type         string      image/text/table/equation(行间的单独拿出来，行内的和text合并)
-    latex        string      latex文本字段。
-    text         string      纯文本格式的文本数据。
-    md           string      markdown格式的文本数据。
-    img_path     string      s3://full/path/to/img.jpg
-    """
+    """content_list type         string
+    image/text/table/equation(行间的单独拿出来，行内的和text合并) latex        string
+    latex文本字段。 text         string      纯文本格式的文本数据。 md           string
+    markdown格式的文本数据。 img_path     string      s3://full/path/to/img.jpg."""
     content_list = []
     for page_info in pdf_info_dict:
-        blocks = page_info.get("preproc_blocks")
+        blocks = page_info.get('preproc_blocks')
         if not blocks:
             continue
         for block in blocks:
@@ -341,34 +391,42 @@ def ocr_mk_mm_standard_format(pdf_info_dict: list):
     return content_list
-def union_make(pdf_info_dict: list, make_mode: str, drop_mode: str, img_buket_path: str = ""):
+def union_make(pdf_info_dict: list,
+               make_mode: str,
+               drop_mode: str,
+               img_buket_path: str = ''):
     output_content = []
     for page_info in pdf_info_dict:
-        if page_info.get("need_drop", False):
-            drop_reason = page_info.get("drop_reason")
+        if page_info.get('need_drop', False):
+            drop_reason = page_info.get('drop_reason')
             if drop_mode == DropMode.NONE:
                 pass
             elif drop_mode == DropMode.WHOLE_PDF:
-                raise Exception(f"drop_mode is {DropMode.WHOLE_PDF} , drop_reason is {drop_reason}")
+                raise Exception((f'drop_mode is {DropMode.WHOLE_PDF} ,'
+                                 f'drop_reason is {drop_reason}'))
             elif drop_mode == DropMode.SINGLE_PAGE:
-                logger.warning(f"drop_mode is {DropMode.SINGLE_PAGE} , drop_reason is {drop_reason}")
+                logger.warning((f'drop_mode is {DropMode.SINGLE_PAGE} ,'
+                                f'drop_reason is {drop_reason}'))
                 continue
             else:
-                raise Exception(f"drop_mode can not be null")
+                raise Exception('drop_mode can not be null')
-        paras_of_layout = page_info.get("para_blocks")
-        page_idx = page_info.get("page_idx")
+        paras_of_layout = page_info.get('para_blocks')
+        page_idx = page_info.get('page_idx')
         if not paras_of_layout:
             continue
         if make_mode == MakeMode.MM_MD:
-            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "mm", img_buket_path)
+            page_markdown = ocr_mk_markdown_with_para_core_v2(
+                paras_of_layout, 'mm', img_buket_path)
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.NLP_MD:
-            page_markdown = ocr_mk_markdown_with_para_core_v2(paras_of_layout, "nlp")
+            page_markdown = ocr_mk_markdown_with_para_core_v2(
+                paras_of_layout, 'nlp')
             output_content.extend(page_markdown)
         elif make_mode == MakeMode.STANDARD_FORMAT:
             for para_block in paras_of_layout:
-                para_content = para_to_standard_format_v2(para_block, img_buket_path, page_idx)
+                para_content = para_to_standard_format_v2(
+                    para_block, img_buket_path, page_idx)
                 output_content.append(para_content)
     if make_mode in [MakeMode.MM_MD, MakeMode.NLP_MD]:
         return '\n\n'.join(output_content)

magic_pdf/integrations/__init__.py ADDED Viewed

File without changes

magic_pdf/integrations/rag/__init__.py ADDED Viewed

File without changes

magic_pdf/integrations/rag/api.py ADDED Viewed

@@ -0,0 +1,82 @@
+import os
+from pathlib import Path
+from loguru import logger
+from magic_pdf.integrations.rag.type import (ElementRelation, LayoutElements,
+                                             Node)
+from magic_pdf.integrations.rag.utils import inference
+class RagPageReader:
+    def __init__(self, pagedata: LayoutElements):
+        self.o = [
+            Node(
+                category_type=v.category_type,
+                text=v.text,
+                image_path=v.image_path,
+                anno_id=v.anno_id,
+                latex=v.latex,
+                html=v.html,
+            ) for v in pagedata.layout_dets
+        ]
+        self.pagedata = pagedata
+    def __iter__(self):
+        return iter(self.o)
+    def get_rel_map(self) -> list[ElementRelation]:
+        return self.pagedata.extra.element_relation
+class RagDocumentReader:
+    def __init__(self, ragdata: list[LayoutElements]):
+        self.o = [RagPageReader(v) for v in ragdata]
+    def __iter__(self):
+        return iter(self.o)
+class DataReader:
+    def __init__(self, path_or_directory: str, method: str, output_dir: str):
+        self.path_or_directory = path_or_directory
+        self.method = method
+        self.output_dir = output_dir
+        self.pdfs = []
+        if os.path.isdir(path_or_directory):
+            for doc_path in Path(path_or_directory).glob('*.pdf'):
+                self.pdfs.append(doc_path)
+        else:
+            assert path_or_directory.endswith('.pdf')
+            self.pdfs.append(Path(path_or_directory))
+    def get_documents_count(self) -> int:
+        """Returns the number of documents in the directory."""
+        return len(self.pdfs)
+    def get_document_result(self, idx: int) -> RagDocumentReader | None:
+        """
+        Args:
+            idx (int): the index of documents under the
+                directory path_or_directory
+        Returns:
+            RagDocumentReader | None: RagDocumentReader is an iterable object,
+            more details @RagDocumentReader
+        """
+        if idx >= self.get_documents_count() or idx < 0:
+            logger.error(f'invalid idx: {idx}')
+            return None
+        res = inference(str(self.pdfs[idx]), self.output_dir, self.method)
+        if res is None:
+            logger.warning(f'failed to inference pdf {self.pdfs[idx]}')
+            return None
+        return RagDocumentReader(res)
+    def get_document_filename(self, idx: int) -> Path:
+        """get the filename of the document."""
+        return self.pdfs[idx]

magic_pdf/integrations/rag/type.py ADDED Viewed

@@ -0,0 +1,82 @@
+from enum import Enum
+from pydantic import BaseModel, Field
+# rag
+class CategoryType(Enum):  # py310 not support StrEnum
+    text = 'text'
+    title = 'title'
+    interline_equation = 'interline_equation'
+    image = 'image'
+    image_body = 'image_body'
+    image_caption = 'image_caption'
+    table = 'table'
+    table_body = 'table_body'
+    table_caption = 'table_caption'
+    table_footnote = 'table_footnote'
+class ElementRelType(Enum):
+    sibling = 'sibling'
+class PageInfo(BaseModel):
+    page_no: int = Field(description='the index of page, start from zero',
+                         ge=0)
+    height: int = Field(description='the height of page', gt=0)
+    width: int = Field(description='the width of page', ge=0)
+    image_path: str | None = Field(description='the image of this page',
+                                   default=None)
+class ContentObject(BaseModel):
+    category_type: CategoryType = Field(description='类别')
+    poly: list[float] = Field(
+        description=('Coordinates, need to convert back to PDF coordinates,'
+                     ' order is top-left, top-right, bottom-right, bottom-left'
+                     ' x,y coordinates'))
+    ignore: bool = Field(description='whether ignore this object',
+                         default=False)
+    text: str | None = Field(description='text content of the object',
+                             default=None)
+    image_path: str | None = Field(description='path of embedded image',
+                                   default=None)
+    order: int = Field(description='the order of this object within a page',
+                       default=-1)
+    anno_id: int = Field(description='unique id', default=-1)
+    latex: str | None = Field(description='latex result', default=None)
+    html: str | None = Field(description='html result', default=None)
+class ElementRelation(BaseModel):
+    source_anno_id: int = Field(description='unique id of the source object',
+                                default=-1)
+    target_anno_id: int = Field(description='unique id of the target object',
+                                default=-1)
+    relation: ElementRelType = Field(
+        description='the relation between source and target element')
+class LayoutElementsExtra(BaseModel):
+    element_relation: list[ElementRelation] = Field(
+        description='the relation between source and target element')
+class LayoutElements(BaseModel):
+    layout_dets: list[ContentObject] = Field(
+        description='layout element details')
+    page_info: PageInfo = Field(description='page info')
+    extra: LayoutElementsExtra = Field(description='extra information')
+# iter data format
+class Node(BaseModel):
+    category_type: CategoryType = Field(description='类别')
+    text: str | None = Field(description='text content of the object',
+                             default=None)
+    image_path: str | None = Field(description='path of embedded image',
+                                   default=None)
+    anno_id: int = Field(description='unique id', default=-1)
+    latex: str | None = Field(description='latex result', default=None)
+    html: str | None = Field(description='html result', default=None)

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl

magic-pdf 0.7.0b1__py3-none-any.whl → 0.8.0__py3-none-any.whl