PyPI - magic-pdf - Versions diffs - 0.6.2b1__py3-none-any.whl → 0.7.0b1__py3-none-any.whl - Mend

magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

magic_pdf/dict2md/ocr_mkcontent.py +10 -3
magic_pdf/libs/Constants.py +4 -1
magic_pdf/libs/config_reader.py +10 -10
magic_pdf/libs/draw_bbox.py +66 -1
magic_pdf/libs/ocr_content_type.py +14 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +10 -4
magic_pdf/model/magic_model.py +4 -0
magic_pdf/model/pdf_extract_kit.py +83 -39
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
magic_pdf/resources/model_config/model_configs.yaml +4 -0
magic_pdf/rw/AbsReaderWriter.py +1 -18
magic_pdf/rw/DiskReaderWriter.py +32 -24
magic_pdf/rw/S3ReaderWriter.py +83 -48
magic_pdf/tools/cli.py +79 -0
magic_pdf/tools/cli_dev.py +155 -0
magic_pdf/tools/common.py +122 -0
magic_pdf-0.7.0b1.dist-info/METADATA +421 -0
{magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/RECORD +25 -27
{magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/WHEEL +1 -1
magic_pdf-0.7.0b1.dist-info/entry_points.txt +3 -0
magic_pdf/cli/magicpdf.py +0 -359
magic_pdf/pdf_parse_for_train.py +0 -685
magic_pdf/train_utils/convert_to_train_format.py +0 -65
magic_pdf/train_utils/extract_caption.py +0 -59
magic_pdf/train_utils/remove_footer_header.py +0 -159
magic_pdf/train_utils/vis_utils.py +0 -327
magic_pdf-0.6.2b1.dist-info/METADATA +0 -344
magic_pdf-0.6.2b1.dist-info/entry_points.txt +0 -2
/magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
/magic_pdf/{train_utils → tools}/__init__.py +0 -0
{magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.6.2b1.dist-info → magic_pdf-0.7.0b1.dist-info}/top_level.txt +0 -0

magic_pdf/dict2md/ocr_mkcontent.py CHANGED Viewed

@@ -120,15 +120,20 @@ def ocr_mk_markdown_with_para_core_v2(paras_of_layout, mode, img_buket_path=""):
             if mode == 'nlp':
                 continue
             elif mode == 'mm':
+                table_caption = ''
                 for block in para_block['blocks']:  # 1st.拼table_caption
                     if block['type'] == BlockType.TableCaption:
-                        para_text += merge_para_with_text(block)
+                        table_caption = merge_para_with_text(block)
                 for block in para_block['blocks']:  # 2nd.拼table_body
                     if block['type'] == BlockType.TableBody:
                         for line in block['lines']:
                             for span in line['spans']:
                                 if span['type'] == ContentType.Table:
-                                    para_text += f"\n![]({join_path(img_buket_path, span['image_path'])})  \n"
+                                    # if processed by table model
+                                    if span.get('latex', ''):
+                                        para_text += f"\n\n$\n {span['latex']}\n$\n\n"
+                                    else:
+                                        para_text += f"\n![{table_caption}]({join_path(img_buket_path, span['image_path'])})  \n"
                 for block in para_block['blocks']:  # 3rd.拼table_footnote
                     if block['type'] == BlockType.TableFootnote:
                         para_text += merge_para_with_text(block)
@@ -163,7 +168,7 @@ def merge_para_with_text(para_block):
                 else:
                     content = ocr_escape_special_markdown_char(content)
             elif span_type == ContentType.InlineEquation:
-                content = f"${span['content']}$"
+                content = f" ${span['content']}$ "
             elif span_type == ContentType.InterlineEquation:
                 content = f"\n$$\n{span['content']}\n$$\n"
@@ -249,6 +254,8 @@ def para_to_standard_format_v2(para_block, img_buket_path, page_idx):
         }
         for block in para_block['blocks']:
             if block['type'] == BlockType.TableBody:
+                if block["lines"][0]["spans"][0].get('latex', ''):
+                    para_content['table_body'] = f"\n\n$\n {block['lines'][0]['spans'][0]['latex']}\n$\n\n"
                 para_content['img_path'] = join_path(img_buket_path, block["lines"][0]["spans"][0]['image_path'])
             if block['type'] == BlockType.TableCaption:
                 para_content['table_caption'] = merge_para_with_text(block)

magic_pdf/libs/Constants.py CHANGED Viewed

@@ -8,4 +8,7 @@ CROSS_PAGE = "cross_page"
 block维度自定义字段
 """
 # block中lines是否被删除
-LINES_DELETED = "lines_deleted"
+LINES_DELETED = "lines_deleted"
+# table recognition max time default value
+TABLE_MAX_TIME_VALUE = 400

magic_pdf/libs/config_reader.py CHANGED Viewed

@@ -57,16 +57,6 @@ def get_bucket_name(path):
     return bucket
-def get_local_dir():
-    config = read_config()
-    local_dir = config.get("temp-output-dir")
-    if local_dir is None:
-        logger.warning(f"'temp-output-dir' not found in {CONFIG_FILE_NAME}, use '/tmp' as default")
-        return "/tmp"
-    else:
-        return local_dir
 def get_local_models_dir():
     config = read_config()
     models_dir = config.get("models-dir")
@@ -87,5 +77,15 @@ def get_device():
         return device
+def get_table_recog_config():
+    config = read_config()
+    table_config = config.get("table-config")
+    if table_config is None:
+        logger.warning(f"'table-config' not found in {CONFIG_FILE_NAME}, use 'False' as default")
+        return json.loads('{"is_table_recog_enable": false, "max_time": 400}')
+    else:
+        return table_config
 if __name__ == "__main__":
     ak, sk, endpoint = get_s3_config("llm-raw")

magic_pdf/libs/draw_bbox.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from magic_pdf.libs.Constants import CROSS_PAGE
 from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId
+from magic_pdf.model.magic_model import MagicModel
 def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
@@ -225,3 +226,67 @@ def draw_span_bbox(pdf_info, pdf_bytes, out_path):
     # Save the PDF
     pdf_docs.save(f"{out_path}/spans.pdf")
+def drow_model_bbox(model_list: list, pdf_bytes, out_path):
+    dropped_bbox_list = []
+    tables_body_list, tables_caption_list, tables_footnote_list = [], [], []
+    imgs_body_list, imgs_caption_list = [], []
+    titles_list = []
+    texts_list = []
+    interequations_list = []
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    magic_model = MagicModel(model_list, pdf_docs)
+    for i in range(len(model_list)):
+        page_dropped_list = []
+        tables_body, tables_caption, tables_footnote = [], [], []
+        imgs_body, imgs_caption = [], []
+        titles = []
+        texts = []
+        interequations = []
+        page_info = magic_model.get_model_list(i)
+        layout_dets = page_info["layout_dets"]
+        for layout_det in layout_dets:
+            bbox = layout_det["bbox"]
+            if layout_det["category_id"] == CategoryId.Text:
+                texts.append(bbox)
+            elif layout_det["category_id"] == CategoryId.Title:
+                titles.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableBody:
+                tables_body.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableCaption:
+                tables_caption.append(bbox)
+            elif layout_det["category_id"] == CategoryId.TableFootnote:
+                tables_footnote.append(bbox)
+            elif layout_det["category_id"] == CategoryId.ImageBody:
+                imgs_body.append(bbox)
+            elif layout_det["category_id"] == CategoryId.ImageCaption:
+                imgs_caption.append(bbox)
+            elif layout_det["category_id"] == CategoryId.InterlineEquation_YOLO:
+                interequations.append(bbox)
+            elif layout_det["category_id"] == CategoryId.Abandon:
+                page_dropped_list.append(bbox)
+        tables_body_list.append(tables_body)
+        tables_caption_list.append(tables_caption)
+        tables_footnote_list.append(tables_footnote)
+        imgs_body_list.append(imgs_body)
+        imgs_caption_list.append(imgs_caption)
+        titles_list.append(titles)
+        texts_list.append(texts)
+        interequations_list.append(interequations)
+        dropped_bbox_list.append(page_dropped_list)
+    for i, page in enumerate(pdf_docs):
+        draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True) # color !
+        draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True)
+        draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True)
+        draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True)
+        draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True)
+        draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True)
+        draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True)
+        draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True)
+        draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True)
+    # Save the PDF
+    pdf_docs.save(f"{out_path}/model.pdf")

magic_pdf/libs/ocr_content_type.py CHANGED Viewed

@@ -19,3 +19,17 @@ class BlockType:
     Footnote = "footnote"
     Discarded = "discarded"
+class CategoryId:
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15

magic_pdf/libs/version.py CHANGED Viewed

	@@ -1 +1 @@
1	- __version__ = "0.6.2b1"
1	+ __version__ = "0.7.0b1"

magic_pdf/model/doc_analyze_by_custom_model.py CHANGED Viewed

@@ -4,7 +4,7 @@ import fitz
 import numpy as np
 from loguru import logger
-from magic_pdf.libs.config_reader import get_local_models_dir, get_device
+from magic_pdf.libs.config_reader import get_local_models_dir, get_device, get_table_recog_config
 from magic_pdf.model.model_list import MODEL
 import magic_pdf.model as model_config
@@ -37,8 +37,8 @@ def load_images_from_pdf(pdf_bytes: bytes, dpi=200) -> list:
             mat = fitz.Matrix(dpi / 72, dpi / 72)
             pm = page.get_pixmap(matrix=mat, alpha=False)
-            # if width or height > 3000 pixels, don't enlarge the image
-            if pm.width > 3000 or pm.height > 3000:
+            # If the width or height exceeds 9000 after scaling, do not scale further.
+            if pm.width > 9000 or pm.height > 9000:
                 pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
             img = Image.frombytes("RGB", (pm.width, pm.height), pm.samples)
@@ -84,7 +84,13 @@ def custom_model_init(ocr: bool = False, show_log: bool = False):
             # 从配置文件读取model-dir和device
             local_models_dir = get_local_models_dir()
             device = get_device()
-            custom_model = CustomPEKModel(ocr=ocr, show_log=show_log, models_dir=local_models_dir, device=device)
+            table_config = get_table_recog_config()
+            model_input = {"ocr": ocr,
+                           "show_log": show_log,
+                           "models_dir": local_models_dir,
+                           "device": device,
+                           "table_config": table_config}
+            custom_model = CustomPEKModel(**model_input)
         else:
             logger.error("Not allow model_name!")
             exit(1)

magic_pdf/model/magic_model.py CHANGED Viewed

@@ -560,6 +560,10 @@ class MagicModel:
                 if category_id == 3:
                     span["type"] = ContentType.Image
                 elif category_id == 5:
+                    # 获取table模型结果
+                    latex = layout_det.get("latex", None)
+                    if latex:
+                        span["latex"] = latex
                     span["type"] = ContentType.Table
                 elif category_id == 13:
                     span["content"] = layout_det["latex"]

magic_pdf/model/pdf_extract_kit.py CHANGED Viewed

@@ -2,6 +2,8 @@ from loguru import logger
 import os
 import time
+from magic_pdf.libs.Constants import TABLE_MAX_TIME_VALUE
 os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1'  # 禁止albumentations检查更新
 try:
     import cv2
@@ -10,6 +12,7 @@ try:
     import numpy as np
     import torch
     import torchtext
     if torchtext.__version__ >= "0.18.0":
         torchtext.disable_torchtext_deprecation_warning()
     from PIL import Image
@@ -24,12 +27,18 @@ except ImportError as e:
     logger.exception(e)
     logger.error(
         'Required dependency not installed, please install by \n'
-        '"pip install magic-pdf[full] detectron2 --extra-index-url https://myhloli.github.io/wheels/"')
+        '"pip install magic-pdf[full] --extra-index-url https://myhloli.github.io/wheels/"')
     exit(1)
 from magic_pdf.model.pek_sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor
 from magic_pdf.model.pek_sub_modules.post_process import get_croped_image, latex_rm_whitespace
 from magic_pdf.model.pek_sub_modules.self_modify import ModifiedPaddleOCR
+from magic_pdf.model.pek_sub_modules.structeqtable.StructTableModel import StructTableModel
+def table_model_init(model_path, max_time, _device_='cpu'):
+    table_model = StructTableModel(model_path, max_time=max_time, device=_device_)
+    return table_model
 def mfd_model_init(weight):
@@ -95,10 +104,13 @@ class CustomPEKModel:
         # 初始化解析配置
         self.apply_layout = kwargs.get("apply_layout", self.configs["config"]["layout"])
         self.apply_formula = kwargs.get("apply_formula", self.configs["config"]["formula"])
+        self.table_config = kwargs.get("table_config", self.configs["config"]["table_config"])
+        self.apply_table = self.table_config.get("is_table_recog_enable", False)
+        self.table_max_time = self.table_config.get("max_time", TABLE_MAX_TIME_VALUE)
         self.apply_ocr = ocr
         logger.info(
-            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}".format(
-                self.apply_layout, self.apply_formula, self.apply_ocr
+            "DocAnalysis init, this may take some times. apply_layout: {}, apply_formula: {}, apply_ocr: {}, apply_table: {}".format(
+                self.apply_layout, self.apply_formula, self.apply_ocr, self.apply_table
             )
         )
         assert self.apply_layout, "DocAnalysis must contain layout model."
@@ -129,6 +141,10 @@ class CustomPEKModel:
         if self.apply_ocr:
             self.ocr_model = ModifiedPaddleOCR(show_log=show_log)
+        # init structeqtable
+        if self.apply_table:
+            self.table_model = table_model_init(str(os.path.join(models_dir, self.configs["weights"]["table"])),
+                                                max_time = self.table_max_time, _device_=self.device)
         logger.info('DocAnalysis init done!')
     def __call__(self, image):
@@ -172,50 +188,56 @@ class CustomPEKModel:
             mfr_cost = round(time.time() - mfr_start, 2)
             logger.info(f"formula nums: {len(mf_image_list)}, mfr time: {mfr_cost}")
+        # Select regions for OCR / formula regions / table regions
+        ocr_res_list = []
+        table_res_list = []
+        single_page_mfdetrec_res = []
+        for res in layout_res:
+            if int(res['category_id']) in [13, 14]:
+                single_page_mfdetrec_res.append({
+                    "bbox": [int(res['poly'][0]), int(res['poly'][1]),
+                             int(res['poly'][4]), int(res['poly'][5])],
+                })
+            elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
+                ocr_res_list.append(res)
+            elif int(res['category_id']) in [5]:
+                table_res_list.append(res)
+        #  Unified crop img logic
+        def crop_img(input_res, input_pil_img, crop_paste_x=0, crop_paste_y=0):
+            crop_xmin, crop_ymin = int(input_res['poly'][0]), int(input_res['poly'][1])
+            crop_xmax, crop_ymax = int(input_res['poly'][4]), int(input_res['poly'][5])
+            # Create a white background with an additional width and height of 50
+            crop_new_width = crop_xmax - crop_xmin + crop_paste_x * 2
+            crop_new_height = crop_ymax - crop_ymin + crop_paste_y * 2
+            return_image = Image.new('RGB', (crop_new_width, crop_new_height), 'white')
+            # Crop image
+            crop_box = (crop_xmin, crop_ymin, crop_xmax, crop_ymax)
+            cropped_img = input_pil_img.crop(crop_box)
+            return_image.paste(cropped_img, (crop_paste_x, crop_paste_y))
+            return_list = [crop_paste_x, crop_paste_y, crop_xmin, crop_ymin, crop_xmax, crop_ymax, crop_new_width, crop_new_height]
+            return return_image, return_list
+        pil_img = Image.fromarray(image)
         # ocr识别
         if self.apply_ocr:
             ocr_start = time.time()
-            pil_img = Image.fromarray(image)
-            # 筛选出需要OCR的区域和公式区域
-            ocr_res_list = []
-            single_page_mfdetrec_res = []
-            for res in layout_res:
-                if int(res['category_id']) in [13, 14]:
-                    single_page_mfdetrec_res.append({
-                        "bbox": [int(res['poly'][0]), int(res['poly'][1]),
-                                 int(res['poly'][4]), int(res['poly'][5])],
-                    })
-                elif int(res['category_id']) in [0, 1, 2, 4, 6, 7]:
-                    ocr_res_list.append(res)
-            # 对每一个需OCR处理的区域进行处理
+            # Process each area that requires OCR processing
             for res in ocr_res_list:
-                xmin, ymin = int(res['poly'][0]), int(res['poly'][1])
-                xmax, ymax = int(res['poly'][4]), int(res['poly'][5])
-                paste_x = 50
-                paste_y = 50
-                # 创建一个宽高各多50的白色背景
-                new_width = xmax - xmin + paste_x * 2
-                new_height = ymax - ymin + paste_y * 2
-                new_image = Image.new('RGB', (new_width, new_height), 'white')
-                # 裁剪图像
-                crop_box = (xmin, ymin, xmax, ymax)
-                cropped_img = pil_img.crop(crop_box)
-                new_image.paste(cropped_img, (paste_x, paste_y))
-                # 调整公式区域坐标
+                new_image, useful_list = crop_img(res, pil_img, crop_paste_x=50, crop_paste_y=50)
+                paste_x, paste_y, xmin, ymin, xmax, ymax, new_width, new_height = useful_list
+                # Adjust the coordinates of the formula area
                 adjusted_mfdetrec_res = []
                 for mf_res in single_page_mfdetrec_res:
                     mf_xmin, mf_ymin, mf_xmax, mf_ymax = mf_res["bbox"]
-                    # 将公式区域坐标调整为相对于裁剪区域的坐标
+                    # Adjust the coordinates of the formula area to the coordinates relative to the cropping area
                     x0 = mf_xmin - xmin + paste_x
                     y0 = mf_ymin - ymin + paste_y
                     x1 = mf_xmax - xmin + paste_x
                     y1 = mf_ymax - ymin + paste_y
-                    # 过滤在图外的公式块
+                    # Filter formula blocks outside the graph
                     if any([x1 < 0, y1 < 0]) or any([x0 > new_width, y0 > new_height]):
                         continue
                     else:
@@ -223,17 +245,17 @@ class CustomPEKModel:
                             "bbox": [x0, y0, x1, y1],
                         })
-                # OCR识别
+                # OCR recognition
                 new_image = cv2.cvtColor(np.asarray(new_image), cv2.COLOR_RGB2BGR)
                 ocr_res = self.ocr_model.ocr(new_image, mfd_res=adjusted_mfdetrec_res)[0]
-                # 整合结果
+                # Integration results
                 if ocr_res:
                     for box_ocr_res in ocr_res:
                         p1, p2, p3, p4 = box_ocr_res[0]
                         text, score = box_ocr_res[1]
-                        # 将坐标转换回原图坐标系
+                        # Convert the coordinates back to the original coordinate system
                         p1 = [p1[0] - paste_x + xmin, p1[1] - paste_y + ymin]
                         p2 = [p2[0] - paste_x + xmin, p2[1] - paste_y + ymin]
                         p3 = [p3[0] - paste_x + xmin, p3[1] - paste_y + ymin]
@@ -249,4 +271,26 @@ class CustomPEKModel:
             ocr_cost = round(time.time() - ocr_start, 2)
             logger.info(f"ocr cost: {ocr_cost}")
+        # 表格识别 table recognition
+        if self.apply_table:
+            table_start = time.time()
+            for res in table_res_list:
+                new_image, _ = crop_img(res, pil_img)
+                single_table_start_time = time.time()
+                logger.info("------------------table recognition processing begins-----------------")
+                with torch.no_grad():
+                    latex_code = self.table_model.image2latex(new_image)[0]
+                run_time = time.time() - single_table_start_time
+                logger.info(f"------------table recognition processing ends within {run_time}s-----")
+                if run_time > self.table_max_time:
+                    logger.warning(f"------------table recognition processing exceeds max time {self.table_max_time}s----------")
+                # 判断是否返回正常
+                expected_ending = latex_code.strip().endswith('end{tabular}') or latex_code.strip().endswith('end{table}')
+                if latex_code and expected_ending:
+                    res["latex"] = latex_code
+                else:
+                    logger.warning(f"------------table recognition processing fails----------")
+            table_cost = round(time.time() - table_start, 2)
+            logger.info(f"table cost: {table_cost}")
         return layout_res

magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py ADDED Viewed

@@ -0,0 +1,22 @@
+from struct_eqtable.model import StructTable
+from pypandoc import convert_text
+class StructTableModel:
+    def __init__(self, model_path, max_new_tokens=2048, max_time=400, device = 'cpu'):
+        # init
+        self.model_path = model_path
+        self.max_new_tokens = max_new_tokens # maximum output tokens length
+        self.max_time = max_time # timeout for processing in seconds
+        if device == 'cuda':
+            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time).cuda()
+        else:
+            self.model = StructTable(self.model_path, self.max_new_tokens, self.max_time)
+    def image2latex(self, image) -> str:
+        #
+        table_latex = self.model.forward(image)
+        return table_latex
+    def image2html(self, image) -> str:
+        table_latex = self.image2latex(image)
+        table_html = convert_text(table_latex, 'html', format='latex')
+        return table_html

magic_pdf/resources/model_config/model_configs.yaml CHANGED Viewed

@@ -2,8 +2,12 @@ config:
   device: cpu
   layout: True
   formula: True
+  table_config:
+    is_table_recog_enable: False
+    max_time: 400
 weights:
   layout: Layout/model_final.pth
   mfd: MFD/weights.pt
   mfr: MFR/UniMERNet
+  table: TabRec/StructEqTable

magic_pdf/rw/AbsReaderWriter.py CHANGED Viewed

@@ -2,33 +2,16 @@ from abc import ABC, abstractmethod
 class AbsReaderWriter(ABC):
-    """
-    同时支持二进制和文本读写的抽象类
-    """
     MODE_TXT = "text"
     MODE_BIN = "binary"
-    def __init__(self, parent_path):
-        # 初始化代码可以在这里添加，如果需要的话
-        self.parent_path = parent_path # 对于本地目录是父目录，对于s3是会写到这个path下。
     @abstractmethod
     def read(self, path: str, mode=MODE_TXT):
-        """
-        无论对于本地还是s3的路径，检查如果path是绝对路径，那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
-        """
         raise NotImplementedError
     @abstractmethod
     def write(self, content: str, path: str, mode=MODE_TXT):
-        """
-        无论对于本地还是s3的路径，检查如果path是绝对路径，那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
-        """
         raise NotImplementedError
     @abstractmethod
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding='utf-8'):
-        """
-        无论对于本地还是s3的路径，检查如果path是绝对路径，那么就不再 拼接parent_path, 如果是相对路径就拼接parent_path
-        """
+    def read_offset(self, path: str, offset=0, limit=None) -> bytes:
         raise NotImplementedError

magic_pdf/rw/DiskReaderWriter.py CHANGED Viewed

@@ -3,34 +3,29 @@ from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
 from loguru import logger
-MODE_TXT = "text"
-MODE_BIN = "binary"
 class DiskReaderWriter(AbsReaderWriter):
     def __init__(self, parent_path, encoding="utf-8"):
         self.path = parent_path
         self.encoding = encoding
-    def read(self, path, mode=MODE_TXT):
+    def read(self, path, mode=AbsReaderWriter.MODE_TXT):
         if os.path.isabs(path):
             abspath = path
         else:
             abspath = os.path.join(self.path, path)
         if not os.path.exists(abspath):
-            logger.error(f"文件 {abspath} 不存在")
-            raise Exception(f"文件 {abspath} 不存在")
-        if mode == MODE_TXT:
+            logger.error(f"file {abspath} not exists")
+            raise Exception(f"file {abspath} no exists")
+        if mode == AbsReaderWriter.MODE_TXT:
             with open(abspath, "r", encoding=self.encoding) as f:
                 return f.read()
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             with open(abspath, "rb") as f:
                 return f.read()
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
-    def write(self, content, path, mode=MODE_TXT):
+    def write(self, content, path, mode=AbsReaderWriter.MODE_TXT):
         if os.path.isabs(path):
             abspath = path
         else:
@@ -38,29 +33,42 @@ class DiskReaderWriter(AbsReaderWriter):
         directory_path = os.path.dirname(abspath)
         if not os.path.exists(directory_path):
             os.makedirs(directory_path)
-        if mode == MODE_TXT:
+        if mode == AbsReaderWriter.MODE_TXT:
             with open(abspath, "w", encoding=self.encoding, errors="replace") as f:
                 f.write(content)
-        elif mode == MODE_BIN:
+        elif mode == AbsReaderWriter.MODE_BIN:
             with open(abspath, "wb") as f:
                 f.write(content)
         else:
             raise ValueError("Invalid mode. Use 'text' or 'binary'.")
-    def read_jsonl(self, path: str, byte_start=0, byte_end=None, encoding="utf-8"):
-        return self.read(path)
+    def read_offset(self, path: str, offset=0, limit=None):
+        abspath = path
+        if not os.path.isabs(path):
+            abspath = os.path.join(self.path, path)
+        with open(abspath, "rb") as f:
+            f.seek(offset)
+            return f.read(limit)
-# 使用示例
 if __name__ == "__main__":
-    file_path = "io/test/example.txt"
-    drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
+    if 0:
+        file_path = "io/test/example.txt"
+        drw = DiskReaderWriter("D:\projects\papayfork\Magic-PDF\magic_pdf")
+        # 写入内容到文件
+        drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
+        # 从文件读取内容
+        content = drw.read(path=file_path)
+        if content:
+            logger.info(f"从 {file_path} 读取的内容: {content}")
+    if 1:
+        drw = DiskReaderWriter("/opt/data/pdf/resources/test/io/")
+        content_bin = drw.read_offset("1.txt")
+        assert content_bin == b"ABCD!"
-    # 写入内容到文件
-    drw.write(b"Hello, World!", path="io/test/example.txt", mode="binary")
+        content_bin = drw.read_offset("1.txt", offset=1, limit=2)
+        assert content_bin == b"BC"
-    # 从文件读取内容
-    content = drw.read(path=file_path)
-    if content:
-        logger.info(f"从 {file_path} 读取的内容: {content}")

magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0b1__py3-none-any.whl

magic-pdf 0.6.2b1__py3-none-any.whl → 0.7.0b1__py3-none-any.whl