PyPI - magic-pdf - Versions diffs - 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl - Mend

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

magic_pdf/dict2md/ocr_mkcontent.py +20 -7
magic_pdf/libs/config_reader.py +28 -10
magic_pdf/libs/language.py +12 -0
magic_pdf/libs/version.py +1 -1
magic_pdf/model/__init__.py +1 -1
magic_pdf/model/doc_analyze_by_custom_model.py +35 -3
magic_pdf/model/magic_model.py +49 -41
magic_pdf/model/pdf_extract_kit.py +155 -60
magic_pdf/model/pek_sub_modules/layoutlmv3/model_init.py +7 -6
magic_pdf/model/pek_sub_modules/self_modify.py +87 -43
magic_pdf/model/pek_sub_modules/structeqtable/StructTableModel.py +22 -0
magic_pdf/model/pp_structure_v2.py +1 -1
magic_pdf/pdf_parse_union_core.py +4 -2
magic_pdf/pre_proc/citationmarker_remove.py +5 -1
magic_pdf/pre_proc/ocr_detect_all_bboxes.py +40 -2
magic_pdf/pre_proc/ocr_span_list_modify.py +12 -7
magic_pdf/resources/fasttext-langdetect/lid.176.ftz +0 -0
magic_pdf/resources/model_config/layoutlmv3/layoutlmv3_base_inference.yaml +2 -2
magic_pdf/resources/model_config/model_configs.yaml +4 -0
magic_pdf/rw/AbsReaderWriter.py +1 -18
magic_pdf/rw/DiskReaderWriter.py +32 -24
magic_pdf/rw/S3ReaderWriter.py +83 -48
magic_pdf/tools/cli.py +79 -0
magic_pdf/tools/cli_dev.py +156 -0
magic_pdf/tools/common.py +119 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/METADATA +120 -72
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/RECORD +34 -35
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/WHEEL +1 -1
magic_pdf-0.7.0a1.dist-info/entry_points.txt +3 -0
magic_pdf/cli/magicpdf.py +0 -337
magic_pdf/pdf_parse_for_train.py +0 -685
magic_pdf/train_utils/convert_to_train_format.py +0 -65
magic_pdf/train_utils/extract_caption.py +0 -59
magic_pdf/train_utils/remove_footer_header.py +0 -159
magic_pdf/train_utils/vis_utils.py +0 -327
magic_pdf-0.6.1.dist-info/entry_points.txt +0 -2
/magic_pdf/libs/{math.py → local_math.py} +0 -0
/magic_pdf/{cli → model/pek_sub_modules/structeqtable}/__init__.py +0 -0
/magic_pdf/{train_utils → tools}/__init__.py +0 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/LICENSE.md +0 -0
{magic_pdf-0.6.1.dist-info → magic_pdf-0.7.0a1.dist-info}/top_level.txt +0 -0

magic_pdf/cli/magicpdf.py (deleted)

@@ -1,337 +0,0 @@
-"""
-这里实现2个click命令：
-第一个：
- 接收一个完整的s3路径，例如：s3://llm-pdf-text/pdf_ebook_and_paper/pre-clean-mm-markdown/v014/part-660420b490be-000008.jsonl?bytes=0,81350
-    1）根据~/magic-pdf.json里的ak,sk等，构造s3cliReader读取到这个jsonl的对应行，返回json对象。
-    2）根据Json对象里的pdf的s3路径获取到他的ak,sk,endpoint，构造出s3cliReader用来读取pdf
-    3）从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalImageWriter，用来保存截图
-    4）从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter，用来读写本地文件
-    最后把以上步骤准备好的对象传入真正的解析API
-第二个：
-  接收1）pdf的本地路径。2）模型json文件（可选）。然后：
-    1）根据~/magic-pdf.json读取到本地保存图片、md等临时目录的位置，构造出LocalImageWriter，用来保存截图
-    2）从magic-pdf.json里读取到本地保存图片、Md等的临时目录位置,构造出LocalIRdWriter，用来读写本地文件
-    3）根据约定，根据pdf本地路径，推导出pdf模型的json，并读入
-效果：
-python magicpdf.py json-command --json  s3://llm-pdf-text/scihub/xxxx.json?bytes=0,81350
-python magicpdf.py pdf-command --pdf  /home/llm/Downloads/xxxx.pdf --model /home/llm/Downloads/xxxx.json  或者 python magicpdf.py --pdf  /home/llm/Downloads/xxxx.pdf
-"""
-import os
-import json as json_parse
-import click
-from loguru import logger
-from pathlib import Path
-from magic_pdf.libs.version import __version__
-from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
-from magic_pdf.pipe.UNIPipe import UNIPipe
-from magic_pdf.pipe.OCRPipe import OCRPipe
-from magic_pdf.pipe.TXTPipe import TXTPipe
-from magic_pdf.libs.path_utils import (
-    parse_s3path,
-    parse_s3_range_params,
-    remove_non_official_s3_args,
-)
-from magic_pdf.libs.config_reader import (
-    get_local_dir,
-    get_s3_config,
-)
-from magic_pdf.rw.S3ReaderWriter import S3ReaderWriter
-from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
-from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
-import csv
-import copy
-import magic_pdf.model as model_config
-parse_pdf_methods = click.Choice(["ocr", "txt", "auto"])
-def prepare_env(pdf_file_name, method):
-    local_parent_dir = os.path.join(get_local_dir(), "magic-pdf", pdf_file_name, method)
-    local_image_dir = os.path.join(str(local_parent_dir), "images")
-    local_md_dir = local_parent_dir
-    os.makedirs(local_image_dir, exist_ok=True)
-    os.makedirs(local_md_dir, exist_ok=True)
-    return local_image_dir, local_md_dir
-def write_to_csv(csv_file_path, csv_data):
-    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as csvfile:
-        # 创建csv writer对象
-        csv_writer = csv.writer(csvfile)
-        # 写入数据
-        csv_writer.writerow(csv_data)
-    logger.info(f"数据已成功追加到 '{csv_file_path}'")
-def do_parse(
-        pdf_file_name,
-        pdf_bytes,
-        model_list,
-        parse_method,
-        f_draw_span_bbox=True,
-        f_draw_layout_bbox=True,
-        f_dump_md=True,
-        f_dump_middle_json=True,
-        f_dump_model_json=True,
-        f_dump_orig_pdf=True,
-        f_dump_content_list=True,
-        f_make_md_mode=MakeMode.MM_MD,
-):
-    orig_model_list = copy.deepcopy(model_list)
-    local_image_dir, local_md_dir = prepare_env(pdf_file_name, parse_method)
-    logger.info(f"local output dir is {local_md_dir}")
-    image_writer, md_writer = DiskReaderWriter(local_image_dir), DiskReaderWriter(local_md_dir)
-    image_dir = str(os.path.basename(local_image_dir))
-    if parse_method == "auto":
-        jso_useful_key = {"_pdf_type": "", "model_list": model_list}
-        pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer, is_debug=True)
-    elif parse_method == "txt":
-        pipe = TXTPipe(pdf_bytes, model_list, image_writer, is_debug=True)
-    elif parse_method == "ocr":
-        pipe = OCRPipe(pdf_bytes, model_list, image_writer, is_debug=True)
-    else:
-        logger.error("unknown parse method")
-        exit(1)
-    pipe.pipe_classify()
-    """如果没有传入有效的模型数据，则使用内置model解析"""
-    if len(model_list) == 0:
-        if model_config.__use_inside_model__:
-            pipe.pipe_analyze()
-            orig_model_list = copy.deepcopy(pipe.model_list)
-        else:
-            logger.error("need model list input")
-            exit(1)
-    pipe.pipe_parse()
-    pdf_info = pipe.pdf_mid_data["pdf_info"]
-    if f_draw_layout_bbox:
-        draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
-    if f_draw_span_bbox:
-        draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
-    md_content = pipe.pipe_mk_markdown(image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode)
-    if f_dump_md:
-        """写markdown"""
-        md_writer.write(
-            content=md_content,
-            path=f"{pdf_file_name}.md",
-            mode=AbsReaderWriter.MODE_TXT,
-        )
-    if f_dump_middle_json:
-        """写middle_json"""
-        md_writer.write(
-            content=json_parse.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
-            path=f"{pdf_file_name}_middle.json",
-            mode=AbsReaderWriter.MODE_TXT,
-        )
-    if f_dump_model_json:
-        """写model_json"""
-        md_writer.write(
-            content=json_parse.dumps(orig_model_list, ensure_ascii=False, indent=4),
-            path=f"{pdf_file_name}_model.json",
-            mode=AbsReaderWriter.MODE_TXT,
-        )
-    if f_dump_orig_pdf:
-        """写源pdf"""
-        md_writer.write(
-            content=pdf_bytes,
-            path=f"{pdf_file_name}_origin.pdf",
-            mode=AbsReaderWriter.MODE_BIN,
-        )
-    content_list = pipe.pipe_mk_uni_format(image_dir, drop_mode=DropMode.NONE)
-    if f_dump_content_list:
-        """写content_list"""
-        md_writer.write(
-            content=json_parse.dumps(content_list, ensure_ascii=False, indent=4),
-            path=f"{pdf_file_name}_content_list.json",
-            mode=AbsReaderWriter.MODE_TXT,
-        )
-@click.group()
-@click.version_option(__version__, "--version", "-v", help="显示版本信息")
-@click.help_option("--help", "-h", help="显示帮助信息")
-def cli():
-    pass
-@cli.command()
-@click.option("--json", type=str, help="输入一个S3路径")
-@click.option(
-    "--method",
-    type=parse_pdf_methods,
-    help="指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
-    default="auto",
-)
-@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析，精度较低，full: 高精度解析，速度较慢")
-def json_command(json, method, inside_model, model_mode):
-    model_config.__use_inside_model__ = inside_model
-    model_config.__model_mode__ = model_mode
-    if not json.startswith("s3://"):
-        logger.error("usage: magic-pdf json-command --json s3://some_bucket/some_path")
-        exit(1)
-    def read_s3_path(s3path):
-        bucket, key = parse_s3path(s3path)
-        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
-        s3_rw = S3ReaderWriter(
-            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
-        )
-        may_range_params = parse_s3_range_params(s3path)
-        if may_range_params is None or 2 != len(may_range_params):
-            byte_start, byte_end = 0, None
-        else:
-            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
-            byte_end += byte_start - 1
-        return s3_rw.read_jsonl(
-            remove_non_official_s3_args(s3path),
-            byte_start,
-            byte_end,
-            AbsReaderWriter.MODE_BIN,
-        )
-    jso = json_parse.loads(read_s3_path(json).decode("utf-8"))
-    s3_file_path = jso.get("file_location")
-    if s3_file_path is None:
-        s3_file_path = jso.get("path")
-    pdf_file_name = Path(s3_file_path).stem
-    pdf_data = read_s3_path(s3_file_path)
-    do_parse(
-        pdf_file_name,
-        pdf_data,
-        jso["doc_layout_result"],
-        method,
-    )
-@cli.command()
-@click.option("--local_json", type=str, help="输入一个本地jsonl路径")
-@click.option(
-    "--method",
-    type=parse_pdf_methods,
-    help="指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
-    default="auto",
-)
-@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析，精度较低，full: 高精度解析，速度较慢")
-def local_json_command(local_json, method, inside_model, model_mode):
-    model_config.__use_inside_model__ = inside_model
-    model_config.__model_mode__ = model_mode
-    def read_s3_path(s3path):
-        bucket, key = parse_s3path(s3path)
-        s3_ak, s3_sk, s3_endpoint = get_s3_config(bucket)
-        s3_rw = S3ReaderWriter(
-            s3_ak, s3_sk, s3_endpoint, "auto", remove_non_official_s3_args(s3path)
-        )
-        may_range_params = parse_s3_range_params(s3path)
-        if may_range_params is None or 2 != len(may_range_params):
-            byte_start, byte_end = 0, None
-        else:
-            byte_start, byte_end = int(may_range_params[0]), int(may_range_params[1])
-            byte_end += byte_start - 1
-        return s3_rw.read_jsonl(
-            remove_non_official_s3_args(s3path),
-            byte_start,
-            byte_end,
-            AbsReaderWriter.MODE_BIN,
-        )
-    with open(local_json, "r", encoding="utf-8") as f:
-        for json_line in f:
-            jso = json_parse.loads(json_line)
-            s3_file_path = jso.get("file_location")
-            if s3_file_path is None:
-                s3_file_path = jso.get("path")
-            pdf_file_name = Path(s3_file_path).stem
-            pdf_data = read_s3_path(s3_file_path)
-            do_parse(
-                pdf_file_name,
-                pdf_data,
-                jso["doc_layout_result"],
-                method,
-            )
-@cli.command()
-@click.option(
-    "--pdf", type=click.Path(exists=True), required=True, help="PDF文件的路径"
-)
-@click.option("--model", type=click.Path(exists=True), help="模型的路径")
-@click.option(
-    "--method",
-    type=parse_pdf_methods,
-    help="指定解析方法。txt: 文本型 pdf 解析方法， ocr: 光学识别解析 pdf, auto: 程序智能选择解析方法",
-    default="auto",
-)
-@click.option("--inside_model", type=click.BOOL, default=False, help="使用内置模型测试")
-@click.option("--model_mode", type=click.STRING, default="full", help="内置模型选择。lite: 快速解析，精度较低，full: 高精度解析，速度较慢")
-def pdf_command(pdf, model, method, inside_model, model_mode):
-    model_config.__use_inside_model__ = inside_model
-    model_config.__model_mode__ = model_mode
-    def read_fn(path):
-        disk_rw = DiskReaderWriter(os.path.dirname(path))
-        return disk_rw.read(os.path.basename(path), AbsReaderWriter.MODE_BIN)
-    pdf_data = read_fn(pdf)
-    def get_model_json(model_path):
-        # 这里处理pdf和模型相关的逻辑
-        if model_path is None:
-            file_name_without_extension, extension = os.path.splitext(pdf)
-            if extension == ".pdf":
-                model_path = file_name_without_extension + ".json"
-            else:
-                raise Exception("pdf_path input error")
-            if not os.path.exists(model_path):
-                logger.warning(
-                    f"not found json {model_path} existed"
-                )
-                # 本地无模型数据则调用内置paddle分析，先传空list，在内部识别到空list再调用paddle
-                model_json = "[]"
-            else:
-                model_json = read_fn(model_path).decode("utf-8")
-        else:
-            model_json = read_fn(model_path).decode("utf-8")
-        return model_json
-    jso = json_parse.loads(get_model_json(model))
-    pdf_file_name = Path(pdf).stem
-    do_parse(
-        pdf_file_name,
-        pdf_data,
-        jso,
-        method,
-    )
-if __name__ == "__main__":
-    """
-    python magic_pdf/cli/magicpdf.py json-command --json s3://llm-pdf-text/pdf_ebook_and_paper/manual/v001/part-660407a28beb-000002.jsonl?bytes=0,63551
-    """
-    cli()

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl

magic-pdf 0.6.1__py3-none-any.whl → 0.7.0a1__py3-none-any.whl