pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/utils/gotocr_pdf.py
@@ -18,11 +18,11 @@ from transformers import AutoTokenizer
 
  fitz = try_import("fitz")
 
- DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - 这是技术常量,不是密码
- DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - 这是技术常量,不是密码
+ DEFAULT_IMAGE_TOKEN = "<image>" # nosec B105 - technical const,not a passward
+ DEFAULT_IMAGE_PATCH_TOKEN = "<imgpad>" # nosec B105 - technical const,not a passward
 
- DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - 这是技术常量,不是密码
- DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - 这是技术常量,不是密码
+ DEFAULT_IM_START_TOKEN = "<img>" # nosec B105 - technical const,not a passward
+ DEFAULT_IM_END_TOKEN = "</img>" # nosec B105 - technical const,not a passward
 
  translation_table = str.maketrans(punctuation_dict)
 
@@ -50,7 +50,7 @@ def covert_pdf_to_image(image_path: str):
  with fitz.open(image_path) as pdf:
  for pg in range(0, pdf.page_count):
  page = pdf[pg]
- mat = fitz.Matrix(4, 4) # 全程放大四倍
+ mat = fitz.Matrix(4, 4) # Magnify by four times throughout the process
  pm = page.get_pixmap(matrix=mat, alpha=False)
  # if pm.width > 2000 or pm.height > 2000:
  # pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
@@ -64,16 +64,16 @@ def covert_pdf_to_image(image_path: str):
  output = "output"
  img_paths = []
  for index, pdf_img in enumerate(imgs):
- # 图片处理
+ # img processing
 
  gray_img = cv2.cvtColor(pdf_img, cv2.COLOR_BGR2GRAY)
 
- # 二值化处理
+ # Binarization processing
  _, binary_img = cv2.threshold(
  gray_img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
  )
 
- # 去噪
+ # denoise
  filtered_img = cv2.medianBlur(binary_img, 3)
  processed_img = filtered_img
 
@@ -100,7 +100,7 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
  model_name = os.path.expanduser(model_path)
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
- # 加载模型
+ # load model
  model = GOTQwenForCausalLM.from_pretrained(
  model_name,
  low_cpu_mem_usage=True,
@@ -109,13 +109,13 @@ def initialize_model(model_path: str = "./GOT_weights/", gpu_id: int = 6):
  pad_token_id=151643,
  ).eval()
 
- # 确保模型和张量都移动到目标设备
+ # Ensure that both the model and the tensor are moved to the target device.
  device = torch.device(f"cuda:{gpu_id}")
  model.to(device=device, dtype=torch.bfloat16)
 
- # 确保分词器的输出也在目标设备上
- tokenizer.model_max_length = 512 # 设置最大长度,根据需要调整
- tokenizer.padding_side = "right" # 设置填充方向,根据需要调整
+ # Ensure that the output of the tokenizer is also on the target device.
+ tokenizer.model_max_length = 512 # maxlength,adjust to need
+ tokenizer.padding_side = "right" # padding side,adjust to need
 
  return model, tokenizer
 
datamax/utils/lifecycle_types.py
@@ -0,0 +1,18 @@
+ from enum import Enum
+
+
+ class LifeType(Enum):
+ # 数据处理阶段
+ DATA_PROCESSING = "DATA_PROCESSING" # 正在处理
+ DATA_PROCESSED = "DATA_PROCESSED" # 处理完成
+ DATA_PROCESS_FAILED = "DATA_PROCESS_FAILED" # 处理失败
+
+ # 数据清洗阶段
+ DATA_CLEANING = "DATA_CLEANING" # 正在清洗
+ DATA_CLEANED = "DATA_CLEANED" # 清洗完成
+ DATA_CLEAN_FAILED = "DATA_CLEAN_FAILED" # 清洗失败
+
+ # 数据标注阶段
+ DATA_LABELLING = "DATA_LABELLING" # 正在标注
+ DATA_LABELLED = "DATA_LABELLED" # 标注完成
+ DATA_LABEL_FAILED = "DATA_LABEL_FAILED" # 标注失败
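
The new lifecycle_types.py module adds a LifeType enum covering the processing, cleaning and labelling stages. A minimal sketch of how such an enum could be consumed, assuming a caller that keeps a plain per-document dict; the record_stage helper below is illustrative and not part of the package:

from datamax.utils.lifecycle_types import LifeType

def record_stage(record: dict, stage: LifeType) -> dict:
    # Append the stage's string value (e.g. "DATA_CLEANED") to a hypothetical lifecycle log.
    record.setdefault("lifecycle", []).append(stage.value)
    return record

doc = record_stage({}, LifeType.DATA_PROCESSING)
doc = record_stage(doc, LifeType.DATA_PROCESSED)
print(doc["lifecycle"])  # ['DATA_PROCESSING', 'DATA_PROCESSED']
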
datamax/utils/mineru_operator.py
@@ -1,8 +1,9 @@
  import os
- from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
+
+ from magic_pdf.config.enums import SupportedPdfParseMethod
+ from magic_pdf.data.data_reader_writer import FileBasedDataReader, FileBasedDataWriter
  from magic_pdf.data.dataset import PymuDocDataset
  from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
- from magic_pdf.config.enums import SupportedPdfParseMethod
 
 
  class PdfProcessor:
@@ -31,32 +32,33 @@ class PdfProcessor:
 
  # 处理流程
  ds = PymuDocDataset(pdf_bytes)
- markdown_path = os.path.join(local_md_dir, f"{name_without_suff}.md") # 完整路径
- image_dir = os.path.basename(local_image_dir) # 保持相对路径为 "images"
+ markdown_path = os.path.join(
+ local_md_dir, f"{name_without_suff}.md"
+ ) # absolute path
+ image_dir = os.path.basename(local_image_dir) # keep relative path as "images"
 
  if ds.classify() == SupportedPdfParseMethod.OCR:
  ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer).dump_md(
- md_writer,
- os.path.basename(markdown_path), # 文件名部分
- image_dir
+ md_writer, os.path.basename(markdown_path), image_dir # filename
  )
  else:
  ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer).dump_md(
- md_writer,
- os.path.basename(markdown_path), # 文件名部分
- image_dir
+ md_writer, os.path.basename(markdown_path), image_dir # filename
  )
 
- with open(markdown_path, "r", encoding='utf-8') as f:
+ with open(markdown_path, "r", encoding="utf-8") as f:
  markdown_content = f.read()
  return markdown_content
 
 
+
  pdf_processor = PdfProcessor()
 
- # 使用示例
+ # usage example
  if __name__ == "__main__":
  # pdf_processor = PdfProcessor()
- print(pdf_processor.process_pdf(
- "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
- ))
+ print(
+ pdf_processor.process_pdf(
+ "/home/caocaiyu/datamax-service/backend/uploaded_files/fde1daee-e899-4e93-87ff-706234c399c3/20250227132500_5447d25cbf094a3295f9d52d3408a048.pdf"
+ )
+ )
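
mineru_operator.py keeps its module-level pdf_processor singleton and the process_pdf entry point; the changes above are formatting plus the reordered magic_pdf imports. A brief usage sketch, assuming a local PDF instead of the hard-coded server path in the __main__ block; the file name is a placeholder:

from datamax.utils.mineru_operator import pdf_processor

# process_pdf runs doc_analyze in OCR or text mode (per ds.classify()) and
# returns the generated Markdown; "./sample.pdf" is an assumed path.
markdown = pdf_processor.process_pdf("./sample.pdf")
print(markdown[:200])
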
datamax/utils/paddleocr_pdf_operator.py
@@ -1,22 +1,24 @@
  """layout_recovery"""
+
  import os
  import pathlib
  import sys
+ from copy import deepcopy
+ from datetime import datetime
+
  import cv2
  import numpy as np
  from PIL import Image
- from copy import deepcopy
- from datetime import datetime
 
- os.environ['KMP_DUPLICATE_LIB_OK']='True'
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
  sys.path.append(str(ROOT_DIR))
 
  from paddle.utils import try_import
  from paddleocr import PPStructure, save_structure_res
- sys.path.append('/usr/local/lib/python3.10/dist-packages/paddleocr')
- from ppstructure.recovery.recovery_to_doc import convert_info_docx
- from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes
+
+ sys.path.append("/usr/local/lib/python3.10/dist-packages/paddleocr")
+ from ppstructure.recovery.recovery_to_doc import convert_info_docx, sorted_layout_boxes
 
 
  def recovery(img_path, output, use_gpu, gpu_id):
@@ -48,20 +50,24 @@ def recovery(img_path, output, use_gpu, gpu_id):
  img_paths = []
  for index, pdf_img in enumerate(imgs):
  os.makedirs(os.path.join(output, img_name), exist_ok=True)
- pdf_img_path = os.path.join(output, img_name, img_name + "_" + str(index) + ".jpg")
+ pdf_img_path = os.path.join(
+ output, img_name, img_name + "_" + str(index) + ".jpg"
+ )
  cv2.imwrite(pdf_img_path, pdf_img)
  img_paths.append([pdf_img_path, pdf_img])
 
  # step3: Convert images to DOCX
  all_res = []
- engine = PPStructure(recovery=True,
- use_gpu=use_gpu,
- gpu_id=gpu_id,
- det_model_dir=f'{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer',
- rec_model_dir=f'{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer',
- table_model_dir=f'{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer',
- layout_model_dir=f'{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer',
- formula_model_dir=f'{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer')
+ engine = PPStructure(
+ recovery=True,
+ use_gpu=use_gpu,
+ gpu_id=gpu_id,
+ det_model_dir=f"{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer",
+ rec_model_dir=f"{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer",
+ table_model_dir=f"{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer",
+ layout_model_dir=f"{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer",
+ formula_model_dir=f"{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer",
+ )
  for index, (new_img_path, imgs) in enumerate(img_paths):
  print("processing {}/{} page:".format(index + 1, len(img_paths)))
  result = engine(imgs, img_idx=index)
@@ -72,18 +78,27 @@ def recovery(img_path, output, use_gpu, gpu_id):
  all_res += result_sorted
  try:
  convert_info_docx(imgs, all_res, output, img_name)
- os.rename(f'./output/{img_name}_ocr.docx',
- f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx')
+ os.rename(
+ f"./output/{img_name}_ocr.docx",
+ f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx',
+ )
  except Exception as e:
  raise e
 
 
- def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6):
+ def use_paddleocr(
+ input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6
+ ):
  try:
  if not os.path.exists(output_files):
  os.makedirs(output_files)
  try:
- recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
+ recovery(
+ img_path=input_files,
+ output=output_files,
+ use_gpu=use_gpu,
+ gpu_id=gpu_id,
+ )
  except Exception as e:
  raise e
  except Exception as e:
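
The use_paddleocr entry point is only re-wrapped here and its signature is unchanged. A minimal call sketch, assuming a local PDF and an output directory; both paths are placeholders, not taken from the diff:

from datamax.utils.paddleocr_pdf_operator import use_paddleocr

# Recover the layout of ./sample.pdf into ./recovered via PPStructure; CPU-only in this sketch.
use_paddleocr(input_files="./sample.pdf", output_files="./recovered", use_gpu=False)
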
datamax/utils/ppt_extract.py
@@ -1,17 +1,18 @@
  import os
- from loguru import logger
  from functools import lru_cache
- from typing import List, Dict, Union
  from pathlib import Path
+ from typing import Dict, List, Union
+
+ from loguru import logger
  from PIL.Image import Image
  from pptx import Presentation
  from pptx.enum.shapes import MSO_SHAPE_TYPE
  from pptx.shapes.base import BaseShape as Shape
- from pptx.shapes.picture import Picture
  from pptx.shapes.graphfrm import GraphicFrame
  from pptx.shapes.group import GroupShape
+ from pptx.shapes.picture import Picture
  from pptx.slide import Slide
- from pptx.table import Table, _Row, _Cell
+ from pptx.table import Table, _Cell, _Row
  from werkzeug.datastructures import FileStorage
 
 
@@ -25,7 +26,15 @@ class PPtExtractor:
  raise ValueError("img_name must be a string")
  return f"media/{id}/{img_name}"
 
- def handle_shape(self, shape: Shape, content_list: List[Dict[str, str]], media_dir: Path, img_map: Dict[Path, str], id: str, skip_image: bool):
+ def handle_shape(
+ self,
+ shape: Shape,
+ content_list: List[Dict[str, str]],
+ media_dir: Path,
+ img_map: Dict[Path, str],
+ id: str,
+ skip_image: bool,
+ ):
  if not isinstance(shape, Shape):
  raise ValueError("Invalid shape object")
  if not isinstance(content_list, list):
@@ -53,7 +62,9 @@ class PPtExtractor:
  shape: Picture
  image: Image = shape.image
  image_bytes = image.blob
- img_path = media_dir.resolve().joinpath(f"pic-{len(img_map)}.{image.ext}")
+ img_path = media_dir.resolve().joinpath(
+ f"pic-{len(img_map)}.{image.ext}"
+ )
  if not media_dir.exists():
  media_dir.mkdir(parents=True, exist_ok=True)
  if not os.access(media_dir, os.W_OK):
@@ -76,14 +87,18 @@ class PPtExtractor:
  md += "\n|"
  for col in row.cells:
  cell: _Cell = col
- md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
+ md += (
+ " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
+ )
  md += "\n"
  md += "\n"
  content_list.append({"type": "md", "data": md})
  elif shape_type == MSO_SHAPE_TYPE.GROUP:
  shape: GroupShape
  for sub_shape in shape.shapes:
- self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image)
+ self.handle_shape(
+ sub_shape, content_list, media_dir, img_map, id, skip_image
+ )
  else:
  logger.info(f"Unknown shape type: {shape_type}, {type(shape)}")
  except PermissionError as pe:
@@ -93,8 +108,14 @@ class PPtExtractor:
  except Exception as e:
  logger.error(f"Error handling shape: {e}")
 
- def extract(self, presentation_source: Union[FileStorage, Path], id: str, dir: Path, media_dir: Path,
- skip_image: bool):
+ def extract(
+ self,
+ presentation_source: Union[FileStorage, Path],
+ id: str,
+ dir: Path,
+ media_dir: Path,
+ skip_image: bool,
+ ):
  if not isinstance(presentation_source, (FileStorage, Path)):
  raise ValueError("presentation_source must be a FileStorage or Path object")
  if not isinstance(id, str):
@@ -115,7 +136,9 @@ class PPtExtractor:
  slide: Slide
  page = {"page_no": page_no, "content_list": []}
  for shape in slide.shapes:
- self.handle_shape(shape, page["content_list"], media_dir, img_map, id, skip_image)
+ self.handle_shape(
+ shape, page["content_list"], media_dir, img_map, id, skip_image
+ )
  pages.append(page)
  except FileNotFoundError as fnfe:
  logger.error(f"File not found: {fnfe}")
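
The reformatted extract signature takes a FileStorage or Path source plus an id, an output dir, a media dir and a skip_image flag. A hedged sketch of a call, assuming PPtExtractor can be constructed without arguments and that extract returns the per-slide pages list assembled in this hunk; the paths and id are placeholders:

from pathlib import Path

from datamax.utils.ppt_extract import PPtExtractor

extractor = PPtExtractor()
pages = extractor.extract(
    Path("deck.pptx"),  # placeholder presentation
    id="demo",
    dir=Path("out"),
    media_dir=Path("out/media"),
    skip_image=True,  # skip exporting embedded pictures
)
print(pages[0]["page_no"], len(pages[0]["content_list"]))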