pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/minio_handler.py +171 -171
- datamax/loader/oss_handler.py +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +466 -10
- datamax/parser/docx_parser.py +449 -11
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -215
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.14.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
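The same file-level comparison can be reproduced locally with only the standard library. A minimal sketch, assuming both wheels have already been downloaded into the working directory (for example with `pip download pydatamax==0.1.14 --no-deps` and `pip download pydatamax==0.1.15 --no-deps`); the wheel file names below are the usual PyPI names and may need adjusting:

```python
# Sketch: compare two locally downloaded wheels file-by-file (stdlib only).
import difflib
import zipfile

OLD = "pydatamax-0.1.14-py3-none-any.whl"
NEW = "pydatamax-0.1.15-py3-none-any.whl"

with zipfile.ZipFile(OLD) as old_whl, zipfile.ZipFile(NEW) as new_whl:
    old_names, new_names = set(old_whl.namelist()), set(new_whl.namelist())
    print("added:", sorted(new_names - old_names))
    print("removed:", sorted(old_names - new_names))
    for name in sorted(old_names & new_names):
        if not name.endswith(".py"):
            continue
        old_lines = old_whl.read(name).decode("utf-8", "replace").splitlines(keepends=True)
        new_lines = new_whl.read(name).decode("utf-8", "replace").splitlines(keepends=True)
        diff = difflib.unified_diff(old_lines, new_lines, fromfile=name, tofile=name)
        print("".join(diff), end="")
```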
datamax/utils/paddleocr_pdf_operator.py
CHANGED
@@ -1,90 +1,90 @@
The hunk removes and re-adds all 90 lines; the removed and re-added text is identical, so the file content is shown once:
```python
"""layout_recovery"""
import os
import pathlib
import sys
import cv2
import numpy as np
from PIL import Image
from copy import deepcopy
from datetime import datetime

os.environ['KMP_DUPLICATE_LIB_OK']='True'
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
sys.path.append(str(ROOT_DIR))

from paddle.utils import try_import
from paddleocr import PPStructure, save_structure_res
sys.path.append('/usr/local/lib/python3.10/dist-packages/paddleocr')
from ppstructure.recovery.recovery_to_doc import convert_info_docx
from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes


def recovery(img_path, output, use_gpu, gpu_id):
    """
    Convert a PDF file to a Word document with layout recovery.

    :param img_path: Path to the PDF file
    :param output: Path to the output folder
    """
    fitz = try_import("fitz")

    # step1: Convert PDF to images
    imgs = []
    with fitz.open(img_path) as pdf:
        for pg in range(0, pdf.page_count):
            page = pdf[pg]
            mat = fitz.Matrix(2, 2)
            pm = page.get_pixmap(matrix=mat, alpha=False)
            if pm.width > 2000 or pm.height > 2000:
                pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)

            img = Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
            imgs.append(img)

    img_name = datetime.now().strftime("%Y%m%d%H%M%S")

    # step2: Process images
    img_paths = []
    for index, pdf_img in enumerate(imgs):
        os.makedirs(os.path.join(output, img_name), exist_ok=True)
        pdf_img_path = os.path.join(output, img_name, img_name + "_" + str(index) + ".jpg")
        cv2.imwrite(pdf_img_path, pdf_img)
        img_paths.append([pdf_img_path, pdf_img])

    # step3: Convert images to DOCX
    all_res = []
    engine = PPStructure(recovery=True,
                         use_gpu=use_gpu,
                         gpu_id=gpu_id,
                         det_model_dir=f'{ROOT_DIR}/ocr_model_dir/det/en/en_PP-OCRv3_det_infer',
                         rec_model_dir=f'{ROOT_DIR}/ocr_model_dir/rec/ch/ch_PP-OCRv4_rec_infer',
                         table_model_dir=f'{ROOT_DIR}/ocr_model_dir/table/en_ppstructure_mobile_v2.0_SLANet_infer',
                         layout_model_dir=f'{ROOT_DIR}/ocr_model_dir/layout/picodet_lcnet_x1_0_fgd_layout_infer',
                         formula_model_dir=f'{ROOT_DIR}/ocr_model_dir/formula/rec_latex_ocr_infer')
    for index, (new_img_path, imgs) in enumerate(img_paths):
        print("processing {}/{} page:".format(index + 1, len(img_paths)))
        result = engine(imgs, img_idx=index)
        save_structure_res(result, output, img_name, index)
        h, w, _ = imgs.shape
        result_cp = deepcopy(result)
        result_sorted = sorted_layout_boxes(result_cp, w)
        all_res += result_sorted
    try:
        convert_info_docx(imgs, all_res, output, img_name)
        os.rename(f'./output/{img_name}_ocr.docx',
                  f'./output/{os.path.basename(img_path).replace(".pdf", "")}_ocr.docx')
    except Exception as e:
        raise e


def use_paddleocr(input_files: str, output_files: str, use_gpu: bool = False, gpu_id: int = 6):
    try:
        if not os.path.exists(output_files):
            os.makedirs(output_files)
        try:
            recovery(img_path=input_files, output=output_files, use_gpu=use_gpu, gpu_id=gpu_id)
        except Exception as e:
            raise e
    except Exception as e:
        raise e
```
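For context, the module's public entry point is `use_paddleocr`, which wraps `recovery` above. A minimal usage sketch, assuming `paddlepaddle`, `paddleocr`, and PyMuPDF are installed and the `ocr_model_dir` model folders referenced via `ROOT_DIR` exist; `sample.pdf` is only an example file name:

```python
from datamax.utils.paddleocr_pdf_operator import use_paddleocr

# Renders each PDF page to an image, runs PP-Structure layout analysis,
# and writes the recovered <name>_ocr.docx under ./output
# (the os.rename in recovery() assumes that output path).
use_paddleocr(input_files="sample.pdf", output_files="./output", use_gpu=False)
```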
datamax/utils/ppt_extract.py
CHANGED
@@ -1,140 +1,140 @@
The hunk removes and re-adds all 140 lines; in the new file the second import becomes `from loguru import logger` and the exception handlers in `handle_shape` and `extract` log through loguru. The re-added file:
```python
import os
from loguru import logger
from functools import lru_cache
from typing import List, Dict, Union
from pathlib import Path
from PIL.Image import Image
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from pptx.shapes.base import BaseShape as Shape
from pptx.shapes.picture import Picture
from pptx.shapes.graphfrm import GraphicFrame
from pptx.shapes.group import GroupShape
from pptx.slide import Slide
from pptx.table import Table, _Row, _Cell
from werkzeug.datastructures import FileStorage


class PPtExtractor:

    @lru_cache(maxsize=128)
    def generate_img_path(self, id: str, img_name: str) -> str:
        if not isinstance(id, str):
            raise ValueError("id must be a string")
        if not isinstance(img_name, str):
            raise ValueError("img_name must be a string")
        return f"media/{id}/{img_name}"

    def handle_shape(self, shape: Shape, content_list: List[Dict[str, str]], media_dir: Path, img_map: Dict[Path, str], id: str, skip_image: bool):
        if not isinstance(shape, Shape):
            raise ValueError("Invalid shape object")
        if not isinstance(content_list, list):
            raise ValueError("content_list must be a list")
        if not isinstance(media_dir, Path):
            raise ValueError("media_dir must be a Path object")
        if not isinstance(img_map, dict):
            raise ValueError("img_map must be a dictionary")
        if not isinstance(id, str):
            raise ValueError("id must be a string")
        if not isinstance(skip_image, bool):
            raise ValueError("skip_image must be a boolean")

        try:
            shape_type = shape.shape_type
            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    content_list.append(
                        {
                            "type": "text",
                            "data": paragraph.text + "\n",
                        }
                    )
            elif shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image:
                shape: Picture
                image: Image = shape.image
                image_bytes = image.blob
                img_path = media_dir.resolve().joinpath(f"pic-{len(img_map)}.{image.ext}")
                if not media_dir.exists():
                    media_dir.mkdir(parents=True, exist_ok=True)
                if not os.access(media_dir, os.W_OK):
                    raise PermissionError(f"Cannot write to directory: {media_dir}")
                img_s3_path = self.generate_img_path(id, img_path.name)
                img_map[img_path] = img_s3_path
                content_list.append({"type": "image", "data": img_s3_path})
                with open(img_path, "wb") as file:
                    file.write(image_bytes)
            elif shape_type == MSO_SHAPE_TYPE.TABLE:
                shape: GraphicFrame
                table: Table = shape.table
                md = "\n"
                for row_no, row in enumerate(table.rows):
                    row: _Row
                    md += "|"
                    if row_no == 1:
                        for col in row.cells:
                            md += "---|"
                        md += "\n|"
                    for col in row.cells:
                        cell: _Cell = col
                        md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |"
                    md += "\n"
                md += "\n"
                content_list.append({"type": "md", "data": md})
            elif shape_type == MSO_SHAPE_TYPE.GROUP:
                shape: GroupShape
                for sub_shape in shape.shapes:
                    self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image)
            else:
                logger.info(f"Unknown shape type: {shape_type}, {type(shape)}")
        except PermissionError as pe:
            logger.error(f"Permission error: {pe}")
        except IOError as ie:
            logger.error(f"IO error: {ie}")
        except Exception as e:
            logger.error(f"Error handling shape: {e}")

    def extract(self, presentation_source: Union[FileStorage, Path], id: str, dir: Path, media_dir: Path,
                skip_image: bool):
        if not isinstance(presentation_source, (FileStorage, Path)):
            raise ValueError("presentation_source must be a FileStorage or Path object")
        if not isinstance(id, str):
            raise ValueError("id must be a string")
        if not isinstance(dir, Path):
            raise ValueError("dir must be a Path object")
        if not isinstance(media_dir, Path):
            raise ValueError("media_dir must be a Path object")
        if not isinstance(skip_image, bool):
            raise ValueError("skip_image must be a boolean")

        pages = []
        img_map = {}

        try:
            presentation: Presentation = Presentation(presentation_source)
            for page_no, slide in enumerate(presentation.slides):
                slide: Slide
                page = {"page_no": page_no, "content_list": []}
                for shape in slide.shapes:
                    self.handle_shape(shape, page["content_list"], media_dir, img_map, id, skip_image)
                pages.append(page)
        except FileNotFoundError as fnfe:
            logger.error(f"File not found: {fnfe}")
        except PermissionError as pe:
            logger.error(f"Permission error: {pe}")
        except IOError as ie:
            logger.error(f"IO error: {ie}")
        except Exception as e:
            logger.error(f"Error extracting presentation: {e}")

        return pages

    def run(self, id: str, file_path: Path, skip_image: bool = False):
        if not isinstance(id, str):
            raise ValueError("id must be a string")
        if not isinstance(file_path, Path):
            raise ValueError("file_path must be a Path object")
        if not isinstance(skip_image, bool):
            raise ValueError("skip_image must be a boolean")

        media_dir = Path("media").resolve()
        return self.extract(file_path, id, Path("."), media_dir, skip_image)
```
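A minimal usage sketch for the extractor above, assuming `python-pptx`, `loguru`, and `werkzeug` are installed; `slides.pptx` and the id `"demo"` are only placeholders:

```python
from pathlib import Path

from datamax.utils.ppt_extract import PPtExtractor

extractor = PPtExtractor()
# Returns one {"page_no": ..., "content_list": [...]} dict per slide;
# embedded pictures are written under ./media/ unless skip_image=True.
pages = extractor.run(id="demo", file_path=Path("slides.pptx"), skip_image=False)
for page in pages:
    print(page["page_no"], [item["type"] for item in page["content_list"]])
```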