PyPI - pydatamax - Versions diffs - 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl - Mend

pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

datamax/__init__.py +1 -1
datamax/loader/core.py +118 -118
datamax/loader/minio_handler.py +171 -171
datamax/loader/oss_handler.py +191 -191
datamax/parser/__init__.py +2 -4
datamax/parser/base.py +76 -76
datamax/parser/core.py +406 -288
datamax/parser/csv_parser.py +31 -10
datamax/parser/doc_parser.py +466 -10
datamax/parser/docx_parser.py +449 -11
datamax/parser/epub_parser.py +41 -41
datamax/parser/html_parser.py +37 -37
datamax/parser/image_parser.py +34 -34
datamax/parser/json_parser.py +32 -10
datamax/parser/md_parser.py +72 -72
datamax/parser/pdf_parser.py +101 -101
datamax/parser/ppt_parser.py +70 -20
datamax/parser/pptx_parser.py +45 -45
datamax/parser/txt_parser.py +45 -45
datamax/parser/xls_parser.py +26 -26
datamax/parser/xlsx_parser.py +212 -215
datamax/utils/__init__.py +23 -2
datamax/utils/constants.py +58 -58
datamax/utils/data_cleaner.py +275 -237
datamax/utils/env_setup.py +79 -79
datamax/utils/gotocr_pdf.py +265 -265
datamax/utils/mineru_operator.py +62 -62
datamax/utils/paddleocr_pdf_operator.py +90 -90
datamax/utils/ppt_extract.py +140 -140
datamax/utils/qa_generator.py +369 -376
datamax/utils/tokenizer.py +21 -21
datamax/utils/uno_handler.py +426 -0
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
pydatamax-0.1.14.dist-info/RECORD +0 -39
tests/__init__.py +0 -0
tests/test_basic.py +0 -20
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0

datamax/parser/image_parser.py CHANGED Viewed

@@ -1,34 +1,34 @@
-import os
-import pathlib
-import sys
-from datamax.utils import setup_environment
-setup_environment(use_gpu=True)
-os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
-from datamax.parser.base import MarkdownOutputVo
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.pdf_parser import PdfParser
-from PIL import Image
-class ImageParser(BaseLife):
-    def __init__(self,file_path: str):
-        super().__init__()
-        self.file_path = file_path
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
-            image = Image.open(file_path)
-            image.save(output_pdf_path, 'PDF', resolution=100.0)
-            pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
-            output_vo = pdf_parser.parse(output_pdf_path)
-            if os.path.exists(output_pdf_path):
-                # shutil.rmtree(f'uploaded_files/markdown')
-                os.remove(output_pdf_path)
-            return output_vo
-        except Exception as e:
-            raise e
+import os
+import pathlib
+import sys
+from datamax.utils import setup_environment
+setup_environment(use_gpu=True)
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+from datamax.parser.base import MarkdownOutputVo
+ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+sys.path.insert(0, str(ROOT_DIR))
+from datamax.parser.base import BaseLife
+from datamax.parser.pdf_parser import PdfParser
+from PIL import Image
+class ImageParser(BaseLife):
+    def __init__(self,file_path: str):
+        super().__init__()
+        self.file_path = file_path
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
+            image = Image.open(file_path)
+            image.save(output_pdf_path, 'PDF', resolution=100.0)
+            pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
+            output_vo = pdf_parser.parse(output_pdf_path)
+            if os.path.exists(output_pdf_path):
+                # shutil.rmtree(f'uploaded_files/markdown')
+                os.remove(output_pdf_path)
+            return output_vo
+        except Exception as e:
+            raise e

datamax/parser/json_parser.py CHANGED Viewed

@@ -1,10 +1,32 @@
-from datamax.parser.base import MarkdownOutputVo
-class Parser:
-    def __init__(self, file_path):
-        self.file_path = file_path
-    def parse(self) -> MarkdownOutputVo:
-        pass
+import json
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+class JsonParser(BaseLife):
+    def __init__(self, file_path):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def read_json_file(file_path: str) -> str:
+        """Read and pretty print a JSON file."""
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+        return json.dumps(data, indent=2, ensure_ascii=False)
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            content = self.read_json_file(file_path)
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
+            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            raise e

datamax/parser/md_parser.py CHANGED Viewed

@@ -1,73 +1,73 @@
-import pathlib
-import sys
-from typing import Union
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-from loguru import logger
-class MarkdownParser(BaseLife):
-    """
-    Parser for Markdown files that follows the same pattern as PdfParser.
-    Handles .md and .markdown file extensions.
-    """
-    def __init__(self,
-                 file_path: Union[str, list],
-                 ):
-        super().__init__()
-        self.file_path = file_path
-    @staticmethod
-    def read_markdown_file(file_path: str) -> str:
-        """
-        Reads the content of a markdown file.
-        Args:
-            file_path: Path to the markdown file
-        Returns:
-            str: Content of the markdown file
-        """
-        try:
-            with open(file_path, 'r', encoding='utf-8') as f:
-                return f.read()
-        except Exception as e:
-            logger.error(f"Error reading markdown file {file_path}: {e}")
-            raise
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        """
-        Parses a markdown file and returns a MarkdownOutputVo.
-        Args:
-            file_path: Path to the markdown file
-        Returns:
-            MarkdownOutputVo: Structured output containing the markdown content
-        """
-        try:
-            title = self.get_file_extension(file_path)
-            # Read markdown content
-            md_content = self.read_markdown_file(file_path)
-            # Generate lifecycle metadata
-            lifecycle = self.generate_lifecycle(
-                source_file=file_path,
-                domain="Technology",
-                usage_purpose="Documentation",
-                life_type="LLM_ORIGIN"
-            )
-            # Create and return output VO
-            output_vo = MarkdownOutputVo(title, md_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception as e:
-            logger.error(f"Failed to parse markdown file {file_path}: {e}")
+import pathlib
+import sys
+from typing import Union
+ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+sys.path.insert(0, str(ROOT_DIR))
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+from loguru import logger
+import os
+class MarkdownParser(BaseLife):
+    """
+    Parser for Markdown files that follows the same pattern as PdfParser.
+    Handles .md and .markdown file extensions.
+    """
+    def __init__(self,
+                 file_path: Union[str, list],
+                 ):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def read_markdown_file(file_path: str) -> str:
+        """
+        Reads the content of a markdown file.
+        Args:
+            file_path: Path to the markdown file
+        Returns:
+            str: Content of the markdown file
+        """
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception as e:
+            logger.error(f"Error reading markdown file {file_path}: {e}")
+            raise
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        """
+        Parses a markdown file and returns a MarkdownOutputVo.
+        Args:
+            file_path: Path to the markdown file
+        Returns:
+            MarkdownOutputVo: Structured output containing the markdown content
+        """
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            # Read markdown content
+            md_content = self.read_markdown_file(file_path)
+            # Generate lifecycle metadata
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN"
+            )
+            # Create and return output VO
+            output_vo = MarkdownOutputVo(title, md_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            logger.error(f"Failed to parse markdown file {file_path}: {e}")
             raise

datamax/parser/pdf_parser.py CHANGED Viewed

@@ -1,101 +1,101 @@
-import os
-import pathlib
-import sys
-import subprocess
-from typing import Union
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-from langchain_community.document_loaders import PyMuPDFLoader
-from loguru import logger
-from datamax.utils.mineru_operator import pdf_processor
-class PdfParser(BaseLife):
-    def __init__(self,
-                 file_path: Union[str, list],
-                 use_mineru: bool = False,
-                 ):
-        super().__init__()
-        self.file_path = file_path
-        self.use_mineru = use_mineru
-    def mineru_process(self, input_pdf_filename, output_dir):
-        proc = None
-        try:
-            logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
-            command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
-            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            # 等待命令执行完成
-            stdout, stderr = proc.communicate()
-            # 检查命令是否成功执行
-            if proc.returncode != 0:
-                raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
-            logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
-        except Exception as e:
-            logger.error(f"Error: {e}")
-            if proc is not None:
-                proc.kill()
-                proc.wait()
-                logger.info("The process was terminated due to an error.")
-            raise  # Re-raise the exception to let the caller handle it
-        finally:
-            # 确保子进程已经结束
-            if proc is not None:
-                if proc.poll() is None:
-                    proc.kill()
-                    proc.wait()
-                    logger.info("The process was terminated due to timeout or completion.")
-    @staticmethod
-    def read_pdf_file(file_path) -> str:
-        try:
-            pdf_loader = PyMuPDFLoader(file_path)
-            pdf_documents = pdf_loader.load()
-            result_text = ''
-            for page in pdf_documents:
-                result_text += page.page_content
-            return result_text
-        except Exception as e:
-            raise e
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            if self.use_mineru:
-                output_dir = 'uploaded_files'
-                output_folder_name = os.path.basename(file_path).replace(".pdf", "")
-                # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
-                # if os.path.exists(output_mineru):
-                #     pass
-                # else:
-                    # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
-                # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
-                # todo: 是否有必要跟api的默认保存路径保持一致
-                output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
-                if os.path.exists(output_mineru):
-                    mk_content = open(output_mineru, 'r', encoding='utf-8').read()
-                else:
-                    mk_content = pdf_processor.process_pdf(file_path)
-            else:
-                content = self.read_pdf_file(file_path=file_path)
-                mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception:
-            raise
+import os
+import pathlib
+import sys
+import subprocess
+from typing import Union
+ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
+sys.path.insert(0, str(ROOT_DIR))
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+from langchain_community.document_loaders import PyMuPDFLoader
+from loguru import logger
+from datamax.utils.mineru_operator import pdf_processor
+import os
+class PdfParser(BaseLife):
+    def __init__(self,
+                 file_path: Union[str, list],
+                 use_mineru: bool = False,
+                 ):
+        super().__init__()
+        self.file_path = file_path
+        self.use_mineru = use_mineru
+    def mineru_process(self, input_pdf_filename, output_dir):
+        proc = None
+        try:
+            logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
+            command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
+            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            # 等待命令执行完成
+            stdout, stderr = proc.communicate()
+            # 检查命令是否成功执行
+            if proc.returncode != 0:
+                raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
+            logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
+        except Exception as e:
+            logger.error(f"Error: {e}")
+            if proc is not None:
+                proc.kill()
+                proc.wait()
+                logger.info("The process was terminated due to an error.")
+            raise  # Re-raise the exception to let the caller handle it
+        finally:
+            # 确保子进程已经结束
+            if proc is not None:
+                if proc.poll() is None:
+                    proc.kill()
+                    proc.wait()
+                    logger.info("The process was terminated due to timeout or completion.")
+    @staticmethod
+    def read_pdf_file(file_path) -> str:
+        try:
+            pdf_loader = PyMuPDFLoader(file_path)
+            pdf_documents = pdf_loader.load()
+            result_text = ''
+            for page in pdf_documents:
+                result_text += page.page_content
+            return result_text
+        except Exception as e:
+            raise e
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            if self.use_mineru:
+                output_dir = 'uploaded_files'
+                output_folder_name = os.path.basename(file_path).replace(".pdf", "")
+                # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
+                # if os.path.exists(output_mineru):
+                #     pass
+                # else:
+                    # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
+                # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+                # todo: 是否有必要跟api的默认保存路径保持一致
+                output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
+                if os.path.exists(output_mineru):
+                    mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+                else:
+                    mk_content = pdf_processor.process_pdf(file_path)
+            else:
+                content = self.read_pdf_file(file_path=file_path)
+                mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception:
+            raise

datamax/parser/ppt_parser.py CHANGED Viewed

@@ -1,41 +1,83 @@
 import os
 import shutil
-import chardet
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
+import chardet
+from datamax.parser.base import BaseLife, MarkdownOutputVo
 from datamax.utils.ppt_extract import PPtExtractor
+# 尝试导入UNO处理器
+try:
+    from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
+except ImportError:
+    HAS_UNO = False
 class PPtParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
+    def __init__(self, file_path: Union[str, list], use_uno: bool = None):
         super().__init__()
         self.file_path = file_path
+        # 自动检测是否使用UNO（如果未指定）
+        if use_uno is None:
+            self.use_uno = HAS_UNO
+        else:
+            self.use_uno = use_uno and HAS_UNO
     def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str:
+        if self.use_uno:
+            # 使用UNO API进行转换
+            try:
+                pptx_path = convert_with_uno(ppt_path, "pptx", dir_path)
+                if not os.path.exists(pptx_path):
+                    raise Exception(
+                        f"> !!! File conversion failed {ppt_path} ==> {pptx_path}"
+                    )
+                else:
+                    return pptx_path
+            except Exception as e:
+                if (
+                    hasattr(self, "_fallback_to_subprocess")
+                    and self._fallback_to_subprocess
+                ):
+                    return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
+                raise
+        else:
+            # 使用传统的subprocess方式
+            return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
+    def _ppt_to_pptx_subprocess(self, ppt_path: str, dir_path: str) -> str:
+        """使用subprocess将.ppt文件转换为.pptx文件（传统方式）"""
         cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"'
-        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        process = subprocess.Popen(
+            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
         stdout, stderr = process.communicate()
         exit_code = process.returncode
         if exit_code == 0:
             pass
         else:
-            encoding = chardet.detect(stderr)['encoding']
+            encoding = chardet.detect(stderr)["encoding"]
             if encoding is None:
-                encoding = 'utf-8'
-            raise Exception(f"Error Output (detected encoding: {encoding}):", stderr.decode(encoding, errors='replace'))
+                encoding = "utf-8"
+            raise Exception(
+                f"Error Output (detected encoding: {encoding}):",
+                stderr.decode(encoding, errors="replace"),
+            )
         fname = str(Path(ppt_path).stem)
-        pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx')
+        pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx")
         if not os.path.exists(pptx_path):
             raise Exception(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}")
         else:
             return pptx_path
     def read_ppt_file(self, file_path: str):
         try:
             with tempfile.TemporaryDirectory() as temp_path:
                 temp_dir = Path(temp_path).resolve()
@@ -43,17 +85,21 @@ class PPtParser(BaseLife):
                 media_dir.mkdir()
                 tmp_file_path = temp_dir / "tmp.ppt"
                 shutil.copy(file_path, tmp_file_path)
-                pptx_file_path = self.ppt_to_pptx(ppt_path=str(tmp_file_path), dir_path=temp_path)
+                pptx_file_path = self.ppt_to_pptx(
+                    ppt_path=str(tmp_file_path), dir_path=temp_path
+                )
                 pptx_extractor = PPtExtractor()
-                pages_list = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True)
-                contents = ''
+                pages_list = pptx_extractor.extract(
+                    Path(pptx_file_path), "tmp", temp_dir, media_dir, True
+                )
+                contents = ""
                 for index, page in enumerate(pages_list):
-                    page_content_list = page['content_list']
+                    page_content_list = page["content_list"]
                     for content in page_content_list:
-                        if content['type'] == 'image':
+                        if content["type"] == "image":
                             pass
-                        elif content['type'] == "text":
-                            data = content['data']
+                        elif content["type"] == "text":
+                            data = content["data"]
                             contents += data
                 return contents
         except Exception:
@@ -61,12 +107,16 @@ class PPtParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
-            title = self.get_file_extension(file_path)
+            title = os.path.splitext(os.path.basename(file_path))[0]
             content = self.read_ppt_file(file_path=file_path)
             # clean_text = clean_original_text(content)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
             return output_vo.to_dict()

pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl

pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl