PyPI - pydatamax - Versions diffs - 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl - Mend

pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

datamax/__init__.py +1 -1
datamax/loader/core.py +118 -118
datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
datamax/parser/__init__.py +2 -4
datamax/parser/base.py +76 -76
datamax/parser/core.py +406 -288
datamax/parser/csv_parser.py +31 -10
datamax/parser/doc_parser.py +525 -61
datamax/parser/docx_parser.py +512 -62
datamax/parser/epub_parser.py +41 -41
datamax/parser/html_parser.py +37 -37
datamax/parser/image_parser.py +34 -34
datamax/parser/json_parser.py +32 -10
datamax/parser/md_parser.py +72 -72
datamax/parser/pdf_parser.py +101 -101
datamax/parser/ppt_parser.py +70 -20
datamax/parser/pptx_parser.py +45 -45
datamax/parser/txt_parser.py +45 -45
datamax/parser/xls_parser.py +26 -26
datamax/parser/xlsx_parser.py +212 -208
datamax/utils/__init__.py +23 -2
datamax/utils/constants.py +58 -58
datamax/utils/data_cleaner.py +275 -237
datamax/utils/env_setup.py +79 -79
datamax/utils/gotocr_pdf.py +265 -265
datamax/utils/mineru_operator.py +62 -62
datamax/utils/paddleocr_pdf_operator.py +90 -90
datamax/utils/ppt_extract.py +140 -140
datamax/utils/qa_generator.py +369 -376
datamax/utils/tokenizer.py +21 -21
datamax/utils/uno_handler.py +426 -0
pydatamax-0.1.15.dist-info/METADATA +340 -0
pydatamax-0.1.15.dist-info/RECORD +38 -0
{pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
{pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
pydatamax-0.1.13.dist-info/METADATA +0 -280
pydatamax-0.1.13.dist-info/RECORD +0 -39
tests/__init__.py +0 -0
tests/test_basic.py +0 -20
{pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0

datamax/parser/ppt_parser.py CHANGED Viewed

@@ -1,41 +1,83 @@
 import os
 import shutil
-import chardet
 import subprocess
 import tempfile
 from pathlib import Path
 from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
+import chardet
+from datamax.parser.base import BaseLife, MarkdownOutputVo
 from datamax.utils.ppt_extract import PPtExtractor
+# 尝试导入UNO处理器
+try:
+    from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
+except ImportError:
+    HAS_UNO = False
 class PPtParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
+    def __init__(self, file_path: Union[str, list], use_uno: bool = None):
         super().__init__()
         self.file_path = file_path
+        # 自动检测是否使用UNO（如果未指定）
+        if use_uno is None:
+            self.use_uno = HAS_UNO
+        else:
+            self.use_uno = use_uno and HAS_UNO
     def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str:
+        if self.use_uno:
+            # 使用UNO API进行转换
+            try:
+                pptx_path = convert_with_uno(ppt_path, "pptx", dir_path)
+                if not os.path.exists(pptx_path):
+                    raise Exception(
+                        f"> !!! File conversion failed {ppt_path} ==> {pptx_path}"
+                    )
+                else:
+                    return pptx_path
+            except Exception as e:
+                if (
+                    hasattr(self, "_fallback_to_subprocess")
+                    and self._fallback_to_subprocess
+                ):
+                    return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
+                raise
+        else:
+            # 使用传统的subprocess方式
+            return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
+    def _ppt_to_pptx_subprocess(self, ppt_path: str, dir_path: str) -> str:
+        """使用subprocess将.ppt文件转换为.pptx文件（传统方式）"""
         cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"'
-        process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        process = subprocess.Popen(
+            cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+        )
         stdout, stderr = process.communicate()
         exit_code = process.returncode
         if exit_code == 0:
             pass
         else:
-            encoding = chardet.detect(stderr)['encoding']
+            encoding = chardet.detect(stderr)["encoding"]
             if encoding is None:
-                encoding = 'utf-8'
-            raise Exception(f"Error Output (detected encoding: {encoding}):", stderr.decode(encoding, errors='replace'))
+                encoding = "utf-8"
+            raise Exception(
+                f"Error Output (detected encoding: {encoding}):",
+                stderr.decode(encoding, errors="replace"),
+            )
         fname = str(Path(ppt_path).stem)
-        pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx')
+        pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx")
         if not os.path.exists(pptx_path):
             raise Exception(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}")
         else:
             return pptx_path
     def read_ppt_file(self, file_path: str):
         try:
             with tempfile.TemporaryDirectory() as temp_path:
                 temp_dir = Path(temp_path).resolve()
@@ -43,17 +85,21 @@ class PPtParser(BaseLife):
                 media_dir.mkdir()
                 tmp_file_path = temp_dir / "tmp.ppt"
                 shutil.copy(file_path, tmp_file_path)
-                pptx_file_path = self.ppt_to_pptx(ppt_path=str(tmp_file_path), dir_path=temp_path)
+                pptx_file_path = self.ppt_to_pptx(
+                    ppt_path=str(tmp_file_path), dir_path=temp_path
+                )
                 pptx_extractor = PPtExtractor()
-                pages_list = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True)
-                contents = ''
+                pages_list = pptx_extractor.extract(
+                    Path(pptx_file_path), "tmp", temp_dir, media_dir, True
+                )
+                contents = ""
                 for index, page in enumerate(pages_list):
-                    page_content_list = page['content_list']
+                    page_content_list = page["content_list"]
                     for content in page_content_list:
-                        if content['type'] == 'image':
+                        if content["type"] == "image":
                             pass
-                        elif content['type'] == "text":
-                            data = content['data']
+                        elif content["type"] == "text":
+                            data = content["data"]
                             contents += data
                 return contents
         except Exception:
@@ -61,12 +107,16 @@ class PPtParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
-            title = self.get_file_extension(file_path)
+            title = os.path.splitext(os.path.basename(file_path))[0]
             content = self.read_ppt_file(file_path=file_path)
             # clean_text = clean_original_text(content)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            lifecycle = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type="LLM_ORIGIN",
+            )
             output_vo = MarkdownOutputVo(title, mk_content)
             output_vo.add_lifecycle(lifecycle)
             return output_vo.to_dict()

datamax/parser/pptx_parser.py CHANGED Viewed

@@ -1,45 +1,45 @@
-import os
-from typing import Union
-from pptx import Presentation
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-class PPtxParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
-        super().__init__()
-        self.file_path = file_path
-    @staticmethod
-    def read_ppt_file(file_path: str):
-        try:
-            content = ''
-            prs = Presentation(file_path)
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    if shape.has_text_frame:
-                        content += shape.text + '\n'
-                    # if shape.shape_type == 13:
-                    #     if not os.path.exists("extracted_images"):
-                    #         os.makedirs("extracted_images")
-                    #     image = shape.image
-                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
-                    #     with open(image_filename, 'wb') as img_file:
-                    #         img_file.write(image.blob)
-                    #     content += ('[' + image_filename + ']')
-            return content
-        except Exception:
-            raise
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            content = self.read_ppt_file(file_path=file_path)
-            mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception:
-            raise
+import os
+from typing import Union
+from pptx import Presentation
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+class PPtxParser(BaseLife):
+    def __init__(self, file_path: Union[str, list]):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def read_ppt_file(file_path: str):
+        try:
+            content = ''
+            prs = Presentation(file_path)
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    if shape.has_text_frame:
+                        content += shape.text + '\n'
+                    # if shape.shape_type == 13:
+                    #     if not os.path.exists("extracted_images"):
+                    #         os.makedirs("extracted_images")
+                    #     image = shape.image
+                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
+                    #     with open(image_filename, 'wb') as img_file:
+                    #         img_file.write(image.blob)
+                    #     content += ('[' + image_filename + ']')
+            return content
+        except Exception:
+            raise
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            content = self.read_ppt_file(file_path=file_path)
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception:
+            raise

datamax/parser/txt_parser.py CHANGED Viewed

@@ -1,46 +1,46 @@
-import chardet
-from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-class TxtParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
-        super().__init__()
-        self.file_path = file_path
-    @staticmethod
-    def detect_encoding(file_path: str):
-        try:
-            with open(file_path, 'rb') as f:
-                result = chardet.detect(f.read())
-                return result['encoding']
-        except Exception as e:
-            raise e
-    @staticmethod
-    def read_txt_file(file_path: str) -> str:
-        """
-        Reads the Txt file in the specified path and returns its contents.
-        :param file_path: indicates the path of the Txt file to be read.
-        :return: str: Txt file contents.
-        """
-        try:
-            encoding = TxtParser.detect_encoding(file_path)
-            with open(file_path, 'r', encoding=encoding) as file:
-                return file.read()
-        except Exception as e:
-            raise e
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            content = self.read_txt_file(file_path=file_path)  # 真实数据是从load加载
-            mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception as e:
+import chardet
+from typing import Union
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+import os
+class TxtParser(BaseLife):
+    def __init__(self, file_path: Union[str, list]):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def detect_encoding(file_path: str):
+        try:
+            with open(file_path, 'rb') as f:
+                result = chardet.detect(f.read())
+                return result['encoding']
+        except Exception as e:
+            raise e
+    @staticmethod
+    def read_txt_file(file_path: str) -> str:
+        """
+        Reads the Txt file in the specified path and returns its contents.
+        :param file_path: indicates the path of the Txt file to be read.
+        :return: str: Txt file contents.
+        """
+        try:
+            encoding = TxtParser.detect_encoding(file_path)
+            with open(file_path, 'r', encoding=encoding) as file:
+                return file.read()
+        except Exception as e:
+            raise e
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            content = self.read_txt_file(file_path=file_path)  # 真实数据是从load加载
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
             raise e

datamax/parser/xls_parser.py CHANGED Viewed

@@ -1,26 +1,26 @@
-from datamax.parser.base import MarkdownOutputVo
-from datamax.parser.base import BaseLife
-import pandas as pd
-import warnings
-warnings.filterwarnings("ignore")
-class XlsParser(BaseLife):
-    """xlsx or xls table use markitdown from Microsoft  so magic for table!"""
-    def __init__(self, file_path):
-        super().__init__()
-        self.file_path = file_path
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            df = pd.read_excel(file_path)
-            mk_content = df.to_markdown(index=False)
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception as e:
-            raise e
+from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+import pandas as pd
+import warnings
+warnings.filterwarnings("ignore")
+class XlsParser(BaseLife):
+    """xlsx or xls table use markitdown from Microsoft  so magic for table!"""
+    def __init__(self, file_path):
+        super().__init__()
+        self.file_path = file_path
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            df = pd.read_excel(file_path)
+            mk_content = df.to_markdown(index=False)
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            raise e

pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl