PyPI - pydatamax - Versions diffs - 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl - Mend

pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

datamax/__init__.py +1 -1
datamax/loader/core.py +118 -118
datamax/loader/minio_handler.py +171 -171
datamax/loader/oss_handler.py +191 -191
datamax/parser/__init__.py +2 -4
datamax/parser/base.py +76 -76
datamax/parser/core.py +406 -288
datamax/parser/csv_parser.py +31 -10
datamax/parser/doc_parser.py +466 -10
datamax/parser/docx_parser.py +449 -11
datamax/parser/epub_parser.py +41 -41
datamax/parser/html_parser.py +37 -37
datamax/parser/image_parser.py +34 -34
datamax/parser/json_parser.py +32 -10
datamax/parser/md_parser.py +72 -72
datamax/parser/pdf_parser.py +101 -101
datamax/parser/ppt_parser.py +70 -20
datamax/parser/pptx_parser.py +45 -45
datamax/parser/txt_parser.py +45 -45
datamax/parser/xls_parser.py +26 -26
datamax/parser/xlsx_parser.py +212 -215
datamax/utils/__init__.py +23 -2
datamax/utils/constants.py +58 -58
datamax/utils/data_cleaner.py +275 -237
datamax/utils/env_setup.py +79 -79
datamax/utils/gotocr_pdf.py +265 -265
datamax/utils/mineru_operator.py +62 -62
datamax/utils/paddleocr_pdf_operator.py +90 -90
datamax/utils/ppt_extract.py +140 -140
datamax/utils/qa_generator.py +369 -376
datamax/utils/tokenizer.py +21 -21
datamax/utils/uno_handler.py +426 -0
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
pydatamax-0.1.15.dist-info/RECORD +38 -0
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
pydatamax-0.1.14.dist-info/RECORD +0 -39
tests/__init__.py +0 -0
tests/test_basic.py +0 -20
{pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0

datamax/parser/pptx_parser.py CHANGED Viewed

@@ -1,45 +1,45 @@
-import os
-from typing import Union
-from pptx import Presentation
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-class PPtxParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
-        super().__init__()
-        self.file_path = file_path
-    @staticmethod
-    def read_ppt_file(file_path: str):
-        try:
-            content = ''
-            prs = Presentation(file_path)
-            for slide in prs.slides:
-                for shape in slide.shapes:
-                    if shape.has_text_frame:
-                        content += shape.text + '\n'
-                    # if shape.shape_type == 13:
-                    #     if not os.path.exists("extracted_images"):
-                    #         os.makedirs("extracted_images")
-                    #     image = shape.image
-                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
-                    #     with open(image_filename, 'wb') as img_file:
-                    #         img_file.write(image.blob)
-                    #     content += ('[' + image_filename + ']')
-            return content
-        except Exception:
-            raise
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            content = self.read_ppt_file(file_path=file_path)
-            mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception:
-            raise
+import os
+from typing import Union
+from pptx import Presentation
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+class PPtxParser(BaseLife):
+    def __init__(self, file_path: Union[str, list]):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def read_ppt_file(file_path: str):
+        try:
+            content = ''
+            prs = Presentation(file_path)
+            for slide in prs.slides:
+                for shape in slide.shapes:
+                    if shape.has_text_frame:
+                        content += shape.text + '\n'
+                    # if shape.shape_type == 13:
+                    #     if not os.path.exists("extracted_images"):
+                    #         os.makedirs("extracted_images")
+                    #     image = shape.image
+                    #     image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
+                    #     with open(image_filename, 'wb') as img_file:
+                    #         img_file.write(image.blob)
+                    #     content += ('[' + image_filename + ']')
+            return content
+        except Exception:
+            raise
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            content = self.read_ppt_file(file_path=file_path)
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception:
+            raise

datamax/parser/txt_parser.py CHANGED Viewed

@@ -1,46 +1,46 @@
-import chardet
-from typing import Union
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-class TxtParser(BaseLife):
-    def __init__(self, file_path: Union[str, list]):
-        super().__init__()
-        self.file_path = file_path
-    @staticmethod
-    def detect_encoding(file_path: str):
-        try:
-            with open(file_path, 'rb') as f:
-                result = chardet.detect(f.read())
-                return result['encoding']
-        except Exception as e:
-            raise e
-    @staticmethod
-    def read_txt_file(file_path: str) -> str:
-        """
-        Reads the Txt file in the specified path and returns its contents.
-        :param file_path: indicates the path of the Txt file to be read.
-        :return: str: Txt file contents.
-        """
-        try:
-            encoding = TxtParser.detect_encoding(file_path)
-            with open(file_path, 'r', encoding=encoding) as file:
-                return file.read()
-        except Exception as e:
-            raise e
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            title = self.get_file_extension(file_path)
-            content = self.read_txt_file(file_path=file_path)  # 真实数据是从load加载
-            mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception as e:
+import chardet
+from typing import Union
+from datamax.parser.base import BaseLife
+from datamax.parser.base import MarkdownOutputVo
+import os
+class TxtParser(BaseLife):
+    def __init__(self, file_path: Union[str, list]):
+        super().__init__()
+        self.file_path = file_path
+    @staticmethod
+    def detect_encoding(file_path: str):
+        try:
+            with open(file_path, 'rb') as f:
+                result = chardet.detect(f.read())
+                return result['encoding']
+        except Exception as e:
+            raise e
+    @staticmethod
+    def read_txt_file(file_path: str) -> str:
+        """
+        Reads the Txt file in the specified path and returns its contents.
+        :param file_path: indicates the path of the Txt file to be read.
+        :return: str: Txt file contents.
+        """
+        try:
+            encoding = TxtParser.detect_encoding(file_path)
+            with open(file_path, 'r', encoding=encoding) as file:
+                return file.read()
+        except Exception as e:
+            raise e
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            title = os.path.splitext(os.path.basename(file_path))[0]
+            content = self.read_txt_file(file_path=file_path)  # 真实数据是从load加载
+            mk_content = content
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(title, mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
             raise e

datamax/parser/xls_parser.py CHANGED Viewed

@@ -1,26 +1,26 @@
-from datamax.parser.base import MarkdownOutputVo
-from datamax.parser.base import BaseLife
-import pandas as pd
-import warnings
-warnings.filterwarnings("ignore")
-class XlsParser(BaseLife):
-    """xlsx or xls table use markitdown from Microsoft  so magic for table!"""
-    def __init__(self, file_path):
-        super().__init__()
-        self.file_path = file_path
-    def parse(self, file_path: str) -> MarkdownOutputVo:
-        try:
-            df = pd.read_excel(file_path)
-            mk_content = df.to_markdown(index=False)
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
-            output_vo.add_lifecycle(lifecycle)
-            return output_vo.to_dict()
-        except Exception as e:
-            raise e
+from datamax.parser.base import MarkdownOutputVo
+from datamax.parser.base import BaseLife
+import pandas as pd
+import warnings
+warnings.filterwarnings("ignore")
+class XlsParser(BaseLife):
+    """xlsx or xls table use markitdown from Microsoft  so magic for table!"""
+    def __init__(self, file_path):
+        super().__init__()
+        self.file_path = file_path
+    def parse(self, file_path: str) -> MarkdownOutputVo:
+        try:
+            df = pd.read_excel(file_path)
+            mk_content = df.to_markdown(index=False)
+            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
+                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
+            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
+            output_vo.add_lifecycle(lifecycle)
+            return output_vo.to_dict()
+        except Exception as e:
+            raise e

pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl