pydatamax 0.1.16.post1__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +84 -72
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.post1.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.post1.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/core.py
CHANGED
@@ -1,14 +1,16 @@
-import
+import importlib
 import json
+import os
 import time
-import
+from pathlib import Path
+from typing import Dict, List, Union
+
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from loguru import logger
-from typing import List, Union, Dict
 from openai import OpenAI
-
+
 from datamax.utils import data_cleaner
-from datamax.utils.qa_generator import
-from langchain.text_splitter import RecursiveCharacterTextSplitter
+from datamax.utils.qa_generator import generate_qa_from_content


 class ModelInvoker:
@@ -32,10 +34,9 @@ class ModelInvoker:
 class ParserFactory:
     @staticmethod
     def create_parser(
-
-
-
-        timeout: int = 1200
+        file_path: str,
+        use_mineru: bool = False,
+        to_markdown: bool = False,
     ):
         """
         Create a parser instance based on the file extension.
@@ -43,36 +44,35 @@ class ParserFactory:
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
             (only supported files in .doc or .docx format)
         :param use_mineru: Flag to indicate whether MinerU should be used. (only supported files in .pdf format)
-        :param timeout: Timeout for the request .(only supported files in .xlsx format)
         :return: An instance of the parser class corresponding to the file extension.
         """
         file_extension = os.path.splitext(file_path)[1].lower()
         parser_class_name = {
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            ".md": "MarkdownParser",
+            ".docx": "DocxParser",
+            ".doc": "DocParser",
+            ".epub": "EpubParser",
+            ".html": "HtmlParser",
+            ".txt": "TxtParser",
+            ".pptx": "PPtxParser",
+            ".ppt": "PPtParser",
+            ".pdf": "PdfParser",
+            ".jpg": "ImageParser",
+            ".jpeg": "ImageParser",
+            ".png": "ImageParser",
+            ".webp": "ImageParser",
+            ".xlsx": "XlsxParser",
+            ".xls": "XlsParser",
         }.get(file_extension)

         if not parser_class_name:
             return None

-        if file_extension in [
-            module_name = f
+        if file_extension in [".jpg", "jpeg", ".png", ".webp"]:
+            module_name = f"datamax.parser.image_parser"
         else:
             # Dynamically determine the module name based on the file extension
-            module_name = f
+            module_name = f"datamax.parser.{file_extension[1:]}_parser"

         try:
             # Dynamically import the module and get the class
@@ -80,44 +80,38 @@ class ParserFactory:
             parser_class = getattr(module, parser_class_name)

             # Special handling for PdfParser arguments
-            if parser_class_name ==
+            if parser_class_name == "PdfParser":
                 return parser_class(
                     file_path=file_path,
                     use_mineru=use_mineru,
                 )
-            elif parser_class_name ==
+            elif parser_class_name == "DocxParser" or parser_class_name == "DocParser":
                 return parser_class(
                     file_path=file_path, to_markdown=to_markdown, use_uno=True
                 )
-            elif parser_class_name ==
-                return parser_class(
-                    file_path=file_path,
-                    timeout=timeout
-                )
+            elif parser_class_name == "XlsxParser":
+                return parser_class(file_path=file_path)
             else:
-                return parser_class(
-                    file_path=file_path
-                )
+                return parser_class(file_path=file_path)

         except (ImportError, AttributeError) as e:
             raise e


 class DataMax:
-    def __init__(
-
-
-
-
-
-
+    def __init__(
+        self,
+        file_path: Union[str, list] = "",
+        use_mineru: bool = False,
+        to_markdown: bool = False,
+        ttl: int = 3600,
+    ):
         """
         Initialize the DataMaxParser with file path and parsing options.

         :param file_path: The path to the file or directory to be parsed.
         :param use_mineru: Flag to indicate whether MinerU should be used.
         :param to_markdown: Flag to indicate whether the output should be in Markdown format.
-        :param timeout: Timeout for the request.
         :param ttl: Time to live for the cache.
         """
         self.file_path = file_path
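Taken together, the hunks above remove the xlsx-specific `timeout` plumbing: `ParserFactory.create_parser` now takes only `file_path`, `use_mineru`, and `to_markdown`, and `DataMax.__init__` swaps `timeout` for a cache `ttl`. (Note that the image branch above tests for `"jpeg"` without a leading dot, so `.jpeg` files fall through to the dynamic module-name path.) Below is a minimal usage sketch assuming only the signatures visible in this diff; the file paths and values are placeholders.

```python
# Usage sketch based on the signatures shown above; the paths are placeholders.
from datamax.parser.core import DataMax, ParserFactory

# New factory signature: no timeout, just file_path / use_mineru / to_markdown.
parser = ParserFactory.create_parser(
    file_path="reports/overview.docx",
    use_mineru=False,
    to_markdown=True,   # honored for .doc/.docx only, per the docstring
)

# New DataMax constructor: the xlsx timeout is replaced by a cache TTL in seconds.
dm = DataMax(file_path="reports/overview.docx", to_markdown=True, ttl=3600)
result = dm.get_data()   # dict with a "content" field, or a list of dicts for many files
```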
@@ -125,10 +119,9 @@ class DataMax:
         self.to_markdown = to_markdown
         self.parsed_data = None
         self.model_invoker = ModelInvoker()
-        self.timeout = timeout
         self._cache = {}
         self.ttl = ttl
-
+
     def set_data(self, file_name, parsed_data):
         """
         Set cached data
@@ -137,8 +130,13 @@ class DataMax:
         """
         logger.info(f"cache ttl is {self.ttl}s")
         if self.ttl > 0:
-            self._cache[file_name] = {
-
+            self._cache[file_name] = {
+                "data": parsed_data,
+                "ttl": time.time() + self.ttl,
+            }
+            logger.info(
+                f"✅ [Cache Updated] Cached data for {file_name}, ttl: {self._cache[file_name]['ttl']}"
+            )

     def get_data(self):
         """
@@ -151,12 +149,21 @@ class DataMax:
             parsed_data = []
             for f in self.file_path:
                 file_name = os.path.basename(f)
-                if
+                if (
+                    file_name in self._cache
+                    and self._cache[file_name]["ttl"] > time.time()
+                ):
                     logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
-                    parsed_data.append(self._cache[file_name][
+                    parsed_data.append(self._cache[file_name]["data"])
                 else:
-                    logger.info(
-
+                    logger.info(
+                        f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                    )
+                    self._cache = {
+                        k: v
+                        for k, v in self._cache.items()
+                        if v["ttl"] > time.time()
+                    }
                     res_data = self._parse_file(f)
                     parsed_data.append(res_data)
                     self.set_data(file_name, res_data)
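The caching introduced here stores each parse result under its file name together with an absolute expiry timestamp (`time.time() + ttl`); a lookup counts as a hit only while that timestamp is in the future, and a miss prunes whatever has already expired before re-parsing. A standalone sketch of the same pattern with generic names (not pydatamax code):

```python
import time

# Generic TTL-cache sketch mirroring the pattern above: absolute expiry per entry,
# hits require the entry to still be valid, misses prune expired entries.
cache: dict[str, dict] = {}
TTL = 3600  # seconds, mirroring DataMax(ttl=3600)

def put(key: str, value) -> None:
    if TTL > 0:
        cache[key] = {"data": value, "ttl": time.time() + TTL}

def get(key: str):
    if key in cache and cache[key]["ttl"] > time.time():
        return cache[key]["data"]                          # cache hit
    # cache miss: drop expired entries, let the caller re-parse and re-put
    for k in [k for k, v in cache.items() if v["ttl"] <= time.time()]:
        del cache[k]
    return None

put("report.pdf", {"content": "..."})
assert get("report.pdf") == {"content": "..."}
```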
@@ -164,29 +171,49 @@ class DataMax:

         elif isinstance(self.file_path, str) and os.path.isfile(self.file_path):
             file_name = os.path.basename(self.file_path)
-            if
+            if (
+                file_name in self._cache
+                and self._cache[file_name]["ttl"] > time.time()
+            ):
                 logger.info(f"✅ [Cache Hit] Using cached data for {file_name}")
-                return self._cache[file_name][
+                return self._cache[file_name]["data"]
             else:
-                logger.info(
-
+                logger.info(
+                    f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                )
+                self._cache = {
+                    k: v for k, v in self._cache.items() if v["ttl"] > time.time()
+                }
                 parsed_data = self._parse_file(self.file_path)
                 self.parsed_data = parsed_data
                 self.set_data(file_name, parsed_data)
                 return parsed_data

         elif isinstance(self.file_path, str) and os.path.isdir(self.file_path):
-            file_list = [
+            file_list = [
+                str(file) for file in list(Path(self.file_path).rglob("*.*"))
+            ]
             parsed_data = []
             for f in file_list:
                 if os.path.isfile(f):
                     file_name = os.path.basename(f)
-                    if
-
-
+                    if (
+                        file_name in self._cache
+                        and self._cache[file_name]["ttl"] > time.time()
+                    ):
+                        logger.info(
+                            f"✅ [Cache Hit] Using cached data for {file_name}"
+                        )
+                        parsed_data.append(self._cache[file_name]["data"])
                     else:
-                        logger.info(
-
+                        logger.info(
+                            f"⏳ [Cache Miss] No cached data for {file_name}, parsing..."
+                        )
+                        self._cache = {
+                            k: v
+                            for k, v in self._cache.items()
+                            if v["ttl"] > time.time()
+                        }
                         res_data = self._parse_file(f)
                         parsed_data.append(res_data)
                         self.set_data(file_name, res_data)
@@ -201,53 +228,99 @@ class DataMax:
         """
         Clean data

-        methods include AbnormalCleaner
+        methods include AbnormalCleaner, TextFilter, PrivacyDesensitization which are 1, 2, 3

-        :return:
+        :return: Cleaned data
         """
         if text:
             cleaned_text = text
         elif self.parsed_data:
-            cleaned_text = self.parsed_data.get(
+            cleaned_text = self.parsed_data.get("content")
         else:
             raise ValueError("No data to clean.")

         for method in method_list:
-            if method ==
-                cleaned_text =
-
+            if method == "abnormal":
+                cleaned_text = (
+                    data_cleaner.AbnormalCleaner(cleaned_text).to_clean().get("text")
+                )
+            elif method == "filter":
                 cleaned_text = data_cleaner.TextFilter(cleaned_text).to_filter()
-                cleaned_text = cleaned_text.get("text") if cleaned_text else
-            elif method ==
-                cleaned_text =
+                cleaned_text = cleaned_text.get("text") if cleaned_text else ""
+            elif method == "private":
+                cleaned_text = (
+                    data_cleaner.PrivacyDesensitization(cleaned_text)
+                    .to_private()
+                    .get("text")
+                )

         if self.parsed_data:
             origin_dict = self.parsed_data
-            origin_dict[
+            origin_dict["content"] = cleaned_text
             self.parsed_data = None
             return origin_dict
         else:
             return cleaned_text

-    def get_pre_label(
-
-
-
-
-
-
-
-
-
+    def get_pre_label(
+        self,
+        api_key: str,
+        base_url: str,
+        model_name: str,
+        chunk_size: int = 500,
+        chunk_overlap: int = 100,
+        question_number: int = 5,
+        max_workers: int = 5,
+        language: str = "zh",
+        messages: List[Dict[str, str]] = None,
+    ):
+        """
+        Generate pre-labeling data based on processed document content instead of file path
+
+        :param api_key: API key
+        :param base_url: API base URL
+        :param model_name: Model name
+        :param chunk_size: Chunk size
+        :param chunk_overlap: Overlap length
+        :param question_number: Number of questions generated per chunk
+        :param max_workers: Number of concurrent workers
+        :param language: Language for QA generation ("zh" for Chinese, "en" for English)
+        :param messages: Custom messages
+        :return: List of QA pairs
+        """
+        # First get the processed data
+        processed_data = self.get_data()
+
+        # If it's a list (multiple files), merge all content
+        if isinstance(processed_data, list):
+            content_list = []
+            for data in processed_data:
+                if isinstance(data, dict) and "content" in data:
+                    content_list.append(data["content"])
+                elif isinstance(data, str):
+                    content_list.append(data)
+            content = "\n\n".join(content_list)
+        # If it's a dictionary for a single file
+        elif isinstance(processed_data, dict) and "content" in processed_data:
+            content = processed_data["content"]
+        # If it's a string
+        elif isinstance(processed_data, str):
+            content = processed_data
+        else:
+            raise ValueError("Unable to extract content field from processed data")
+
+        # Generate QA pairs using content instead of reading files
+        return generate_qa_from_content(
+            content=content,
             api_key=api_key,
             base_url=base_url,
             model_name=model_name,
             chunk_size=chunk_size,
             chunk_overlap=chunk_overlap,
             question_number=question_number,
+            language=language,
             max_workers=max_workers,
             message=messages,
-            file_path=self.file_path
         )

     def save_label_data(self, label_data: list, save_file_name: str = None):
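`get_pre_label` now builds QA pairs from the already parsed content (joining multiple files with blank lines) rather than handing `file_path` to the generator, and it forwards a new `language` flag ("zh" or "en") to `generate_qa_from_content`. A hedged call sketch using only the parameters shown above; the key, endpoint, and model name are placeholders.

```python
# Call sketch for the revised get_pre_label; the credentials, endpoint, and model
# name below are placeholders, not real values.
from datamax.parser.core import DataMax

dm = DataMax(file_path="docs/handbook.md")

qa_pairs = dm.get_pre_label(
    api_key="sk-...",                        # placeholder
    base_url="https://api.example.com/v1",   # placeholder OpenAI-compatible endpoint
    model_name="gpt-4o-mini",                # placeholder model name
    chunk_size=500,
    chunk_overlap=100,
    question_number=5,
    max_workers=5,
    language="en",   # new in this release: "zh" (default) or "en"
)

# Persist the QA pairs as JSONL, using save_label_data from the next hunk.
dm.save_label_data(qa_pairs)
```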
@@ -262,27 +335,30 @@ class DataMax:
         if isinstance(self.file_path, str):
             save_file_name = os.path.splitext(os.path.basename(self.file_path))[0]
         else:
-            save_file_name =
+            save_file_name = "label_data"
         if isinstance(label_data, list):
-            with open(save_file_name +
+            with open(save_file_name + ".jsonl", "w", encoding="utf-8") as f:
                 for qa_entry in label_data:
                     f.write(json.dumps(qa_entry, ensure_ascii=False) + "\n")
-            logger.info(
-
+            logger.info(
+                f"✅ [Label Data Saved] Label data saved to {save_file_name}.jsonl"
+            )

-    @staticmethod
-    def split_text_into_paragraphs(
+    @staticmethod
+    def split_text_into_paragraphs(
+        text: str, max_length: int = 500, chunk_overlap: int = 100
+    ):
         """
         Split text into paragraphs by sentence boundaries, each paragraph not exceeding max_length characters.
         Paragraphs will have chunk_overlap characters of overlap between them.
         """
-        import re
+        import re

         # Split sentences using Chinese punctuation marks
-        sentences = re.split(
+        sentences = re.split("(?<=[。!?])", text)
         paragraphs = []
-        current_paragraph =
-        overlap_buffer =
+        current_paragraph = ""
+        overlap_buffer = ""

         for sentence in sentences:
             # If current paragraph plus new sentence doesn't exceed max length
@@ -293,20 +369,26 @@ class DataMax:
                 # Add current paragraph to results
                 paragraphs.append(current_paragraph)
                 # Save overlap portion
-                overlap_buffer =
+                overlap_buffer = (
+                    current_paragraph[-chunk_overlap:] if chunk_overlap > 0 else ""
+                )
                 # Start new paragraph with overlap
                 current_paragraph = overlap_buffer + sentence
-                overlap_buffer =
-
+                overlap_buffer = ""
+
                 # Handle overly long sentences
                 while len(current_paragraph) > max_length:
                     # Split long paragraph
                     split_point = max_length - len(overlap_buffer)
                     paragraphs.append(current_paragraph[:split_point])
                     # Update overlap buffer
-                    overlap_buffer =
+                    overlap_buffer = (
+                        current_paragraph[split_point - chunk_overlap : split_point]
+                        if chunk_overlap > 0
+                        else ""
+                    )
                     current_paragraph = overlap_buffer + current_paragraph[split_point:]
-                    overlap_buffer =
+                    overlap_buffer = ""

         # Add the last paragraph
         if current_paragraph:
@@ -315,10 +397,12 @@ class DataMax:
         return paragraphs

     @staticmethod
-    def split_with_langchain(
+    def split_with_langchain(
+        text: str, chunk_size: int = 500, chunk_overlap: int = 100
+    ):
         """
         Split text using LangChain's intelligent text splitting
-
+
         :param text: Text to be split
         :param chunk_size: Maximum length of each chunk
         :param chunk_overlap: Number of overlapping characters between chunks
@@ -333,14 +417,15 @@ class DataMax:
         return text_splitter.split_text(text)

     def split_data(
-
-
-
-
-
+        self,
+        parsed_data: Union[str, dict] = None,
+        chunk_size: int = 500,
+        chunk_overlap: int = 100,
+        use_langchain: bool = False,
+    ):
         """
         Improved splitting method with LangChain option
-
+
         :param use_langchain: Whether to use LangChain for splitting
         :param parsed_data: Data to be split, either string or dict
         :param chunk_size: Maximum length of each chunk
@@ -351,36 +436,41 @@ class DataMax:
         self.parsed_data = parsed_data
         if not self.parsed_data:
             raise ValueError("No data to split.")
-
+
         if use_langchain:
             if isinstance(self.parsed_data, str):
-                return self.split_with_langchain(
+                return self.split_with_langchain(
+                    self.parsed_data, chunk_size, chunk_overlap
+                )
             elif isinstance(self.parsed_data, dict):
-                if
+                if "content" not in self.parsed_data:
                     raise ValueError("Input dict must contain 'content' key")
-                chunks = self.split_with_langchain(
+                chunks = self.split_with_langchain(
+                    self.parsed_data["content"], chunk_size, chunk_overlap
+                )
                 result = self.parsed_data.copy()
-                result[
+                result["content"] = chunks
                 return result
-
+
         # Handle string input
         if isinstance(self.parsed_data, str):
-            return self.split_text_into_paragraphs(
-
+            return self.split_text_into_paragraphs(
+                self.parsed_data, chunk_size, chunk_overlap
+            )
+
         # Handle dict input
         elif isinstance(self.parsed_data, dict):
-            if
+            if "content" not in self.parsed_data:
                 raise ValueError("Input dict must contain 'content' key")
-
-            content = self.parsed_data[
+
+            content = self.parsed_data["content"]
             chunks = self.split_text_into_paragraphs(content, chunk_size, chunk_overlap)
-
+
             result = self.parsed_data.copy()
-            result[
+            result["content"] = chunks
             return result
         else:
             raise ValueError("Unsupported input type")
-

     def _parse_file(self, file_path):
         """
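`split_data` keeps both strategies — the in-house sentence-boundary splitter keyed on Chinese punctuation (。!?) with character overlap, and LangChain's `RecursiveCharacterTextSplitter` — behind an explicit keyword signature. A hedged sketch of both paths, with arbitrary sample text:

```python
# Sketch of the reworked split_data; the sample text is arbitrary.
from datamax.parser.core import DataMax

dm = DataMax()
text = "第一句。第二句!第三句?" * 200   # the default splitter keys on 。!?

# Default path: sentence-boundary splitting with character overlap.
chunks = dm.split_data(parsed_data=text, chunk_size=500, chunk_overlap=100)

# LangChain path: RecursiveCharacterTextSplitter with the same size/overlap knobs.
lc_chunks = dm.split_data(
    parsed_data=text, chunk_size=500, chunk_overlap=100, use_langchain=True
)

# A dict with a "content" key is also accepted; the chunk list replaces "content".
doc = dm.split_data(parsed_data={"content": text}, chunk_size=500)
```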
@@ -394,7 +484,6 @@ class DataMax:
                 use_mineru=self.use_mineru,
                 file_path=file_path,
                 to_markdown=self.to_markdown,
-                timeout=self.timeout
             )
             if parser:
                 return parser.parse(file_path=file_path)
@@ -402,5 +491,5 @@ class DataMax:
             raise e


-if __name__ ==
-    pass
+if __name__ == "__main__":
+    pass
datamax/parser/csv_parser.py
CHANGED
@@ -1,6 +1,7 @@
 import pandas as pd

 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType


 class CsvParser(BaseLife):
@@ -16,16 +17,35 @@ class CsvParser(BaseLife):

     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
+            # 1) Processing starts
+            extension = self.get_file_extension(file_path)
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Parsing",
+            )
+
+            # 2) Core parsing
             df = self.read_csv_file(file_path)
             mk_content = df.to_markdown(index=False)
-
+
+            # 3) Processing finished or failed
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
-
-
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if mk_content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Parsing",
             )
-
-
+
+            # 4) Wrap the output and attach the lifecycle entries
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
         except Exception as e:
             raise e
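The parser now brackets its work with two lifecycle records: `DATA_PROCESSING` when parsing starts, then `DATA_PROCESSED` or `DATA_PROCESS_FAILED` depending on whether any Markdown came out, both attached to the returned `MarkdownOutputVo`. Below is a standalone sketch of that bracketing pattern; `record_lifecycle` is a hypothetical stand-in for `BaseLife.generate_lifecycle`, whose full signature is outside this diff, and only the `LifeType` members shown above are assumed to exist.

```python
from datamax.utils.lifecycle_types import LifeType

def record_lifecycle(source_file: str, life_type: LifeType) -> dict:
    # Hypothetical helper standing in for BaseLife.generate_lifecycle(...).
    return {"source_file": source_file, "life_type": life_type}

def parse_with_lifecycle(source_file: str, do_parse):
    events = [record_lifecycle(source_file, LifeType.DATA_PROCESSING)]   # 1) start
    content = do_parse(source_file)                                      # 2) core parsing
    events.append(
        record_lifecycle(
            source_file,
            LifeType.DATA_PROCESSED if content.strip() else LifeType.DATA_PROCESS_FAILED,
        )
    )                                                                    # 3) finished or failed
    return content, events
```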