PyPI - pydatamax - Versions diffs - 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl - Mend

pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

datamax/loader/core.py +67 -42
datamax/loader/minio_handler.py +38 -19
datamax/parser/__init__.py +2 -1
datamax/parser/base.py +46 -22
datamax/parser/core.py +215 -126
datamax/parser/csv_parser.py +25 -5
datamax/parser/doc_parser.py +230 -141
datamax/parser/docx_parser.py +275 -186
datamax/parser/epub_parser.py +49 -13
datamax/parser/html_parser.py +36 -16
datamax/parser/image_parser.py +52 -14
datamax/parser/json_parser.py +26 -5
datamax/parser/md_parser.py +40 -21
datamax/parser/pdf_parser.py +69 -29
datamax/parser/ppt_parser.py +41 -9
datamax/parser/pptx_parser.py +49 -21
datamax/parser/txt_parser.py +45 -14
datamax/parser/xls_parser.py +34 -6
datamax/parser/xlsx_parser.py +58 -51
datamax/utils/__init__.py +2 -1
datamax/utils/data_cleaner.py +36 -22
datamax/utils/env_setup.py +25 -18
datamax/utils/gotocr_pdf.py +13 -13
datamax/utils/lifecycle_types.py +18 -0
datamax/utils/mineru_operator.py +17 -15
datamax/utils/paddleocr_pdf_operator.py +34 -19
datamax/utils/ppt_extract.py +34 -11
datamax/utils/qa_generator.py +332 -44
datamax/utils/tokenizer.py +10 -9
datamax/utils/uno_handler.py +91 -68
{pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
pydatamax-0.1.16.dist-info/RECORD +0 -38
{pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
{pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
{pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0

datamax/parser/epub_parser.py CHANGED Viewed

@@ -1,10 +1,14 @@
-import ebooklib
+import os
 from typing import Union
+import ebooklib
+import loguru
 from bs4 import BeautifulSoup
 from ebooklib import epub
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
-import os
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 class EpubParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
@@ -18,10 +22,10 @@ class EpubParser(BaseLife):
             content = ""
             for item in book.get_items():
                 if item.get_type() == ebooklib.ITEM_DOCUMENT:
-                    chapter_content = item.get_content().decode('utf-8')
-                    soup = BeautifulSoup(chapter_content, 'html.parser')
+                    chapter_content = item.get_content().decode("utf-8")
+                    soup = BeautifulSoup(chapter_content, "html.parser")
                     text = soup.get_text()
-                    text = text.replace('\u3000', ' ')
+                    text = text.replace("\u3000", " ")
                     content += text
             return content
         except Exception as e:
@@ -29,13 +33,45 @@ class EpubParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            extension = self.get_file_extension(file_path)
+            # 1) 开始处理
+            start_lc = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
+            # 2) 读取EPUB内容
             content = self.read_epub_file(file_path=file_path)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            # 3) 创建输出 VO 并添加开始事件
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(start_lc)
+            # 4) 处理完成
+            end_lc = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            output_vo.add_lifecycle(end_lc)
             return output_vo.to_dict()
         except Exception as e:
-            raise e
+            loguru.logger.error(f"Failed to parse epub file {file_path}: {e}")
+            # 失败时记录一次失败生命周期（可选）
+            fail_lc = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # 若需返回 VO：
+            # output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
+            # output_vo.add_lifecycle(fail_lc)
+            raise

datamax/parser/html_parser.py CHANGED Viewed

@@ -1,13 +1,10 @@
 from typing import Union
-import pathlib
-import sys
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
 from bs4 import BeautifulSoup
-import os
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 class HtmlParser(BaseLife):
     def __init__(self, file_path: Union[str, list]):
@@ -17,22 +14,45 @@ class HtmlParser(BaseLife):
     @staticmethod
     def read_html_file(file_path: str) -> str:
         try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 data = f.read()
-                soup = BeautifulSoup(data, 'html.parser')
-                return soup.get_text(separator='\n', strip=True)
+                soup = BeautifulSoup(data, "html.parser")
+                return soup.get_text(separator="\n", strip=True)
         except Exception:
             raise
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            # 1) 提取扩展名并生成“处理开始”事件
+            extension = self.get_file_extension(file_path)
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Parsing",
+            )
+            # 2) 核心解析
             content = self.read_html_file(file_path=file_path)
             mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            # 3) 根据内容生成“处理完成”或“处理失败”事件
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if mk_content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Parsing",
+            )
+            # 4) 封装输出并添加生命周期
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
         except Exception:
-            raise
+            raise

datamax/parser/image_parser.py CHANGED Viewed

@@ -1,34 +1,72 @@
 import os
 import pathlib
 import sys
 from datamax.utils import setup_environment
 setup_environment(use_gpu=True)
-os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
-from datamax.parser.base import MarkdownOutputVo
+os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
 ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
 sys.path.insert(0, str(ROOT_DIR))
+from PIL import Image
 from datamax.parser.base import BaseLife
 from datamax.parser.pdf_parser import PdfParser
-from PIL import Image
+from datamax.utils.lifecycle_types import LifeType
 class ImageParser(BaseLife):
-    def __init__(self,file_path: str):
+    def __init__(self, file_path: str):
         super().__init__()
         self.file_path = file_path
-    def parse(self, file_path: str) -> MarkdownOutputVo:
+    def parse(self, file_path: str):
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
-            output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
-            image = Image.open(file_path)
-            image.save(output_pdf_path, 'PDF', resolution=100.0)
+            # 1) 处理开始：生成 DATA_PROCESSING 事件
+            extension = self.get_file_extension(file_path)
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Parsing",
+            )
+            # 【1】改用 pathlib.Path.stem 获取“基础名”
+            base_name = pathlib.Path(file_path).stem
+            output_pdf_path = f"{base_name}.pdf"
+            # 转换图片为 PDF
+            img = Image.open(file_path)
+            img.save(output_pdf_path, "PDF", resolution=100.0)
+            # 委托 PdfParser 解析，传入扩展名已由 PdfParser 内部获取
             pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
-            output_vo = pdf_parser.parse(output_pdf_path)
+            result = pdf_parser.parse(output_pdf_path)
+            # 清理临时文件
             if os.path.exists(output_pdf_path):
-                # shutil.rmtree(f'uploaded_files/markdown')
                 os.remove(output_pdf_path)
-            return output_vo
-        except Exception as e:
-            raise e
+            # 2) 处理结束：根据内容是否非空生成 DATA_PROCESSED 或 DATA_PROCESS_FAILED
+            content = result.get("content", "")
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Parsing",
+            )
+            # 3) 合并生命周期：先插入 start，再追加 end
+            lifecycle = result.get("lifecycle", [])
+            lifecycle.insert(0, lc_start.to_dict())
+            lifecycle.append(lc_end.to_dict())
+            result["lifecycle"] = lifecycle
+            return result
+        except Exception:
+            raise

datamax/parser/json_parser.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import json
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 class JsonParser(BaseLife):
@@ -18,15 +19,35 @@ class JsonParser(BaseLife):
     def parse(self, file_path: str) -> MarkdownOutputVo:
         try:
+            # 1) 处理开始：DATA_PROCESSING
+            extension = self.get_file_extension(file_path)
+            lc_start = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                life_type=LifeType.DATA_PROCESSING,
+                usage_purpose="Parsing",
+            )
+            # 2) 核心解析：读取并格式化 JSON
             content = self.read_json_file(file_path)
-            lifecycle = self.generate_lifecycle(
+            # 3) 处理结束：DATA_PROCESSED 或 DATA_PROCESS_FAILED
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
-                usage_purpose="Documentation",
-                life_type="LLM_ORIGIN",
+                life_type=(
+                    LifeType.DATA_PROCESSED
+                    if content.strip()
+                    else LifeType.DATA_PROCESS_FAILED
+                ),
+                usage_purpose="Parsing",
             )
-            output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
-            output_vo.add_lifecycle(lifecycle)
+            # 4) 封装输出并添加这两条生命周期
+            output_vo = MarkdownOutputVo(extension, content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
         except Exception as e:
             raise e

datamax/parser/md_parser.py CHANGED Viewed

@@ -1,13 +1,11 @@
-import pathlib
-import sys
 from typing import Union
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
+import loguru
 from loguru import logger
-import os
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 class MarkdownParser(BaseLife):
     """
@@ -15,9 +13,10 @@ class MarkdownParser(BaseLife):
     Handles .md and .markdown file extensions.
     """
-    def __init__(self,
-                 file_path: Union[str, list],
-                 ):
+    def __init__(
+        self,
+        file_path: Union[str, list],
+    ):
         super().__init__()
         self.file_path = file_path
@@ -33,7 +32,7 @@ class MarkdownParser(BaseLife):
             str: Content of the markdown file
         """
         try:
-            with open(file_path, 'r', encoding='utf-8') as f:
+            with open(file_path, "r", encoding="utf-8") as f:
                 return f.read()
         except Exception as e:
             logger.error(f"Error reading markdown file {file_path}: {e}")
@@ -50,24 +49,44 @@ class MarkdownParser(BaseLife):
             MarkdownOutputVo: Structured output containing the markdown content
         """
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            extension = self.get_file_extension(file_path)
+            # 1) 生成“开始处理”生命周期
+            start_lc = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSING,
+            )
-            # Read markdown content
+            # 2) 读取 Markdown 内容
             md_content = self.read_markdown_file(file_path)
-            # Generate lifecycle metadata
-            lifecycle = self.generate_lifecycle(
+            # 3) 创建输出 VO，并添加开始事件
+            output_vo = MarkdownOutputVo(extension, md_content)
+            output_vo.add_lifecycle(start_lc)
+            # 4) 生成“处理完成”生命周期
+            end_lc = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
                 usage_purpose="Documentation",
-                life_type="LLM_ORIGIN"
+                life_type=LifeType.DATA_PROCESSED,
             )
+            output_vo.add_lifecycle(end_lc)
-            # Create and return output VO
-            output_vo = MarkdownOutputVo(title, md_content)
-            output_vo.add_lifecycle(lifecycle)
             return output_vo.to_dict()
         except Exception as e:
-            logger.error(f"Failed to parse markdown file {file_path}: {e}")
-            raise
+            loguru.logger.error(f"Failed to parse markdown file {file_path}: {e}")
+            # （可选）记录一次失败生命周期
+            fail_lc = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            # 如果想在失败时也返回 VO，可以这样做：
+            # output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
+            # output_vo.add_lifecycle(fail_lc)
+            raise

datamax/parser/pdf_parser.py CHANGED Viewed

@@ -1,24 +1,22 @@
 import os
-import pathlib
-import sys
 import subprocess
 from typing import Union
-ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
-sys.path.insert(0, str(ROOT_DIR))
-from datamax.parser.base import BaseLife
-from datamax.parser.base import MarkdownOutputVo
 from langchain_community.document_loaders import PyMuPDFLoader
 from loguru import logger
+from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 from datamax.utils.mineru_operator import pdf_processor
-import os
 class PdfParser(BaseLife):
-    def __init__(self,
-                 file_path: Union[str, list],
-                 use_mineru: bool = False,
-                 ):
+    def __init__(
+        self,
+        file_path: Union[str, list],
+        use_mineru: bool = False,
+    ):
         super().__init__()
         self.file_path = file_path
@@ -27,17 +25,25 @@ class PdfParser(BaseLife):
     def mineru_process(self, input_pdf_filename, output_dir):
         proc = None
         try:
-            logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
-            command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
-            proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+            logger.info(
+                f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!"
+            )
+            command = ["magic-pdf", "-p", input_pdf_filename, "-o", output_dir]
+            proc = subprocess.Popen(
+                command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
+            )
             # 等待命令执行完成
             stdout, stderr = proc.communicate()
             # 检查命令是否成功执行
             if proc.returncode != 0:
-                raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
+                raise Exception(
+                    f"mineru failed with return code {proc.returncode}: {stderr.decode()}"
+                )
-            logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
+            logger.info(
+                f"Markdown saved in {output_dir}, input file is {input_pdf_filename}"
+            )
         except Exception as e:
             logger.error(f"Error: {e}")
@@ -53,14 +59,16 @@ class PdfParser(BaseLife):
                 if proc.poll() is None:
                     proc.kill()
                     proc.wait()
-                    logger.info("The process was terminated due to timeout or completion.")
+                    logger.info(
+                        "The process was terminated due to timeout or completion."
+                    )
     @staticmethod
     def read_pdf_file(file_path) -> str:
         try:
             pdf_loader = PyMuPDFLoader(file_path)
             pdf_documents = pdf_loader.load()
-            result_text = ''
+            result_text = ""
             for page in pdf_documents:
                 result_text += page.page_content
             return result_text
@@ -68,34 +76,66 @@ class PdfParser(BaseLife):
             raise e
     def parse(self, file_path: str) -> MarkdownOutputVo:
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            extension = self.get_file_extension(file_path)
             if self.use_mineru:
-                output_dir = 'uploaded_files'
+                output_dir = "uploaded_files"
                 output_folder_name = os.path.basename(file_path).replace(".pdf", "")
                 # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
                 # if os.path.exists(output_mineru):
                 #     pass
                 # else:
-                    # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
+                # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
                 # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
                 # todo: 是否有必要跟api的默认保存路径保持一致
-                output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
+                output_mineru = f"{output_dir}/markdown/{output_folder_name}.md"
                 if os.path.exists(output_mineru):
-                    mk_content = open(output_mineru, 'r', encoding='utf-8').read()
+                    mk_content = open(output_mineru, "r", encoding="utf-8").read()
                 else:
                     mk_content = pdf_processor.process_pdf(file_path)
             else:
                 content = self.read_pdf_file(file_path=file_path)
                 mk_content = content
-            lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
-                                                usage_purpose="Documentation", life_type="LLM_ORIGIN")
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            # —— 生命周期：处理完成 —— #
+            lc_end = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESSED,
+            )
+            logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
-        except Exception:
-            raise
+        except Exception as e:
+            # —— 生命周期：处理失败 —— #
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
+            raise Exception(
+                {
+                    "error": str(e),
+                    "file_path": file_path,
+                    "lifecycle": [lc_fail.to_dict()],
+                }
+            )

datamax/parser/ppt_parser.py CHANGED Viewed

@@ -6,8 +6,10 @@ from pathlib import Path
 from typing import Union
 import chardet
+from loguru import logger
 from datamax.parser.base import BaseLife, MarkdownOutputVo
+from datamax.utils.lifecycle_types import LifeType
 from datamax.utils.ppt_extract import PPtExtractor
 # 尝试导入UNO处理器
@@ -17,7 +19,7 @@ except ImportError:
     HAS_UNO = False
-class PPtParser(BaseLife):
+class PptParser(BaseLife):
     def __init__(self, file_path: Union[str, list], use_uno: bool = None):
         super().__init__()
         self.file_path = file_path
@@ -106,19 +108,49 @@ class PPtParser(BaseLife):
             raise
     def parse(self, file_path: str) -> MarkdownOutputVo:
+        # —— 生命周期：开始处理 PPT —— #
+        lc_start = self.generate_lifecycle(
+            source_file=file_path,
+            domain="Technology",
+            usage_purpose="Documentation",
+            life_type=LifeType.DATA_PROCESSING,
+        )
+        logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
         try:
-            title = os.path.splitext(os.path.basename(file_path))[0]
+            extension = self.get_file_extension(file_path)
             content = self.read_ppt_file(file_path=file_path)
-            # clean_text = clean_original_text(content)
             mk_content = content
-            lifecycle = self.generate_lifecycle(
+            # —— 生命周期：处理完成 —— #
+            lc_end = self.generate_lifecycle(
                 source_file=file_path,
                 domain="Technology",
                 usage_purpose="Documentation",
-                life_type="LLM_ORIGIN",
+                life_type=LifeType.DATA_PROCESSED,
             )
-            output_vo = MarkdownOutputVo(title, mk_content)
-            output_vo.add_lifecycle(lifecycle)
+            logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
+            output_vo = MarkdownOutputVo(extension, mk_content)
+            output_vo.add_lifecycle(lc_start)
+            output_vo.add_lifecycle(lc_end)
             return output_vo.to_dict()
-        except Exception:
-            raise
+        except Exception as e:
+            # —— 生命周期：处理失败 —— #
+            lc_fail = self.generate_lifecycle(
+                source_file=file_path,
+                domain="Technology",
+                usage_purpose="Documentation",
+                life_type=LifeType.DATA_PROCESS_FAILED,
+            )
+            logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
+            # 返回包含失败生命周期的异常信息
+            raise Exception(
+                {
+                    "error": str(e),
+                    "file_path": file_path,
+                    "lifecycle": [lc_fail.to_dict()],
+                }
+            )

pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl