pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. datamax/loader/core.py +67 -42
  2. datamax/loader/minio_handler.py +38 -19
  3. datamax/parser/__init__.py +2 -1
  4. datamax/parser/base.py +46 -22
  5. datamax/parser/core.py +215 -126
  6. datamax/parser/csv_parser.py +25 -5
  7. datamax/parser/doc_parser.py +230 -141
  8. datamax/parser/docx_parser.py +275 -186
  9. datamax/parser/epub_parser.py +49 -13
  10. datamax/parser/html_parser.py +36 -16
  11. datamax/parser/image_parser.py +52 -14
  12. datamax/parser/json_parser.py +26 -5
  13. datamax/parser/md_parser.py +40 -21
  14. datamax/parser/pdf_parser.py +69 -29
  15. datamax/parser/ppt_parser.py +41 -9
  16. datamax/parser/pptx_parser.py +49 -21
  17. datamax/parser/txt_parser.py +45 -14
  18. datamax/parser/xls_parser.py +34 -6
  19. datamax/parser/xlsx_parser.py +58 -51
  20. datamax/utils/__init__.py +2 -1
  21. datamax/utils/data_cleaner.py +36 -22
  22. datamax/utils/env_setup.py +25 -18
  23. datamax/utils/gotocr_pdf.py +13 -13
  24. datamax/utils/lifecycle_types.py +18 -0
  25. datamax/utils/mineru_operator.py +17 -15
  26. datamax/utils/paddleocr_pdf_operator.py +34 -19
  27. datamax/utils/ppt_extract.py +34 -11
  28. datamax/utils/qa_generator.py +332 -44
  29. datamax/utils/tokenizer.py +10 -9
  30. datamax/utils/uno_handler.py +91 -68
  31. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
  32. pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
  33. pydatamax-0.1.16.dist-info/RECORD +0 -38
  34. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
  35. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
  36. {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,14 @@
1
- import ebooklib
1
+ import os
2
2
  from typing import Union
3
+
4
+ import ebooklib
5
+ import loguru
3
6
  from bs4 import BeautifulSoup
4
7
  from ebooklib import epub
5
- from datamax.parser.base import BaseLife
6
- from datamax.parser.base import MarkdownOutputVo
7
- import os
8
+
9
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
10
+ from datamax.utils.lifecycle_types import LifeType
11
+
8
12
 
9
13
  class EpubParser(BaseLife):
10
14
  def __init__(self, file_path: Union[str, list]):
@@ -18,10 +22,10 @@ class EpubParser(BaseLife):
18
22
  content = ""
19
23
  for item in book.get_items():
20
24
  if item.get_type() == ebooklib.ITEM_DOCUMENT:
21
- chapter_content = item.get_content().decode('utf-8')
22
- soup = BeautifulSoup(chapter_content, 'html.parser')
25
+ chapter_content = item.get_content().decode("utf-8")
26
+ soup = BeautifulSoup(chapter_content, "html.parser")
23
27
  text = soup.get_text()
24
- text = text.replace('\u3000', ' ')
28
+ text = text.replace("\u3000", " ")
25
29
  content += text
26
30
  return content
27
31
  except Exception as e:
@@ -29,13 +33,45 @@ class EpubParser(BaseLife):
29
33
 
30
34
  def parse(self, file_path: str) -> MarkdownOutputVo:
31
35
  try:
32
- title = os.path.splitext(os.path.basename(file_path))[0]
36
+ extension = self.get_file_extension(file_path)
37
+
38
+ # 1) 开始处理
39
+ start_lc = self.generate_lifecycle(
40
+ source_file=file_path,
41
+ domain="Technology",
42
+ usage_purpose="Documentation",
43
+ life_type=LifeType.DATA_PROCESSING,
44
+ )
45
+
46
+ # 2) 读取EPUB内容
33
47
  content = self.read_epub_file(file_path=file_path)
34
48
  mk_content = content
35
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
36
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
37
- output_vo = MarkdownOutputVo(title, mk_content)
38
- output_vo.add_lifecycle(lifecycle)
49
+
50
+ # 3) 创建输出 VO 并添加开始事件
51
+ output_vo = MarkdownOutputVo(extension, mk_content)
52
+ output_vo.add_lifecycle(start_lc)
53
+
54
+ # 4) 处理完成
55
+ end_lc = self.generate_lifecycle(
56
+ source_file=file_path,
57
+ domain="Technology",
58
+ usage_purpose="Documentation",
59
+ life_type=LifeType.DATA_PROCESSED,
60
+ )
61
+ output_vo.add_lifecycle(end_lc)
62
+
39
63
  return output_vo.to_dict()
64
+
40
65
  except Exception as e:
41
- raise e
66
+ loguru.logger.error(f"Failed to parse epub file {file_path}: {e}")
67
+ # 失败时记录一次失败生命周期(可选)
68
+ fail_lc = self.generate_lifecycle(
69
+ source_file=file_path,
70
+ domain="Technology",
71
+ usage_purpose="Documentation",
72
+ life_type=LifeType.DATA_PROCESS_FAILED,
73
+ )
74
+ # 若需返回 VO:
75
+ # output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
76
+ # output_vo.add_lifecycle(fail_lc)
77
+ raise
@@ -1,13 +1,10 @@
1
1
  from typing import Union
2
- import pathlib
3
- import sys
4
2
 
5
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
- sys.path.insert(0, str(ROOT_DIR))
7
- from datamax.parser.base import BaseLife
8
- from datamax.parser.base import MarkdownOutputVo
9
3
  from bs4 import BeautifulSoup
10
- import os
4
+
5
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
6
+ from datamax.utils.lifecycle_types import LifeType
7
+
11
8
 
12
9
  class HtmlParser(BaseLife):
13
10
  def __init__(self, file_path: Union[str, list]):
@@ -17,22 +14,45 @@ class HtmlParser(BaseLife):
17
14
  @staticmethod
18
15
  def read_html_file(file_path: str) -> str:
19
16
  try:
20
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
17
+ with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
21
18
  data = f.read()
22
- soup = BeautifulSoup(data, 'html.parser')
23
- return soup.get_text(separator='\n', strip=True)
19
+ soup = BeautifulSoup(data, "html.parser")
20
+ return soup.get_text(separator="\n", strip=True)
24
21
  except Exception:
25
22
  raise
26
23
 
27
24
  def parse(self, file_path: str) -> MarkdownOutputVo:
28
25
  try:
29
- title = os.path.splitext(os.path.basename(file_path))[0]
26
+ # 1) 提取扩展名并生成“处理开始”事件
27
+ extension = self.get_file_extension(file_path)
28
+ lc_start = self.generate_lifecycle(
29
+ source_file=file_path,
30
+ domain="Technology",
31
+ life_type=LifeType.DATA_PROCESSING,
32
+ usage_purpose="Parsing",
33
+ )
34
+
35
+ # 2) 核心解析
30
36
  content = self.read_html_file(file_path=file_path)
31
37
  mk_content = content
32
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
33
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
34
- output_vo = MarkdownOutputVo(title, mk_content)
35
- output_vo.add_lifecycle(lifecycle)
38
+
39
+ # 3) 根据内容生成“处理完成”或“处理失败”事件
40
+ lc_end = self.generate_lifecycle(
41
+ source_file=file_path,
42
+ domain="Technology",
43
+ life_type=(
44
+ LifeType.DATA_PROCESSED
45
+ if mk_content.strip()
46
+ else LifeType.DATA_PROCESS_FAILED
47
+ ),
48
+ usage_purpose="Parsing",
49
+ )
50
+
51
+ # 4) 封装输出并添加生命周期
52
+ output_vo = MarkdownOutputVo(extension, mk_content)
53
+ output_vo.add_lifecycle(lc_start)
54
+ output_vo.add_lifecycle(lc_end)
36
55
  return output_vo.to_dict()
56
+
37
57
  except Exception:
38
- raise
58
+ raise
@@ -1,34 +1,72 @@
1
1
  import os
2
2
  import pathlib
3
3
  import sys
4
+
4
5
  from datamax.utils import setup_environment
5
6
 
6
7
  setup_environment(use_gpu=True)
7
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
8
- from datamax.parser.base import MarkdownOutputVo
8
+ os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
9
+
9
10
 
10
11
  ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
11
12
  sys.path.insert(0, str(ROOT_DIR))
13
+ from PIL import Image
14
+
12
15
  from datamax.parser.base import BaseLife
13
16
  from datamax.parser.pdf_parser import PdfParser
14
- from PIL import Image
17
+ from datamax.utils.lifecycle_types import LifeType
18
+
15
19
 
16
20
  class ImageParser(BaseLife):
17
- def __init__(self,file_path: str):
21
+ def __init__(self, file_path: str):
18
22
  super().__init__()
19
23
  self.file_path = file_path
20
24
 
21
- def parse(self, file_path: str) -> MarkdownOutputVo:
25
+ def parse(self, file_path: str):
22
26
  try:
23
- title = os.path.splitext(os.path.basename(file_path))[0]
24
- output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
25
- image = Image.open(file_path)
26
- image.save(output_pdf_path, 'PDF', resolution=100.0)
27
+ # 1) 处理开始:生成 DATA_PROCESSING 事件
28
+ extension = self.get_file_extension(file_path)
29
+ lc_start = self.generate_lifecycle(
30
+ source_file=file_path,
31
+ domain="Technology",
32
+ life_type=LifeType.DATA_PROCESSING,
33
+ usage_purpose="Parsing",
34
+ )
35
+ # 【1】改用 pathlib.Path.stem 获取“基础名”
36
+ base_name = pathlib.Path(file_path).stem
37
+ output_pdf_path = f"{base_name}.pdf"
38
+
39
+ # 转换图片为 PDF
40
+ img = Image.open(file_path)
41
+ img.save(output_pdf_path, "PDF", resolution=100.0)
42
+
43
+ # 委托 PdfParser 解析,传入扩展名已由 PdfParser 内部获取
27
44
  pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
28
- output_vo = pdf_parser.parse(output_pdf_path)
45
+ result = pdf_parser.parse(output_pdf_path)
46
+
47
+ # 清理临时文件
29
48
  if os.path.exists(output_pdf_path):
30
- # shutil.rmtree(f'uploaded_files/markdown')
31
49
  os.remove(output_pdf_path)
32
- return output_vo
33
- except Exception as e:
34
- raise e
50
+ # 2) 处理结束:根据内容是否非空生成 DATA_PROCESSED 或 DATA_PROCESS_FAILED
51
+ content = result.get("content", "")
52
+ lc_end = self.generate_lifecycle(
53
+ source_file=file_path,
54
+ domain="Technology",
55
+ life_type=(
56
+ LifeType.DATA_PROCESSED
57
+ if content.strip()
58
+ else LifeType.DATA_PROCESS_FAILED
59
+ ),
60
+ usage_purpose="Parsing",
61
+ )
62
+
63
+ # 3) 合并生命周期:先插入 start,再追加 end
64
+ lifecycle = result.get("lifecycle", [])
65
+ lifecycle.insert(0, lc_start.to_dict())
66
+ lifecycle.append(lc_end.to_dict())
67
+ result["lifecycle"] = lifecycle
68
+
69
+ return result
70
+
71
+ except Exception:
72
+ raise
@@ -1,6 +1,7 @@
1
1
  import json
2
2
 
3
3
  from datamax.parser.base import BaseLife, MarkdownOutputVo
4
+ from datamax.utils.lifecycle_types import LifeType
4
5
 
5
6
 
6
7
  class JsonParser(BaseLife):
@@ -18,15 +19,35 @@ class JsonParser(BaseLife):
18
19
 
19
20
  def parse(self, file_path: str) -> MarkdownOutputVo:
20
21
  try:
22
+ # 1) 处理开始:DATA_PROCESSING
23
+ extension = self.get_file_extension(file_path)
24
+ lc_start = self.generate_lifecycle(
25
+ source_file=file_path,
26
+ domain="Technology",
27
+ life_type=LifeType.DATA_PROCESSING,
28
+ usage_purpose="Parsing",
29
+ )
30
+
31
+ # 2) 核心解析:读取并格式化 JSON
21
32
  content = self.read_json_file(file_path)
22
- lifecycle = self.generate_lifecycle(
33
+
34
+ # 3) 处理结束:DATA_PROCESSED 或 DATA_PROCESS_FAILED
35
+ lc_end = self.generate_lifecycle(
23
36
  source_file=file_path,
24
37
  domain="Technology",
25
- usage_purpose="Documentation",
26
- life_type="LLM_ORIGIN",
38
+ life_type=(
39
+ LifeType.DATA_PROCESSED
40
+ if content.strip()
41
+ else LifeType.DATA_PROCESS_FAILED
42
+ ),
43
+ usage_purpose="Parsing",
27
44
  )
28
- output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
29
- output_vo.add_lifecycle(lifecycle)
45
+
46
+ # 4) 封装输出并添加这两条生命周期
47
+ output_vo = MarkdownOutputVo(extension, content)
48
+ output_vo.add_lifecycle(lc_start)
49
+ output_vo.add_lifecycle(lc_end)
30
50
  return output_vo.to_dict()
51
+
31
52
  except Exception as e:
32
53
  raise e
@@ -1,13 +1,11 @@
1
- import pathlib
2
- import sys
3
1
  from typing import Union
4
2
 
5
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
- sys.path.insert(0, str(ROOT_DIR))
7
- from datamax.parser.base import BaseLife
8
- from datamax.parser.base import MarkdownOutputVo
3
+ import loguru
9
4
  from loguru import logger
10
- import os
5
+
6
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
7
+ from datamax.utils.lifecycle_types import LifeType
8
+
11
9
 
12
10
  class MarkdownParser(BaseLife):
13
11
  """
@@ -15,9 +13,10 @@ class MarkdownParser(BaseLife):
15
13
  Handles .md and .markdown file extensions.
16
14
  """
17
15
 
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- ):
16
+ def __init__(
17
+ self,
18
+ file_path: Union[str, list],
19
+ ):
21
20
  super().__init__()
22
21
  self.file_path = file_path
23
22
 
@@ -33,7 +32,7 @@ class MarkdownParser(BaseLife):
33
32
  str: Content of the markdown file
34
33
  """
35
34
  try:
36
- with open(file_path, 'r', encoding='utf-8') as f:
35
+ with open(file_path, "r", encoding="utf-8") as f:
37
36
  return f.read()
38
37
  except Exception as e:
39
38
  logger.error(f"Error reading markdown file {file_path}: {e}")
@@ -50,24 +49,44 @@ class MarkdownParser(BaseLife):
50
49
  MarkdownOutputVo: Structured output containing the markdown content
51
50
  """
52
51
  try:
53
- title = os.path.splitext(os.path.basename(file_path))[0]
52
+ extension = self.get_file_extension(file_path)
53
+
54
+ # 1) 生成“开始处理”生命周期
55
+ start_lc = self.generate_lifecycle(
56
+ source_file=file_path,
57
+ domain="Technology",
58
+ usage_purpose="Documentation",
59
+ life_type=LifeType.DATA_PROCESSING,
60
+ )
54
61
 
55
- # Read markdown content
62
+ # 2) 读取 Markdown 内容
56
63
  md_content = self.read_markdown_file(file_path)
57
64
 
58
- # Generate lifecycle metadata
59
- lifecycle = self.generate_lifecycle(
65
+ # 3) 创建输出 VO,并添加开始事件
66
+ output_vo = MarkdownOutputVo(extension, md_content)
67
+ output_vo.add_lifecycle(start_lc)
68
+
69
+ # 4) 生成“处理完成”生命周期
70
+ end_lc = self.generate_lifecycle(
60
71
  source_file=file_path,
61
72
  domain="Technology",
62
73
  usage_purpose="Documentation",
63
- life_type="LLM_ORIGIN"
74
+ life_type=LifeType.DATA_PROCESSED,
64
75
  )
76
+ output_vo.add_lifecycle(end_lc)
65
77
 
66
- # Create and return output VO
67
- output_vo = MarkdownOutputVo(title, md_content)
68
- output_vo.add_lifecycle(lifecycle)
69
78
  return output_vo.to_dict()
70
79
 
71
80
  except Exception as e:
72
- logger.error(f"Failed to parse markdown file {file_path}: {e}")
73
- raise
81
+ loguru.logger.error(f"Failed to parse markdown file {file_path}: {e}")
82
+ # (可选)记录一次失败生命周期
83
+ fail_lc = self.generate_lifecycle(
84
+ source_file=file_path,
85
+ domain="Technology",
86
+ usage_purpose="Documentation",
87
+ life_type=LifeType.DATA_PROCESS_FAILED,
88
+ )
89
+ # 如果想在失败时也返回 VO,可以这样做:
90
+ # output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
91
+ # output_vo.add_lifecycle(fail_lc)
92
+ raise
@@ -1,24 +1,22 @@
1
1
  import os
2
- import pathlib
3
- import sys
4
2
  import subprocess
5
3
  from typing import Union
6
4
 
7
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
8
- sys.path.insert(0, str(ROOT_DIR))
9
- from datamax.parser.base import BaseLife
10
- from datamax.parser.base import MarkdownOutputVo
11
5
  from langchain_community.document_loaders import PyMuPDFLoader
12
6
  from loguru import logger
7
+
8
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
9
+ from datamax.utils.lifecycle_types import LifeType
13
10
  from datamax.utils.mineru_operator import pdf_processor
14
- import os
11
+
15
12
 
16
13
  class PdfParser(BaseLife):
17
14
 
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- use_mineru: bool = False,
21
- ):
15
+ def __init__(
16
+ self,
17
+ file_path: Union[str, list],
18
+ use_mineru: bool = False,
19
+ ):
22
20
  super().__init__()
23
21
 
24
22
  self.file_path = file_path
@@ -27,17 +25,25 @@ class PdfParser(BaseLife):
27
25
  def mineru_process(self, input_pdf_filename, output_dir):
28
26
  proc = None
29
27
  try:
30
- logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
31
- command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
32
- proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
28
+ logger.info(
29
+ f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!"
30
+ )
31
+ command = ["magic-pdf", "-p", input_pdf_filename, "-o", output_dir]
32
+ proc = subprocess.Popen(
33
+ command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
34
+ )
33
35
 
34
36
  # 等待命令执行完成
35
37
  stdout, stderr = proc.communicate()
36
38
  # 检查命令是否成功执行
37
39
  if proc.returncode != 0:
38
- raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
40
+ raise Exception(
41
+ f"mineru failed with return code {proc.returncode}: {stderr.decode()}"
42
+ )
39
43
 
40
- logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
44
+ logger.info(
45
+ f"Markdown saved in {output_dir}, input file is {input_pdf_filename}"
46
+ )
41
47
 
42
48
  except Exception as e:
43
49
  logger.error(f"Error: {e}")
@@ -53,14 +59,16 @@ class PdfParser(BaseLife):
53
59
  if proc.poll() is None:
54
60
  proc.kill()
55
61
  proc.wait()
56
- logger.info("The process was terminated due to timeout or completion.")
62
+ logger.info(
63
+ "The process was terminated due to timeout or completion."
64
+ )
57
65
 
58
66
  @staticmethod
59
67
  def read_pdf_file(file_path) -> str:
60
68
  try:
61
69
  pdf_loader = PyMuPDFLoader(file_path)
62
70
  pdf_documents = pdf_loader.load()
63
- result_text = ''
71
+ result_text = ""
64
72
  for page in pdf_documents:
65
73
  result_text += page.page_content
66
74
  return result_text
@@ -68,34 +76,66 @@ class PdfParser(BaseLife):
68
76
  raise e
69
77
 
70
78
  def parse(self, file_path: str) -> MarkdownOutputVo:
79
+
80
+ lc_start = self.generate_lifecycle(
81
+ source_file=file_path,
82
+ domain="Technology",
83
+ usage_purpose="Documentation",
84
+ life_type=LifeType.DATA_PROCESSING,
85
+ )
86
+ logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
71
87
  try:
72
- title = os.path.splitext(os.path.basename(file_path))[0]
88
+ extension = self.get_file_extension(file_path)
73
89
 
74
90
  if self.use_mineru:
75
- output_dir = 'uploaded_files'
91
+ output_dir = "uploaded_files"
76
92
  output_folder_name = os.path.basename(file_path).replace(".pdf", "")
77
93
  # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
78
94
  # if os.path.exists(output_mineru):
79
95
  # pass
80
96
  # else:
81
- # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
97
+ # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
82
98
  # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
83
99
 
84
100
  # todo: 是否有必要跟api的默认保存路径保持一致
85
- output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
101
+ output_mineru = f"{output_dir}/markdown/{output_folder_name}.md"
86
102
 
87
103
  if os.path.exists(output_mineru):
88
- mk_content = open(output_mineru, 'r', encoding='utf-8').read()
104
+ mk_content = open(output_mineru, "r", encoding="utf-8").read()
89
105
  else:
90
106
  mk_content = pdf_processor.process_pdf(file_path)
91
107
  else:
92
108
  content = self.read_pdf_file(file_path=file_path)
93
109
  mk_content = content
94
110
 
95
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
96
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
97
- output_vo = MarkdownOutputVo(title, mk_content)
98
- output_vo.add_lifecycle(lifecycle)
111
+ # —— 生命周期:处理完成 —— #
112
+ lc_end = self.generate_lifecycle(
113
+ source_file=file_path,
114
+ domain="Technology",
115
+ usage_purpose="Documentation",
116
+ life_type=LifeType.DATA_PROCESSED,
117
+ )
118
+ logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
119
+
120
+ output_vo = MarkdownOutputVo(extension, mk_content)
121
+ output_vo.add_lifecycle(lc_start)
122
+ output_vo.add_lifecycle(lc_end)
99
123
  return output_vo.to_dict()
100
- except Exception:
101
- raise
124
+
125
+ except Exception as e:
126
+ # —— 生命周期:处理失败 —— #
127
+ lc_fail = self.generate_lifecycle(
128
+ source_file=file_path,
129
+ domain="Technology",
130
+ usage_purpose="Documentation",
131
+ life_type=LifeType.DATA_PROCESS_FAILED,
132
+ )
133
+ logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
134
+
135
+ raise Exception(
136
+ {
137
+ "error": str(e),
138
+ "file_path": file_path,
139
+ "lifecycle": [lc_fail.to_dict()],
140
+ }
141
+ )
@@ -6,8 +6,10 @@ from pathlib import Path
6
6
  from typing import Union
7
7
 
8
8
  import chardet
9
+ from loguru import logger
9
10
 
10
11
  from datamax.parser.base import BaseLife, MarkdownOutputVo
12
+ from datamax.utils.lifecycle_types import LifeType
11
13
  from datamax.utils.ppt_extract import PPtExtractor
12
14
 
13
15
  # 尝试导入UNO处理器
@@ -17,7 +19,7 @@ except ImportError:
17
19
  HAS_UNO = False
18
20
 
19
21
 
20
- class PPtParser(BaseLife):
22
+ class PptParser(BaseLife):
21
23
  def __init__(self, file_path: Union[str, list], use_uno: bool = None):
22
24
  super().__init__()
23
25
  self.file_path = file_path
@@ -106,19 +108,49 @@ class PPtParser(BaseLife):
106
108
  raise
107
109
 
108
110
  def parse(self, file_path: str) -> MarkdownOutputVo:
111
+ # —— 生命周期:开始处理 PPT —— #
112
+ lc_start = self.generate_lifecycle(
113
+ source_file=file_path,
114
+ domain="Technology",
115
+ usage_purpose="Documentation",
116
+ life_type=LifeType.DATA_PROCESSING,
117
+ )
118
+ logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
119
+
109
120
  try:
110
- title = os.path.splitext(os.path.basename(file_path))[0]
121
+ extension = self.get_file_extension(file_path)
111
122
  content = self.read_ppt_file(file_path=file_path)
112
- # clean_text = clean_original_text(content)
113
123
  mk_content = content
114
- lifecycle = self.generate_lifecycle(
124
+
125
+ # —— 生命周期:处理完成 —— #
126
+ lc_end = self.generate_lifecycle(
115
127
  source_file=file_path,
116
128
  domain="Technology",
117
129
  usage_purpose="Documentation",
118
- life_type="LLM_ORIGIN",
130
+ life_type=LifeType.DATA_PROCESSED,
119
131
  )
120
- output_vo = MarkdownOutputVo(title, mk_content)
121
- output_vo.add_lifecycle(lifecycle)
132
+ logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
133
+
134
+ output_vo = MarkdownOutputVo(extension, mk_content)
135
+ output_vo.add_lifecycle(lc_start)
136
+ output_vo.add_lifecycle(lc_end)
122
137
  return output_vo.to_dict()
123
- except Exception:
124
- raise
138
+
139
+ except Exception as e:
140
+ # —— 生命周期:处理失败 —— #
141
+ lc_fail = self.generate_lifecycle(
142
+ source_file=file_path,
143
+ domain="Technology",
144
+ usage_purpose="Documentation",
145
+ life_type=LifeType.DATA_PROCESS_FAILED,
146
+ )
147
+ logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
148
+
149
+ # 返回包含失败生命周期的异常信息
150
+ raise Exception(
151
+ {
152
+ "error": str(e),
153
+ "file_path": file_path,
154
+ "lifecycle": [lc_fail.to_dict()],
155
+ }
156
+ )