pydatamax 0.1.16__py3-none-any.whl → 0.1.16.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/loader/core.py +67 -42
- datamax/loader/minio_handler.py +38 -19
- datamax/parser/__init__.py +2 -1
- datamax/parser/base.py +46 -22
- datamax/parser/core.py +215 -126
- datamax/parser/csv_parser.py +25 -5
- datamax/parser/doc_parser.py +230 -141
- datamax/parser/docx_parser.py +275 -186
- datamax/parser/epub_parser.py +49 -13
- datamax/parser/html_parser.py +36 -16
- datamax/parser/image_parser.py +52 -14
- datamax/parser/json_parser.py +26 -5
- datamax/parser/md_parser.py +40 -21
- datamax/parser/pdf_parser.py +69 -29
- datamax/parser/ppt_parser.py +41 -9
- datamax/parser/pptx_parser.py +49 -21
- datamax/parser/txt_parser.py +45 -14
- datamax/parser/xls_parser.py +34 -6
- datamax/parser/xlsx_parser.py +58 -51
- datamax/utils/__init__.py +2 -1
- datamax/utils/data_cleaner.py +36 -22
- datamax/utils/env_setup.py +25 -18
- datamax/utils/gotocr_pdf.py +13 -13
- datamax/utils/lifecycle_types.py +18 -0
- datamax/utils/mineru_operator.py +17 -15
- datamax/utils/paddleocr_pdf_operator.py +34 -19
- datamax/utils/ppt_extract.py +34 -11
- datamax/utils/qa_generator.py +332 -44
- datamax/utils/tokenizer.py +10 -9
- datamax/utils/uno_handler.py +91 -68
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/METADATA +54 -2
- pydatamax-0.1.16.post2.dist-info/RECORD +39 -0
- pydatamax-0.1.16.dist-info/RECORD +0 -38
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/WHEEL +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/licenses/LICENSE +0 -0
- {pydatamax-0.1.16.dist-info → pydatamax-0.1.16.post2.dist-info}/top_level.txt +0 -0
datamax/parser/epub_parser.py
CHANGED
@@ -1,10 +1,14 @@
|
|
1
|
-
import
|
1
|
+
import os
|
2
2
|
from typing import Union
|
3
|
+
|
4
|
+
import ebooklib
|
5
|
+
import loguru
|
3
6
|
from bs4 import BeautifulSoup
|
4
7
|
from ebooklib import epub
|
5
|
-
|
6
|
-
from datamax.parser.base import MarkdownOutputVo
|
7
|
-
import
|
8
|
+
|
9
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
10
|
+
from datamax.utils.lifecycle_types import LifeType
|
11
|
+
|
8
12
|
|
9
13
|
class EpubParser(BaseLife):
|
10
14
|
def __init__(self, file_path: Union[str, list]):
|
@@ -18,10 +22,10 @@ class EpubParser(BaseLife):
|
|
18
22
|
content = ""
|
19
23
|
for item in book.get_items():
|
20
24
|
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
21
|
-
chapter_content = item.get_content().decode(
|
22
|
-
soup = BeautifulSoup(chapter_content,
|
25
|
+
chapter_content = item.get_content().decode("utf-8")
|
26
|
+
soup = BeautifulSoup(chapter_content, "html.parser")
|
23
27
|
text = soup.get_text()
|
24
|
-
text = text.replace(
|
28
|
+
text = text.replace("\u3000", " ")
|
25
29
|
content += text
|
26
30
|
return content
|
27
31
|
except Exception as e:
|
@@ -29,13 +33,45 @@ class EpubParser(BaseLife):
|
|
29
33
|
|
30
34
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
31
35
|
try:
|
32
|
-
|
36
|
+
extension = self.get_file_extension(file_path)
|
37
|
+
|
38
|
+
# 1) 开始处理
|
39
|
+
start_lc = self.generate_lifecycle(
|
40
|
+
source_file=file_path,
|
41
|
+
domain="Technology",
|
42
|
+
usage_purpose="Documentation",
|
43
|
+
life_type=LifeType.DATA_PROCESSING,
|
44
|
+
)
|
45
|
+
|
46
|
+
# 2) 读取EPUB内容
|
33
47
|
content = self.read_epub_file(file_path=file_path)
|
34
48
|
mk_content = content
|
35
|
-
|
36
|
-
|
37
|
-
output_vo = MarkdownOutputVo(
|
38
|
-
output_vo.add_lifecycle(
|
49
|
+
|
50
|
+
# 3) 创建输出 VO 并添加开始事件
|
51
|
+
output_vo = MarkdownOutputVo(extension, mk_content)
|
52
|
+
output_vo.add_lifecycle(start_lc)
|
53
|
+
|
54
|
+
# 4) 处理完成
|
55
|
+
end_lc = self.generate_lifecycle(
|
56
|
+
source_file=file_path,
|
57
|
+
domain="Technology",
|
58
|
+
usage_purpose="Documentation",
|
59
|
+
life_type=LifeType.DATA_PROCESSED,
|
60
|
+
)
|
61
|
+
output_vo.add_lifecycle(end_lc)
|
62
|
+
|
39
63
|
return output_vo.to_dict()
|
64
|
+
|
40
65
|
except Exception as e:
|
41
|
-
|
66
|
+
loguru.logger.error(f"Failed to parse epub file {file_path}: {e}")
|
67
|
+
# 失败时记录一次失败生命周期(可选)
|
68
|
+
fail_lc = self.generate_lifecycle(
|
69
|
+
source_file=file_path,
|
70
|
+
domain="Technology",
|
71
|
+
usage_purpose="Documentation",
|
72
|
+
life_type=LifeType.DATA_PROCESS_FAILED,
|
73
|
+
)
|
74
|
+
# 若需返回 VO:
|
75
|
+
# output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
|
76
|
+
# output_vo.add_lifecycle(fail_lc)
|
77
|
+
raise
|
datamax/parser/html_parser.py
CHANGED
@@ -1,13 +1,10 @@
|
|
1
1
|
from typing import Union
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
2
|
|
5
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
-
sys.path.insert(0, str(ROOT_DIR))
|
7
|
-
from datamax.parser.base import BaseLife
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
3
|
from bs4 import BeautifulSoup
|
10
|
-
|
4
|
+
|
5
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
6
|
+
from datamax.utils.lifecycle_types import LifeType
|
7
|
+
|
11
8
|
|
12
9
|
class HtmlParser(BaseLife):
|
13
10
|
def __init__(self, file_path: Union[str, list]):
|
@@ -17,22 +14,45 @@ class HtmlParser(BaseLife):
|
|
17
14
|
@staticmethod
|
18
15
|
def read_html_file(file_path: str) -> str:
|
19
16
|
try:
|
20
|
-
with open(file_path,
|
17
|
+
with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
|
21
18
|
data = f.read()
|
22
|
-
soup = BeautifulSoup(data,
|
23
|
-
return soup.get_text(separator=
|
19
|
+
soup = BeautifulSoup(data, "html.parser")
|
20
|
+
return soup.get_text(separator="\n", strip=True)
|
24
21
|
except Exception:
|
25
22
|
raise
|
26
23
|
|
27
24
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
28
25
|
try:
|
29
|
-
|
26
|
+
# 1) 提取扩展名并生成“处理开始”事件
|
27
|
+
extension = self.get_file_extension(file_path)
|
28
|
+
lc_start = self.generate_lifecycle(
|
29
|
+
source_file=file_path,
|
30
|
+
domain="Technology",
|
31
|
+
life_type=LifeType.DATA_PROCESSING,
|
32
|
+
usage_purpose="Parsing",
|
33
|
+
)
|
34
|
+
|
35
|
+
# 2) 核心解析
|
30
36
|
content = self.read_html_file(file_path=file_path)
|
31
37
|
mk_content = content
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
38
|
+
|
39
|
+
# 3) 根据内容生成“处理完成”或“处理失败”事件
|
40
|
+
lc_end = self.generate_lifecycle(
|
41
|
+
source_file=file_path,
|
42
|
+
domain="Technology",
|
43
|
+
life_type=(
|
44
|
+
LifeType.DATA_PROCESSED
|
45
|
+
if mk_content.strip()
|
46
|
+
else LifeType.DATA_PROCESS_FAILED
|
47
|
+
),
|
48
|
+
usage_purpose="Parsing",
|
49
|
+
)
|
50
|
+
|
51
|
+
# 4) 封装输出并添加生命周期
|
52
|
+
output_vo = MarkdownOutputVo(extension, mk_content)
|
53
|
+
output_vo.add_lifecycle(lc_start)
|
54
|
+
output_vo.add_lifecycle(lc_end)
|
36
55
|
return output_vo.to_dict()
|
56
|
+
|
37
57
|
except Exception:
|
38
|
-
raise
|
58
|
+
raise
|
datamax/parser/image_parser.py
CHANGED
@@ -1,34 +1,72 @@
|
|
1
1
|
import os
|
2
2
|
import pathlib
|
3
3
|
import sys
|
4
|
+
|
4
5
|
from datamax.utils import setup_environment
|
5
6
|
|
6
7
|
setup_environment(use_gpu=True)
|
7
|
-
os.environ[
|
8
|
-
|
8
|
+
os.environ["KMP_DUPLICATE_LIB_OK"] = "True"
|
9
|
+
|
9
10
|
|
10
11
|
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
11
12
|
sys.path.insert(0, str(ROOT_DIR))
|
13
|
+
from PIL import Image
|
14
|
+
|
12
15
|
from datamax.parser.base import BaseLife
|
13
16
|
from datamax.parser.pdf_parser import PdfParser
|
14
|
-
from
|
17
|
+
from datamax.utils.lifecycle_types import LifeType
|
18
|
+
|
15
19
|
|
16
20
|
class ImageParser(BaseLife):
|
17
|
-
def __init__(self,file_path: str):
|
21
|
+
def __init__(self, file_path: str):
|
18
22
|
super().__init__()
|
19
23
|
self.file_path = file_path
|
20
24
|
|
21
|
-
def parse(self, file_path: str)
|
25
|
+
def parse(self, file_path: str):
|
22
26
|
try:
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
+
# 1) 处理开始:生成 DATA_PROCESSING 事件
|
28
|
+
extension = self.get_file_extension(file_path)
|
29
|
+
lc_start = self.generate_lifecycle(
|
30
|
+
source_file=file_path,
|
31
|
+
domain="Technology",
|
32
|
+
life_type=LifeType.DATA_PROCESSING,
|
33
|
+
usage_purpose="Parsing",
|
34
|
+
)
|
35
|
+
# 【1】改用 pathlib.Path.stem 获取“基础名”
|
36
|
+
base_name = pathlib.Path(file_path).stem
|
37
|
+
output_pdf_path = f"{base_name}.pdf"
|
38
|
+
|
39
|
+
# 转换图片为 PDF
|
40
|
+
img = Image.open(file_path)
|
41
|
+
img.save(output_pdf_path, "PDF", resolution=100.0)
|
42
|
+
|
43
|
+
# 委托 PdfParser 解析,传入扩展名已由 PdfParser 内部获取
|
27
44
|
pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
|
28
|
-
|
45
|
+
result = pdf_parser.parse(output_pdf_path)
|
46
|
+
|
47
|
+
# 清理临时文件
|
29
48
|
if os.path.exists(output_pdf_path):
|
30
|
-
# shutil.rmtree(f'uploaded_files/markdown')
|
31
49
|
os.remove(output_pdf_path)
|
32
|
-
|
33
|
-
|
34
|
-
|
50
|
+
# 2) 处理结束:根据内容是否非空生成 DATA_PROCESSED 或 DATA_PROCESS_FAILED
|
51
|
+
content = result.get("content", "")
|
52
|
+
lc_end = self.generate_lifecycle(
|
53
|
+
source_file=file_path,
|
54
|
+
domain="Technology",
|
55
|
+
life_type=(
|
56
|
+
LifeType.DATA_PROCESSED
|
57
|
+
if content.strip()
|
58
|
+
else LifeType.DATA_PROCESS_FAILED
|
59
|
+
),
|
60
|
+
usage_purpose="Parsing",
|
61
|
+
)
|
62
|
+
|
63
|
+
# 3) 合并生命周期:先插入 start,再追加 end
|
64
|
+
lifecycle = result.get("lifecycle", [])
|
65
|
+
lifecycle.insert(0, lc_start.to_dict())
|
66
|
+
lifecycle.append(lc_end.to_dict())
|
67
|
+
result["lifecycle"] = lifecycle
|
68
|
+
|
69
|
+
return result
|
70
|
+
|
71
|
+
except Exception:
|
72
|
+
raise
|
datamax/parser/json_parser.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
import json
|
2
2
|
|
3
3
|
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
4
|
+
from datamax.utils.lifecycle_types import LifeType
|
4
5
|
|
5
6
|
|
6
7
|
class JsonParser(BaseLife):
|
@@ -18,15 +19,35 @@ class JsonParser(BaseLife):
|
|
18
19
|
|
19
20
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
20
21
|
try:
|
22
|
+
# 1) 处理开始:DATA_PROCESSING
|
23
|
+
extension = self.get_file_extension(file_path)
|
24
|
+
lc_start = self.generate_lifecycle(
|
25
|
+
source_file=file_path,
|
26
|
+
domain="Technology",
|
27
|
+
life_type=LifeType.DATA_PROCESSING,
|
28
|
+
usage_purpose="Parsing",
|
29
|
+
)
|
30
|
+
|
31
|
+
# 2) 核心解析:读取并格式化 JSON
|
21
32
|
content = self.read_json_file(file_path)
|
22
|
-
|
33
|
+
|
34
|
+
# 3) 处理结束:DATA_PROCESSED 或 DATA_PROCESS_FAILED
|
35
|
+
lc_end = self.generate_lifecycle(
|
23
36
|
source_file=file_path,
|
24
37
|
domain="Technology",
|
25
|
-
|
26
|
-
|
38
|
+
life_type=(
|
39
|
+
LifeType.DATA_PROCESSED
|
40
|
+
if content.strip()
|
41
|
+
else LifeType.DATA_PROCESS_FAILED
|
42
|
+
),
|
43
|
+
usage_purpose="Parsing",
|
27
44
|
)
|
28
|
-
|
29
|
-
|
45
|
+
|
46
|
+
# 4) 封装输出并添加这两条生命周期
|
47
|
+
output_vo = MarkdownOutputVo(extension, content)
|
48
|
+
output_vo.add_lifecycle(lc_start)
|
49
|
+
output_vo.add_lifecycle(lc_end)
|
30
50
|
return output_vo.to_dict()
|
51
|
+
|
31
52
|
except Exception as e:
|
32
53
|
raise e
|
datamax/parser/md_parser.py
CHANGED
@@ -1,13 +1,11 @@
|
|
1
|
-
import pathlib
|
2
|
-
import sys
|
3
1
|
from typing import Union
|
4
2
|
|
5
|
-
|
6
|
-
sys.path.insert(0, str(ROOT_DIR))
|
7
|
-
from datamax.parser.base import BaseLife
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
3
|
+
import loguru
|
9
4
|
from loguru import logger
|
10
|
-
|
5
|
+
|
6
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
7
|
+
from datamax.utils.lifecycle_types import LifeType
|
8
|
+
|
11
9
|
|
12
10
|
class MarkdownParser(BaseLife):
|
13
11
|
"""
|
@@ -15,9 +13,10 @@ class MarkdownParser(BaseLife):
|
|
15
13
|
Handles .md and .markdown file extensions.
|
16
14
|
"""
|
17
15
|
|
18
|
-
def __init__(
|
19
|
-
|
20
|
-
|
16
|
+
def __init__(
|
17
|
+
self,
|
18
|
+
file_path: Union[str, list],
|
19
|
+
):
|
21
20
|
super().__init__()
|
22
21
|
self.file_path = file_path
|
23
22
|
|
@@ -33,7 +32,7 @@ class MarkdownParser(BaseLife):
|
|
33
32
|
str: Content of the markdown file
|
34
33
|
"""
|
35
34
|
try:
|
36
|
-
with open(file_path,
|
35
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
37
36
|
return f.read()
|
38
37
|
except Exception as e:
|
39
38
|
logger.error(f"Error reading markdown file {file_path}: {e}")
|
@@ -50,24 +49,44 @@ class MarkdownParser(BaseLife):
|
|
50
49
|
MarkdownOutputVo: Structured output containing the markdown content
|
51
50
|
"""
|
52
51
|
try:
|
53
|
-
|
52
|
+
extension = self.get_file_extension(file_path)
|
53
|
+
|
54
|
+
# 1) 生成“开始处理”生命周期
|
55
|
+
start_lc = self.generate_lifecycle(
|
56
|
+
source_file=file_path,
|
57
|
+
domain="Technology",
|
58
|
+
usage_purpose="Documentation",
|
59
|
+
life_type=LifeType.DATA_PROCESSING,
|
60
|
+
)
|
54
61
|
|
55
|
-
#
|
62
|
+
# 2) 读取 Markdown 内容
|
56
63
|
md_content = self.read_markdown_file(file_path)
|
57
64
|
|
58
|
-
#
|
59
|
-
|
65
|
+
# 3) 创建输出 VO,并添加开始事件
|
66
|
+
output_vo = MarkdownOutputVo(extension, md_content)
|
67
|
+
output_vo.add_lifecycle(start_lc)
|
68
|
+
|
69
|
+
# 4) 生成“处理完成”生命周期
|
70
|
+
end_lc = self.generate_lifecycle(
|
60
71
|
source_file=file_path,
|
61
72
|
domain="Technology",
|
62
73
|
usage_purpose="Documentation",
|
63
|
-
life_type=
|
74
|
+
life_type=LifeType.DATA_PROCESSED,
|
64
75
|
)
|
76
|
+
output_vo.add_lifecycle(end_lc)
|
65
77
|
|
66
|
-
# Create and return output VO
|
67
|
-
output_vo = MarkdownOutputVo(title, md_content)
|
68
|
-
output_vo.add_lifecycle(lifecycle)
|
69
78
|
return output_vo.to_dict()
|
70
79
|
|
71
80
|
except Exception as e:
|
72
|
-
logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
73
|
-
|
81
|
+
loguru.logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
82
|
+
# (可选)记录一次失败生命周期
|
83
|
+
fail_lc = self.generate_lifecycle(
|
84
|
+
source_file=file_path,
|
85
|
+
domain="Technology",
|
86
|
+
usage_purpose="Documentation",
|
87
|
+
life_type=LifeType.DATA_PROCESS_FAILED,
|
88
|
+
)
|
89
|
+
# 如果想在失败时也返回 VO,可以这样做:
|
90
|
+
# output_vo = MarkdownOutputVo(self.get_file_extension(file_path), "")
|
91
|
+
# output_vo.add_lifecycle(fail_lc)
|
92
|
+
raise
|
datamax/parser/pdf_parser.py
CHANGED
@@ -1,24 +1,22 @@
|
|
1
1
|
import os
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
2
|
import subprocess
|
5
3
|
from typing import Union
|
6
4
|
|
7
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
8
|
-
sys.path.insert(0, str(ROOT_DIR))
|
9
|
-
from datamax.parser.base import BaseLife
|
10
|
-
from datamax.parser.base import MarkdownOutputVo
|
11
5
|
from langchain_community.document_loaders import PyMuPDFLoader
|
12
6
|
from loguru import logger
|
7
|
+
|
8
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
9
|
+
from datamax.utils.lifecycle_types import LifeType
|
13
10
|
from datamax.utils.mineru_operator import pdf_processor
|
14
|
-
|
11
|
+
|
15
12
|
|
16
13
|
class PdfParser(BaseLife):
|
17
14
|
|
18
|
-
def __init__(
|
19
|
-
|
20
|
-
|
21
|
-
|
15
|
+
def __init__(
|
16
|
+
self,
|
17
|
+
file_path: Union[str, list],
|
18
|
+
use_mineru: bool = False,
|
19
|
+
):
|
22
20
|
super().__init__()
|
23
21
|
|
24
22
|
self.file_path = file_path
|
@@ -27,17 +25,25 @@ class PdfParser(BaseLife):
|
|
27
25
|
def mineru_process(self, input_pdf_filename, output_dir):
|
28
26
|
proc = None
|
29
27
|
try:
|
30
|
-
logger.info(
|
31
|
-
|
32
|
-
|
28
|
+
logger.info(
|
29
|
+
f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!"
|
30
|
+
)
|
31
|
+
command = ["magic-pdf", "-p", input_pdf_filename, "-o", output_dir]
|
32
|
+
proc = subprocess.Popen(
|
33
|
+
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
34
|
+
)
|
33
35
|
|
34
36
|
# 等待命令执行完成
|
35
37
|
stdout, stderr = proc.communicate()
|
36
38
|
# 检查命令是否成功执行
|
37
39
|
if proc.returncode != 0:
|
38
|
-
raise Exception(
|
40
|
+
raise Exception(
|
41
|
+
f"mineru failed with return code {proc.returncode}: {stderr.decode()}"
|
42
|
+
)
|
39
43
|
|
40
|
-
logger.info(
|
44
|
+
logger.info(
|
45
|
+
f"Markdown saved in {output_dir}, input file is {input_pdf_filename}"
|
46
|
+
)
|
41
47
|
|
42
48
|
except Exception as e:
|
43
49
|
logger.error(f"Error: {e}")
|
@@ -53,14 +59,16 @@ class PdfParser(BaseLife):
|
|
53
59
|
if proc.poll() is None:
|
54
60
|
proc.kill()
|
55
61
|
proc.wait()
|
56
|
-
logger.info(
|
62
|
+
logger.info(
|
63
|
+
"The process was terminated due to timeout or completion."
|
64
|
+
)
|
57
65
|
|
58
66
|
@staticmethod
|
59
67
|
def read_pdf_file(file_path) -> str:
|
60
68
|
try:
|
61
69
|
pdf_loader = PyMuPDFLoader(file_path)
|
62
70
|
pdf_documents = pdf_loader.load()
|
63
|
-
result_text =
|
71
|
+
result_text = ""
|
64
72
|
for page in pdf_documents:
|
65
73
|
result_text += page.page_content
|
66
74
|
return result_text
|
@@ -68,34 +76,66 @@ class PdfParser(BaseLife):
|
|
68
76
|
raise e
|
69
77
|
|
70
78
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
79
|
+
|
80
|
+
lc_start = self.generate_lifecycle(
|
81
|
+
source_file=file_path,
|
82
|
+
domain="Technology",
|
83
|
+
usage_purpose="Documentation",
|
84
|
+
life_type=LifeType.DATA_PROCESSING,
|
85
|
+
)
|
86
|
+
logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
|
71
87
|
try:
|
72
|
-
|
88
|
+
extension = self.get_file_extension(file_path)
|
73
89
|
|
74
90
|
if self.use_mineru:
|
75
|
-
output_dir =
|
91
|
+
output_dir = "uploaded_files"
|
76
92
|
output_folder_name = os.path.basename(file_path).replace(".pdf", "")
|
77
93
|
# output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
|
78
94
|
# if os.path.exists(output_mineru):
|
79
95
|
# pass
|
80
96
|
# else:
|
81
|
-
|
97
|
+
# self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
|
82
98
|
# mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
83
99
|
|
84
100
|
# todo: 是否有必要跟api的默认保存路径保持一致
|
85
|
-
output_mineru = f
|
101
|
+
output_mineru = f"{output_dir}/markdown/{output_folder_name}.md"
|
86
102
|
|
87
103
|
if os.path.exists(output_mineru):
|
88
|
-
mk_content = open(output_mineru,
|
104
|
+
mk_content = open(output_mineru, "r", encoding="utf-8").read()
|
89
105
|
else:
|
90
106
|
mk_content = pdf_processor.process_pdf(file_path)
|
91
107
|
else:
|
92
108
|
content = self.read_pdf_file(file_path=file_path)
|
93
109
|
mk_content = content
|
94
110
|
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
111
|
+
# —— 生命周期:处理完成 —— #
|
112
|
+
lc_end = self.generate_lifecycle(
|
113
|
+
source_file=file_path,
|
114
|
+
domain="Technology",
|
115
|
+
usage_purpose="Documentation",
|
116
|
+
life_type=LifeType.DATA_PROCESSED,
|
117
|
+
)
|
118
|
+
logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
|
119
|
+
|
120
|
+
output_vo = MarkdownOutputVo(extension, mk_content)
|
121
|
+
output_vo.add_lifecycle(lc_start)
|
122
|
+
output_vo.add_lifecycle(lc_end)
|
99
123
|
return output_vo.to_dict()
|
100
|
-
|
101
|
-
|
124
|
+
|
125
|
+
except Exception as e:
|
126
|
+
# —— 生命周期:处理失败 —— #
|
127
|
+
lc_fail = self.generate_lifecycle(
|
128
|
+
source_file=file_path,
|
129
|
+
domain="Technology",
|
130
|
+
usage_purpose="Documentation",
|
131
|
+
life_type=LifeType.DATA_PROCESS_FAILED,
|
132
|
+
)
|
133
|
+
logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
|
134
|
+
|
135
|
+
raise Exception(
|
136
|
+
{
|
137
|
+
"error": str(e),
|
138
|
+
"file_path": file_path,
|
139
|
+
"lifecycle": [lc_fail.to_dict()],
|
140
|
+
}
|
141
|
+
)
|
datamax/parser/ppt_parser.py
CHANGED
@@ -6,8 +6,10 @@ from pathlib import Path
|
|
6
6
|
from typing import Union
|
7
7
|
|
8
8
|
import chardet
|
9
|
+
from loguru import logger
|
9
10
|
|
10
11
|
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
12
|
+
from datamax.utils.lifecycle_types import LifeType
|
11
13
|
from datamax.utils.ppt_extract import PPtExtractor
|
12
14
|
|
13
15
|
# 尝试导入UNO处理器
|
@@ -17,7 +19,7 @@ except ImportError:
|
|
17
19
|
HAS_UNO = False
|
18
20
|
|
19
21
|
|
20
|
-
class
|
22
|
+
class PptParser(BaseLife):
|
21
23
|
def __init__(self, file_path: Union[str, list], use_uno: bool = None):
|
22
24
|
super().__init__()
|
23
25
|
self.file_path = file_path
|
@@ -106,19 +108,49 @@ class PPtParser(BaseLife):
|
|
106
108
|
raise
|
107
109
|
|
108
110
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
111
|
+
# —— 生命周期:开始处理 PPT —— #
|
112
|
+
lc_start = self.generate_lifecycle(
|
113
|
+
source_file=file_path,
|
114
|
+
domain="Technology",
|
115
|
+
usage_purpose="Documentation",
|
116
|
+
life_type=LifeType.DATA_PROCESSING,
|
117
|
+
)
|
118
|
+
logger.debug("⚙️ DATA_PROCESSING 生命周期已生成")
|
119
|
+
|
109
120
|
try:
|
110
|
-
|
121
|
+
extension = self.get_file_extension(file_path)
|
111
122
|
content = self.read_ppt_file(file_path=file_path)
|
112
|
-
# clean_text = clean_original_text(content)
|
113
123
|
mk_content = content
|
114
|
-
|
124
|
+
|
125
|
+
# —— 生命周期:处理完成 —— #
|
126
|
+
lc_end = self.generate_lifecycle(
|
115
127
|
source_file=file_path,
|
116
128
|
domain="Technology",
|
117
129
|
usage_purpose="Documentation",
|
118
|
-
life_type=
|
130
|
+
life_type=LifeType.DATA_PROCESSED,
|
119
131
|
)
|
120
|
-
|
121
|
-
|
132
|
+
logger.debug("⚙️ DATA_PROCESSED 生命周期已生成")
|
133
|
+
|
134
|
+
output_vo = MarkdownOutputVo(extension, mk_content)
|
135
|
+
output_vo.add_lifecycle(lc_start)
|
136
|
+
output_vo.add_lifecycle(lc_end)
|
122
137
|
return output_vo.to_dict()
|
123
|
-
|
124
|
-
|
138
|
+
|
139
|
+
except Exception as e:
|
140
|
+
# —— 生命周期:处理失败 —— #
|
141
|
+
lc_fail = self.generate_lifecycle(
|
142
|
+
source_file=file_path,
|
143
|
+
domain="Technology",
|
144
|
+
usage_purpose="Documentation",
|
145
|
+
life_type=LifeType.DATA_PROCESS_FAILED,
|
146
|
+
)
|
147
|
+
logger.debug("⚙️ DATA_PROCESS_FAILED 生命周期已生成")
|
148
|
+
|
149
|
+
# 返回包含失败生命周期的异常信息
|
150
|
+
raise Exception(
|
151
|
+
{
|
152
|
+
"error": str(e),
|
153
|
+
"file_path": file_path,
|
154
|
+
"lifecycle": [lc_fail.to_dict()],
|
155
|
+
}
|
156
|
+
)
|