pydatamax 0.1.14__py3-none-any.whl → 0.1.15.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/minio_handler.py +171 -171
- datamax/loader/oss_handler.py +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +466 -10
- datamax/parser/docx_parser.py +449 -11
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -215
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/METADATA +117 -5
- pydatamax-0.1.15.post2.dist-info/RECORD +38 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.14.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.post2.dist-info}/WHEEL +0 -0
datamax/parser/image_parser.py
CHANGED
@@ -1,34 +1,34 @@
|
|
1
|
-
import os
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
|
-
from datamax.utils import setup_environment
|
5
|
-
|
6
|
-
setup_environment(use_gpu=True)
|
7
|
-
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
|
-
|
10
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
11
|
-
sys.path.insert(0, str(ROOT_DIR))
|
12
|
-
from datamax.parser.base import BaseLife
|
13
|
-
from datamax.parser.pdf_parser import PdfParser
|
14
|
-
from PIL import Image
|
15
|
-
|
16
|
-
class ImageParser(BaseLife):
|
17
|
-
def __init__(self,file_path: str):
|
18
|
-
super().__init__()
|
19
|
-
self.file_path = file_path
|
20
|
-
|
21
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
22
|
-
try:
|
23
|
-
title =
|
24
|
-
output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
|
25
|
-
image = Image.open(file_path)
|
26
|
-
image.save(output_pdf_path, 'PDF', resolution=100.0)
|
27
|
-
pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
|
28
|
-
output_vo = pdf_parser.parse(output_pdf_path)
|
29
|
-
if os.path.exists(output_pdf_path):
|
30
|
-
# shutil.rmtree(f'uploaded_files/markdown')
|
31
|
-
os.remove(output_pdf_path)
|
32
|
-
return output_vo
|
33
|
-
except Exception as e:
|
34
|
-
raise e
|
1
|
+
import os
|
2
|
+
import pathlib
|
3
|
+
import sys
|
4
|
+
from datamax.utils import setup_environment
|
5
|
+
|
6
|
+
setup_environment(use_gpu=True)
|
7
|
+
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
8
|
+
from datamax.parser.base import MarkdownOutputVo
|
9
|
+
|
10
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
11
|
+
sys.path.insert(0, str(ROOT_DIR))
|
12
|
+
from datamax.parser.base import BaseLife
|
13
|
+
from datamax.parser.pdf_parser import PdfParser
|
14
|
+
from PIL import Image
|
15
|
+
|
16
|
+
class ImageParser(BaseLife):
|
17
|
+
def __init__(self,file_path: str):
|
18
|
+
super().__init__()
|
19
|
+
self.file_path = file_path
|
20
|
+
|
21
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
22
|
+
try:
|
23
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
24
|
+
output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
|
25
|
+
image = Image.open(file_path)
|
26
|
+
image.save(output_pdf_path, 'PDF', resolution=100.0)
|
27
|
+
pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
|
28
|
+
output_vo = pdf_parser.parse(output_pdf_path)
|
29
|
+
if os.path.exists(output_pdf_path):
|
30
|
+
# shutil.rmtree(f'uploaded_files/markdown')
|
31
|
+
os.remove(output_pdf_path)
|
32
|
+
return output_vo
|
33
|
+
except Exception as e:
|
34
|
+
raise e
|
datamax/parser/json_parser.py
CHANGED
@@ -1,10 +1,32 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
1
|
+
import json
|
2
|
+
|
3
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
4
|
+
|
5
|
+
|
6
|
+
class JsonParser(BaseLife):
|
7
|
+
|
8
|
+
def __init__(self, file_path):
|
9
|
+
super().__init__()
|
10
|
+
self.file_path = file_path
|
11
|
+
|
12
|
+
@staticmethod
|
13
|
+
def read_json_file(file_path: str) -> str:
|
14
|
+
"""Read and pretty print a JSON file."""
|
15
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
16
|
+
data = json.load(f)
|
17
|
+
return json.dumps(data, indent=2, ensure_ascii=False)
|
18
|
+
|
19
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
20
|
+
try:
|
21
|
+
content = self.read_json_file(file_path)
|
22
|
+
lifecycle = self.generate_lifecycle(
|
23
|
+
source_file=file_path,
|
24
|
+
domain="Technology",
|
25
|
+
usage_purpose="Documentation",
|
26
|
+
life_type="LLM_ORIGIN",
|
27
|
+
)
|
28
|
+
output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
|
29
|
+
output_vo.add_lifecycle(lifecycle)
|
30
|
+
return output_vo.to_dict()
|
31
|
+
except Exception as e:
|
32
|
+
raise e
|
datamax/parser/md_parser.py
CHANGED
@@ -1,73 +1,73 @@
|
|
1
|
-
import pathlib
|
2
|
-
import sys
|
3
|
-
from typing import Union
|
4
|
-
|
5
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
-
sys.path.insert(0, str(ROOT_DIR))
|
7
|
-
from datamax.parser.base import BaseLife
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
|
-
from loguru import logger
|
10
|
-
|
11
|
-
|
12
|
-
class MarkdownParser(BaseLife):
|
13
|
-
"""
|
14
|
-
Parser for Markdown files that follows the same pattern as PdfParser.
|
15
|
-
Handles .md and .markdown file extensions.
|
16
|
-
"""
|
17
|
-
|
18
|
-
def __init__(self,
|
19
|
-
file_path: Union[str, list],
|
20
|
-
):
|
21
|
-
super().__init__()
|
22
|
-
self.file_path = file_path
|
23
|
-
|
24
|
-
@staticmethod
|
25
|
-
def read_markdown_file(file_path: str) -> str:
|
26
|
-
"""
|
27
|
-
Reads the content of a markdown file.
|
28
|
-
|
29
|
-
Args:
|
30
|
-
file_path: Path to the markdown file
|
31
|
-
|
32
|
-
Returns:
|
33
|
-
str: Content of the markdown file
|
34
|
-
"""
|
35
|
-
try:
|
36
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
37
|
-
return f.read()
|
38
|
-
except Exception as e:
|
39
|
-
logger.error(f"Error reading markdown file {file_path}: {e}")
|
40
|
-
raise
|
41
|
-
|
42
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
43
|
-
"""
|
44
|
-
Parses a markdown file and returns a MarkdownOutputVo.
|
45
|
-
|
46
|
-
Args:
|
47
|
-
file_path: Path to the markdown file
|
48
|
-
|
49
|
-
Returns:
|
50
|
-
MarkdownOutputVo: Structured output containing the markdown content
|
51
|
-
"""
|
52
|
-
try:
|
53
|
-
title =
|
54
|
-
|
55
|
-
# Read markdown content
|
56
|
-
md_content = self.read_markdown_file(file_path)
|
57
|
-
|
58
|
-
# Generate lifecycle metadata
|
59
|
-
lifecycle = self.generate_lifecycle(
|
60
|
-
source_file=file_path,
|
61
|
-
domain="Technology",
|
62
|
-
usage_purpose="Documentation",
|
63
|
-
life_type="LLM_ORIGIN"
|
64
|
-
)
|
65
|
-
|
66
|
-
# Create and return output VO
|
67
|
-
output_vo = MarkdownOutputVo(title, md_content)
|
68
|
-
output_vo.add_lifecycle(lifecycle)
|
69
|
-
return output_vo.to_dict()
|
70
|
-
|
71
|
-
except Exception as e:
|
72
|
-
logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
1
|
+
import pathlib
|
2
|
+
import sys
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
+
sys.path.insert(0, str(ROOT_DIR))
|
7
|
+
from datamax.parser.base import BaseLife
|
8
|
+
from datamax.parser.base import MarkdownOutputVo
|
9
|
+
from loguru import logger
|
10
|
+
import os
|
11
|
+
|
12
|
+
class MarkdownParser(BaseLife):
|
13
|
+
"""
|
14
|
+
Parser for Markdown files that follows the same pattern as PdfParser.
|
15
|
+
Handles .md and .markdown file extensions.
|
16
|
+
"""
|
17
|
+
|
18
|
+
def __init__(self,
|
19
|
+
file_path: Union[str, list],
|
20
|
+
):
|
21
|
+
super().__init__()
|
22
|
+
self.file_path = file_path
|
23
|
+
|
24
|
+
@staticmethod
|
25
|
+
def read_markdown_file(file_path: str) -> str:
|
26
|
+
"""
|
27
|
+
Reads the content of a markdown file.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
file_path: Path to the markdown file
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
str: Content of the markdown file
|
34
|
+
"""
|
35
|
+
try:
|
36
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
37
|
+
return f.read()
|
38
|
+
except Exception as e:
|
39
|
+
logger.error(f"Error reading markdown file {file_path}: {e}")
|
40
|
+
raise
|
41
|
+
|
42
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
43
|
+
"""
|
44
|
+
Parses a markdown file and returns a MarkdownOutputVo.
|
45
|
+
|
46
|
+
Args:
|
47
|
+
file_path: Path to the markdown file
|
48
|
+
|
49
|
+
Returns:
|
50
|
+
MarkdownOutputVo: Structured output containing the markdown content
|
51
|
+
"""
|
52
|
+
try:
|
53
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
54
|
+
|
55
|
+
# Read markdown content
|
56
|
+
md_content = self.read_markdown_file(file_path)
|
57
|
+
|
58
|
+
# Generate lifecycle metadata
|
59
|
+
lifecycle = self.generate_lifecycle(
|
60
|
+
source_file=file_path,
|
61
|
+
domain="Technology",
|
62
|
+
usage_purpose="Documentation",
|
63
|
+
life_type="LLM_ORIGIN"
|
64
|
+
)
|
65
|
+
|
66
|
+
# Create and return output VO
|
67
|
+
output_vo = MarkdownOutputVo(title, md_content)
|
68
|
+
output_vo.add_lifecycle(lifecycle)
|
69
|
+
return output_vo.to_dict()
|
70
|
+
|
71
|
+
except Exception as e:
|
72
|
+
logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
73
73
|
raise
|
datamax/parser/pdf_parser.py
CHANGED
@@ -1,101 +1,101 @@
|
|
1
|
-
import os
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
|
-
import subprocess
|
5
|
-
from typing import Union
|
6
|
-
|
7
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
8
|
-
sys.path.insert(0, str(ROOT_DIR))
|
9
|
-
from datamax.parser.base import BaseLife
|
10
|
-
from datamax.parser.base import MarkdownOutputVo
|
11
|
-
from langchain_community.document_loaders import PyMuPDFLoader
|
12
|
-
from loguru import logger
|
13
|
-
from datamax.utils.mineru_operator import pdf_processor
|
14
|
-
|
15
|
-
|
16
|
-
class PdfParser(BaseLife):
|
17
|
-
|
18
|
-
def __init__(self,
|
19
|
-
file_path: Union[str, list],
|
20
|
-
use_mineru: bool = False,
|
21
|
-
):
|
22
|
-
super().__init__()
|
23
|
-
|
24
|
-
self.file_path = file_path
|
25
|
-
self.use_mineru = use_mineru
|
26
|
-
|
27
|
-
def mineru_process(self, input_pdf_filename, output_dir):
|
28
|
-
proc = None
|
29
|
-
try:
|
30
|
-
logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
|
31
|
-
command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
|
32
|
-
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
33
|
-
|
34
|
-
# 等待命令执行完成
|
35
|
-
stdout, stderr = proc.communicate()
|
36
|
-
# 检查命令是否成功执行
|
37
|
-
if proc.returncode != 0:
|
38
|
-
raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
|
39
|
-
|
40
|
-
logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
|
41
|
-
|
42
|
-
except Exception as e:
|
43
|
-
logger.error(f"Error: {e}")
|
44
|
-
if proc is not None:
|
45
|
-
proc.kill()
|
46
|
-
proc.wait()
|
47
|
-
logger.info("The process was terminated due to an error.")
|
48
|
-
raise # Re-raise the exception to let the caller handle it
|
49
|
-
|
50
|
-
finally:
|
51
|
-
# 确保子进程已经结束
|
52
|
-
if proc is not None:
|
53
|
-
if proc.poll() is None:
|
54
|
-
proc.kill()
|
55
|
-
proc.wait()
|
56
|
-
logger.info("The process was terminated due to timeout or completion.")
|
57
|
-
|
58
|
-
@staticmethod
|
59
|
-
def read_pdf_file(file_path) -> str:
|
60
|
-
try:
|
61
|
-
pdf_loader = PyMuPDFLoader(file_path)
|
62
|
-
pdf_documents = pdf_loader.load()
|
63
|
-
result_text = ''
|
64
|
-
for page in pdf_documents:
|
65
|
-
result_text += page.page_content
|
66
|
-
return result_text
|
67
|
-
except Exception as e:
|
68
|
-
raise e
|
69
|
-
|
70
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
71
|
-
try:
|
72
|
-
title =
|
73
|
-
|
74
|
-
if self.use_mineru:
|
75
|
-
output_dir = 'uploaded_files'
|
76
|
-
output_folder_name = os.path.basename(file_path).replace(".pdf", "")
|
77
|
-
# output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
|
78
|
-
# if os.path.exists(output_mineru):
|
79
|
-
# pass
|
80
|
-
# else:
|
81
|
-
# self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
|
82
|
-
# mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
83
|
-
|
84
|
-
# todo: 是否有必要跟api的默认保存路径保持一致
|
85
|
-
output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
|
86
|
-
|
87
|
-
if os.path.exists(output_mineru):
|
88
|
-
mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
89
|
-
else:
|
90
|
-
mk_content = pdf_processor.process_pdf(file_path)
|
91
|
-
else:
|
92
|
-
content = self.read_pdf_file(file_path=file_path)
|
93
|
-
mk_content = content
|
94
|
-
|
95
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
96
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
97
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
98
|
-
output_vo.add_lifecycle(lifecycle)
|
99
|
-
return output_vo.to_dict()
|
100
|
-
except Exception:
|
101
|
-
raise
|
1
|
+
import os
|
2
|
+
import pathlib
|
3
|
+
import sys
|
4
|
+
import subprocess
|
5
|
+
from typing import Union
|
6
|
+
|
7
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
8
|
+
sys.path.insert(0, str(ROOT_DIR))
|
9
|
+
from datamax.parser.base import BaseLife
|
10
|
+
from datamax.parser.base import MarkdownOutputVo
|
11
|
+
from langchain_community.document_loaders import PyMuPDFLoader
|
12
|
+
from loguru import logger
|
13
|
+
from datamax.utils.mineru_operator import pdf_processor
|
14
|
+
import os
|
15
|
+
|
16
|
+
class PdfParser(BaseLife):
|
17
|
+
|
18
|
+
def __init__(self,
|
19
|
+
file_path: Union[str, list],
|
20
|
+
use_mineru: bool = False,
|
21
|
+
):
|
22
|
+
super().__init__()
|
23
|
+
|
24
|
+
self.file_path = file_path
|
25
|
+
self.use_mineru = use_mineru
|
26
|
+
|
27
|
+
def mineru_process(self, input_pdf_filename, output_dir):
|
28
|
+
proc = None
|
29
|
+
try:
|
30
|
+
logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
|
31
|
+
command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
|
32
|
+
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
33
|
+
|
34
|
+
# 等待命令执行完成
|
35
|
+
stdout, stderr = proc.communicate()
|
36
|
+
# 检查命令是否成功执行
|
37
|
+
if proc.returncode != 0:
|
38
|
+
raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
|
39
|
+
|
40
|
+
logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
|
41
|
+
|
42
|
+
except Exception as e:
|
43
|
+
logger.error(f"Error: {e}")
|
44
|
+
if proc is not None:
|
45
|
+
proc.kill()
|
46
|
+
proc.wait()
|
47
|
+
logger.info("The process was terminated due to an error.")
|
48
|
+
raise # Re-raise the exception to let the caller handle it
|
49
|
+
|
50
|
+
finally:
|
51
|
+
# 确保子进程已经结束
|
52
|
+
if proc is not None:
|
53
|
+
if proc.poll() is None:
|
54
|
+
proc.kill()
|
55
|
+
proc.wait()
|
56
|
+
logger.info("The process was terminated due to timeout or completion.")
|
57
|
+
|
58
|
+
@staticmethod
|
59
|
+
def read_pdf_file(file_path) -> str:
|
60
|
+
try:
|
61
|
+
pdf_loader = PyMuPDFLoader(file_path)
|
62
|
+
pdf_documents = pdf_loader.load()
|
63
|
+
result_text = ''
|
64
|
+
for page in pdf_documents:
|
65
|
+
result_text += page.page_content
|
66
|
+
return result_text
|
67
|
+
except Exception as e:
|
68
|
+
raise e
|
69
|
+
|
70
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
71
|
+
try:
|
72
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
73
|
+
|
74
|
+
if self.use_mineru:
|
75
|
+
output_dir = 'uploaded_files'
|
76
|
+
output_folder_name = os.path.basename(file_path).replace(".pdf", "")
|
77
|
+
# output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
|
78
|
+
# if os.path.exists(output_mineru):
|
79
|
+
# pass
|
80
|
+
# else:
|
81
|
+
# self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
|
82
|
+
# mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
83
|
+
|
84
|
+
# todo: 是否有必要跟api的默认保存路径保持一致
|
85
|
+
output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
|
86
|
+
|
87
|
+
if os.path.exists(output_mineru):
|
88
|
+
mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
89
|
+
else:
|
90
|
+
mk_content = pdf_processor.process_pdf(file_path)
|
91
|
+
else:
|
92
|
+
content = self.read_pdf_file(file_path=file_path)
|
93
|
+
mk_content = content
|
94
|
+
|
95
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
96
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
97
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
98
|
+
output_vo.add_lifecycle(lifecycle)
|
99
|
+
return output_vo.to_dict()
|
100
|
+
except Exception:
|
101
|
+
raise
|
datamax/parser/ppt_parser.py
CHANGED
@@ -1,41 +1,83 @@
|
|
1
1
|
import os
|
2
2
|
import shutil
|
3
|
-
import chardet
|
4
3
|
import subprocess
|
5
4
|
import tempfile
|
6
5
|
from pathlib import Path
|
7
6
|
from typing import Union
|
8
|
-
|
9
|
-
|
7
|
+
|
8
|
+
import chardet
|
9
|
+
|
10
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
10
11
|
from datamax.utils.ppt_extract import PPtExtractor
|
11
12
|
|
13
|
+
# 尝试导入UNO处理器
|
14
|
+
try:
|
15
|
+
from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
|
16
|
+
except ImportError:
|
17
|
+
HAS_UNO = False
|
18
|
+
|
12
19
|
|
13
20
|
class PPtParser(BaseLife):
|
14
|
-
def __init__(self, file_path: Union[str, list]):
|
21
|
+
def __init__(self, file_path: Union[str, list], use_uno: bool = None):
|
15
22
|
super().__init__()
|
16
23
|
self.file_path = file_path
|
17
24
|
|
25
|
+
# 自动检测是否使用UNO(如果未指定)
|
26
|
+
if use_uno is None:
|
27
|
+
self.use_uno = HAS_UNO
|
28
|
+
else:
|
29
|
+
self.use_uno = use_uno and HAS_UNO
|
30
|
+
|
18
31
|
def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str:
|
32
|
+
if self.use_uno:
|
33
|
+
# 使用UNO API进行转换
|
34
|
+
try:
|
35
|
+
pptx_path = convert_with_uno(ppt_path, "pptx", dir_path)
|
36
|
+
|
37
|
+
if not os.path.exists(pptx_path):
|
38
|
+
raise Exception(
|
39
|
+
f"> !!! File conversion failed {ppt_path} ==> {pptx_path}"
|
40
|
+
)
|
41
|
+
else:
|
42
|
+
return pptx_path
|
43
|
+
|
44
|
+
except Exception as e:
|
45
|
+
if (
|
46
|
+
hasattr(self, "_fallback_to_subprocess")
|
47
|
+
and self._fallback_to_subprocess
|
48
|
+
):
|
49
|
+
return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
|
50
|
+
raise
|
51
|
+
else:
|
52
|
+
# 使用传统的subprocess方式
|
53
|
+
return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
|
54
|
+
|
55
|
+
def _ppt_to_pptx_subprocess(self, ppt_path: str, dir_path: str) -> str:
|
56
|
+
"""使用subprocess将.ppt文件转换为.pptx文件(传统方式)"""
|
19
57
|
cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"'
|
20
|
-
process = subprocess.Popen(
|
58
|
+
process = subprocess.Popen(
|
59
|
+
cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
60
|
+
)
|
21
61
|
stdout, stderr = process.communicate()
|
22
62
|
exit_code = process.returncode
|
23
63
|
if exit_code == 0:
|
24
64
|
pass
|
25
65
|
else:
|
26
|
-
encoding = chardet.detect(stderr)[
|
66
|
+
encoding = chardet.detect(stderr)["encoding"]
|
27
67
|
if encoding is None:
|
28
|
-
encoding =
|
29
|
-
raise Exception(
|
68
|
+
encoding = "utf-8"
|
69
|
+
raise Exception(
|
70
|
+
f"Error Output (detected encoding: {encoding}):",
|
71
|
+
stderr.decode(encoding, errors="replace"),
|
72
|
+
)
|
30
73
|
fname = str(Path(ppt_path).stem)
|
31
|
-
pptx_path = os.path.join(os.path.dirname(ppt_path), f
|
74
|
+
pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx")
|
32
75
|
if not os.path.exists(pptx_path):
|
33
76
|
raise Exception(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}")
|
34
77
|
else:
|
35
78
|
return pptx_path
|
36
79
|
|
37
80
|
def read_ppt_file(self, file_path: str):
|
38
|
-
|
39
81
|
try:
|
40
82
|
with tempfile.TemporaryDirectory() as temp_path:
|
41
83
|
temp_dir = Path(temp_path).resolve()
|
@@ -43,17 +85,21 @@ class PPtParser(BaseLife):
|
|
43
85
|
media_dir.mkdir()
|
44
86
|
tmp_file_path = temp_dir / "tmp.ppt"
|
45
87
|
shutil.copy(file_path, tmp_file_path)
|
46
|
-
pptx_file_path = self.ppt_to_pptx(
|
88
|
+
pptx_file_path = self.ppt_to_pptx(
|
89
|
+
ppt_path=str(tmp_file_path), dir_path=temp_path
|
90
|
+
)
|
47
91
|
pptx_extractor = PPtExtractor()
|
48
|
-
pages_list = pptx_extractor.extract(
|
49
|
-
|
92
|
+
pages_list = pptx_extractor.extract(
|
93
|
+
Path(pptx_file_path), "tmp", temp_dir, media_dir, True
|
94
|
+
)
|
95
|
+
contents = ""
|
50
96
|
for index, page in enumerate(pages_list):
|
51
|
-
page_content_list = page[
|
97
|
+
page_content_list = page["content_list"]
|
52
98
|
for content in page_content_list:
|
53
|
-
if content[
|
99
|
+
if content["type"] == "image":
|
54
100
|
pass
|
55
|
-
elif content[
|
56
|
-
data = content[
|
101
|
+
elif content["type"] == "text":
|
102
|
+
data = content["data"]
|
57
103
|
contents += data
|
58
104
|
return contents
|
59
105
|
except Exception:
|
@@ -61,12 +107,16 @@ class PPtParser(BaseLife):
|
|
61
107
|
|
62
108
|
def parse(self, file_path: str) -> MarkdownOutputVo:
|
63
109
|
try:
|
64
|
-
title =
|
110
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
65
111
|
content = self.read_ppt_file(file_path=file_path)
|
66
112
|
# clean_text = clean_original_text(content)
|
67
113
|
mk_content = content
|
68
|
-
lifecycle = self.generate_lifecycle(
|
69
|
-
|
114
|
+
lifecycle = self.generate_lifecycle(
|
115
|
+
source_file=file_path,
|
116
|
+
domain="Technology",
|
117
|
+
usage_purpose="Documentation",
|
118
|
+
life_type="LLM_ORIGIN",
|
119
|
+
)
|
70
120
|
output_vo = MarkdownOutputVo(title, mk_content)
|
71
121
|
output_vo.add_lifecycle(lifecycle)
|
72
122
|
return output_vo.to_dict()
|