pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,34 +1,34 @@
1
- import os
2
- import pathlib
3
- import sys
4
- from datamax.utils import setup_environment
5
-
6
- setup_environment(use_gpu=True)
7
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
8
- from datamax.parser.base import MarkdownOutputVo
9
-
10
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
11
- sys.path.insert(0, str(ROOT_DIR))
12
- from datamax.parser.base import BaseLife
13
- from datamax.parser.pdf_parser import PdfParser
14
- from PIL import Image
15
-
16
- class ImageParser(BaseLife):
17
- def __init__(self,file_path: str):
18
- super().__init__()
19
- self.file_path = file_path
20
-
21
- def parse(self, file_path: str) -> MarkdownOutputVo:
22
- try:
23
- title = self.get_file_extension(file_path)
24
- output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
25
- image = Image.open(file_path)
26
- image.save(output_pdf_path, 'PDF', resolution=100.0)
27
- pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
28
- output_vo = pdf_parser.parse(output_pdf_path)
29
- if os.path.exists(output_pdf_path):
30
- # shutil.rmtree(f'uploaded_files/markdown')
31
- os.remove(output_pdf_path)
32
- return output_vo
33
- except Exception as e:
34
- raise e
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ from datamax.utils import setup_environment
5
+
6
+ setup_environment(use_gpu=True)
7
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
8
+ from datamax.parser.base import MarkdownOutputVo
9
+
10
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
11
+ sys.path.insert(0, str(ROOT_DIR))
12
+ from datamax.parser.base import BaseLife
13
+ from datamax.parser.pdf_parser import PdfParser
14
+ from PIL import Image
15
+
16
+ class ImageParser(BaseLife):
17
+ def __init__(self,file_path: str):
18
+ super().__init__()
19
+ self.file_path = file_path
20
+
21
+ def parse(self, file_path: str) -> MarkdownOutputVo:
22
+ try:
23
+ title = os.path.splitext(os.path.basename(file_path))[0]
24
+ output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
25
+ image = Image.open(file_path)
26
+ image.save(output_pdf_path, 'PDF', resolution=100.0)
27
+ pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
28
+ output_vo = pdf_parser.parse(output_pdf_path)
29
+ if os.path.exists(output_pdf_path):
30
+ # shutil.rmtree(f'uploaded_files/markdown')
31
+ os.remove(output_pdf_path)
32
+ return output_vo
33
+ except Exception as e:
34
+ raise e
@@ -1,10 +1,32 @@
1
- from datamax.parser.base import MarkdownOutputVo
2
-
3
-
4
- class Parser:
5
-
6
- def __init__(self, file_path):
7
- self.file_path = file_path
8
-
9
- def parse(self) -> MarkdownOutputVo:
10
- pass
1
+ import json
2
+
3
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
4
+
5
+
6
+ class JsonParser(BaseLife):
7
+
8
+ def __init__(self, file_path):
9
+ super().__init__()
10
+ self.file_path = file_path
11
+
12
+ @staticmethod
13
+ def read_json_file(file_path: str) -> str:
14
+ """Read and pretty print a JSON file."""
15
+ with open(file_path, "r", encoding="utf-8") as f:
16
+ data = json.load(f)
17
+ return json.dumps(data, indent=2, ensure_ascii=False)
18
+
19
+ def parse(self, file_path: str) -> MarkdownOutputVo:
20
+ try:
21
+ content = self.read_json_file(file_path)
22
+ lifecycle = self.generate_lifecycle(
23
+ source_file=file_path,
24
+ domain="Technology",
25
+ usage_purpose="Documentation",
26
+ life_type="LLM_ORIGIN",
27
+ )
28
+ output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
29
+ output_vo.add_lifecycle(lifecycle)
30
+ return output_vo.to_dict()
31
+ except Exception as e:
32
+ raise e
@@ -1,73 +1,73 @@
1
- import pathlib
2
- import sys
3
- from typing import Union
4
-
5
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
- sys.path.insert(0, str(ROOT_DIR))
7
- from datamax.parser.base import BaseLife
8
- from datamax.parser.base import MarkdownOutputVo
9
- from loguru import logger
10
-
11
-
12
- class MarkdownParser(BaseLife):
13
- """
14
- Parser for Markdown files that follows the same pattern as PdfParser.
15
- Handles .md and .markdown file extensions.
16
- """
17
-
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- ):
21
- super().__init__()
22
- self.file_path = file_path
23
-
24
- @staticmethod
25
- def read_markdown_file(file_path: str) -> str:
26
- """
27
- Reads the content of a markdown file.
28
-
29
- Args:
30
- file_path: Path to the markdown file
31
-
32
- Returns:
33
- str: Content of the markdown file
34
- """
35
- try:
36
- with open(file_path, 'r', encoding='utf-8') as f:
37
- return f.read()
38
- except Exception as e:
39
- logger.error(f"Error reading markdown file {file_path}: {e}")
40
- raise
41
-
42
- def parse(self, file_path: str) -> MarkdownOutputVo:
43
- """
44
- Parses a markdown file and returns a MarkdownOutputVo.
45
-
46
- Args:
47
- file_path: Path to the markdown file
48
-
49
- Returns:
50
- MarkdownOutputVo: Structured output containing the markdown content
51
- """
52
- try:
53
- title = self.get_file_extension(file_path)
54
-
55
- # Read markdown content
56
- md_content = self.read_markdown_file(file_path)
57
-
58
- # Generate lifecycle metadata
59
- lifecycle = self.generate_lifecycle(
60
- source_file=file_path,
61
- domain="Technology",
62
- usage_purpose="Documentation",
63
- life_type="LLM_ORIGIN"
64
- )
65
-
66
- # Create and return output VO
67
- output_vo = MarkdownOutputVo(title, md_content)
68
- output_vo.add_lifecycle(lifecycle)
69
- return output_vo.to_dict()
70
-
71
- except Exception as e:
72
- logger.error(f"Failed to parse markdown file {file_path}: {e}")
1
+ import pathlib
2
+ import sys
3
+ from typing import Union
4
+
5
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
+ sys.path.insert(0, str(ROOT_DIR))
7
+ from datamax.parser.base import BaseLife
8
+ from datamax.parser.base import MarkdownOutputVo
9
+ from loguru import logger
10
+ import os
11
+
12
+ class MarkdownParser(BaseLife):
13
+ """
14
+ Parser for Markdown files that follows the same pattern as PdfParser.
15
+ Handles .md and .markdown file extensions.
16
+ """
17
+
18
+ def __init__(self,
19
+ file_path: Union[str, list],
20
+ ):
21
+ super().__init__()
22
+ self.file_path = file_path
23
+
24
+ @staticmethod
25
+ def read_markdown_file(file_path: str) -> str:
26
+ """
27
+ Reads the content of a markdown file.
28
+
29
+ Args:
30
+ file_path: Path to the markdown file
31
+
32
+ Returns:
33
+ str: Content of the markdown file
34
+ """
35
+ try:
36
+ with open(file_path, 'r', encoding='utf-8') as f:
37
+ return f.read()
38
+ except Exception as e:
39
+ logger.error(f"Error reading markdown file {file_path}: {e}")
40
+ raise
41
+
42
+ def parse(self, file_path: str) -> MarkdownOutputVo:
43
+ """
44
+ Parses a markdown file and returns a MarkdownOutputVo.
45
+
46
+ Args:
47
+ file_path: Path to the markdown file
48
+
49
+ Returns:
50
+ MarkdownOutputVo: Structured output containing the markdown content
51
+ """
52
+ try:
53
+ title = os.path.splitext(os.path.basename(file_path))[0]
54
+
55
+ # Read markdown content
56
+ md_content = self.read_markdown_file(file_path)
57
+
58
+ # Generate lifecycle metadata
59
+ lifecycle = self.generate_lifecycle(
60
+ source_file=file_path,
61
+ domain="Technology",
62
+ usage_purpose="Documentation",
63
+ life_type="LLM_ORIGIN"
64
+ )
65
+
66
+ # Create and return output VO
67
+ output_vo = MarkdownOutputVo(title, md_content)
68
+ output_vo.add_lifecycle(lifecycle)
69
+ return output_vo.to_dict()
70
+
71
+ except Exception as e:
72
+ logger.error(f"Failed to parse markdown file {file_path}: {e}")
73
73
  raise
@@ -1,101 +1,101 @@
1
- import os
2
- import pathlib
3
- import sys
4
- import subprocess
5
- from typing import Union
6
-
7
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
8
- sys.path.insert(0, str(ROOT_DIR))
9
- from datamax.parser.base import BaseLife
10
- from datamax.parser.base import MarkdownOutputVo
11
- from langchain_community.document_loaders import PyMuPDFLoader
12
- from loguru import logger
13
- from datamax.utils.mineru_operator import pdf_processor
14
-
15
-
16
- class PdfParser(BaseLife):
17
-
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- use_mineru: bool = False,
21
- ):
22
- super().__init__()
23
-
24
- self.file_path = file_path
25
- self.use_mineru = use_mineru
26
-
27
- def mineru_process(self, input_pdf_filename, output_dir):
28
- proc = None
29
- try:
30
- logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
31
- command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
32
- proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
33
-
34
- # 等待命令执行完成
35
- stdout, stderr = proc.communicate()
36
- # 检查命令是否成功执行
37
- if proc.returncode != 0:
38
- raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
39
-
40
- logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
41
-
42
- except Exception as e:
43
- logger.error(f"Error: {e}")
44
- if proc is not None:
45
- proc.kill()
46
- proc.wait()
47
- logger.info("The process was terminated due to an error.")
48
- raise # Re-raise the exception to let the caller handle it
49
-
50
- finally:
51
- # 确保子进程已经结束
52
- if proc is not None:
53
- if proc.poll() is None:
54
- proc.kill()
55
- proc.wait()
56
- logger.info("The process was terminated due to timeout or completion.")
57
-
58
- @staticmethod
59
- def read_pdf_file(file_path) -> str:
60
- try:
61
- pdf_loader = PyMuPDFLoader(file_path)
62
- pdf_documents = pdf_loader.load()
63
- result_text = ''
64
- for page in pdf_documents:
65
- result_text += page.page_content
66
- return result_text
67
- except Exception as e:
68
- raise e
69
-
70
- def parse(self, file_path: str) -> MarkdownOutputVo:
71
- try:
72
- title = self.get_file_extension(file_path)
73
-
74
- if self.use_mineru:
75
- output_dir = 'uploaded_files'
76
- output_folder_name = os.path.basename(file_path).replace(".pdf", "")
77
- # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
78
- # if os.path.exists(output_mineru):
79
- # pass
80
- # else:
81
- # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
82
- # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
83
-
84
- # todo: 是否有必要跟api的默认保存路径保持一致
85
- output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
86
-
87
- if os.path.exists(output_mineru):
88
- mk_content = open(output_mineru, 'r', encoding='utf-8').read()
89
- else:
90
- mk_content = pdf_processor.process_pdf(file_path)
91
- else:
92
- content = self.read_pdf_file(file_path=file_path)
93
- mk_content = content
94
-
95
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
96
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
97
- output_vo = MarkdownOutputVo(title, mk_content)
98
- output_vo.add_lifecycle(lifecycle)
99
- return output_vo.to_dict()
100
- except Exception:
101
- raise
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ import subprocess
5
+ from typing import Union
6
+
7
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
8
+ sys.path.insert(0, str(ROOT_DIR))
9
+ from datamax.parser.base import BaseLife
10
+ from datamax.parser.base import MarkdownOutputVo
11
+ from langchain_community.document_loaders import PyMuPDFLoader
12
+ from loguru import logger
13
+ from datamax.utils.mineru_operator import pdf_processor
14
+ import os
15
+
16
+ class PdfParser(BaseLife):
17
+
18
+ def __init__(self,
19
+ file_path: Union[str, list],
20
+ use_mineru: bool = False,
21
+ ):
22
+ super().__init__()
23
+
24
+ self.file_path = file_path
25
+ self.use_mineru = use_mineru
26
+
27
+ def mineru_process(self, input_pdf_filename, output_dir):
28
+ proc = None
29
+ try:
30
+ logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
31
+ command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
32
+ proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
33
+
34
+ # 等待命令执行完成
35
+ stdout, stderr = proc.communicate()
36
+ # 检查命令是否成功执行
37
+ if proc.returncode != 0:
38
+ raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
39
+
40
+ logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
41
+
42
+ except Exception as e:
43
+ logger.error(f"Error: {e}")
44
+ if proc is not None:
45
+ proc.kill()
46
+ proc.wait()
47
+ logger.info("The process was terminated due to an error.")
48
+ raise # Re-raise the exception to let the caller handle it
49
+
50
+ finally:
51
+ # 确保子进程已经结束
52
+ if proc is not None:
53
+ if proc.poll() is None:
54
+ proc.kill()
55
+ proc.wait()
56
+ logger.info("The process was terminated due to timeout or completion.")
57
+
58
+ @staticmethod
59
+ def read_pdf_file(file_path) -> str:
60
+ try:
61
+ pdf_loader = PyMuPDFLoader(file_path)
62
+ pdf_documents = pdf_loader.load()
63
+ result_text = ''
64
+ for page in pdf_documents:
65
+ result_text += page.page_content
66
+ return result_text
67
+ except Exception as e:
68
+ raise e
69
+
70
+ def parse(self, file_path: str) -> MarkdownOutputVo:
71
+ try:
72
+ title = os.path.splitext(os.path.basename(file_path))[0]
73
+
74
+ if self.use_mineru:
75
+ output_dir = 'uploaded_files'
76
+ output_folder_name = os.path.basename(file_path).replace(".pdf", "")
77
+ # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
78
+ # if os.path.exists(output_mineru):
79
+ # pass
80
+ # else:
81
+ # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
82
+ # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
83
+
84
+ # todo: 是否有必要跟api的默认保存路径保持一致
85
+ output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
86
+
87
+ if os.path.exists(output_mineru):
88
+ mk_content = open(output_mineru, 'r', encoding='utf-8').read()
89
+ else:
90
+ mk_content = pdf_processor.process_pdf(file_path)
91
+ else:
92
+ content = self.read_pdf_file(file_path=file_path)
93
+ mk_content = content
94
+
95
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
96
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
97
+ output_vo = MarkdownOutputVo(title, mk_content)
98
+ output_vo.add_lifecycle(lifecycle)
99
+ return output_vo.to_dict()
100
+ except Exception:
101
+ raise
@@ -1,41 +1,83 @@
1
1
  import os
2
2
  import shutil
3
- import chardet
4
3
  import subprocess
5
4
  import tempfile
6
5
  from pathlib import Path
7
6
  from typing import Union
8
- from datamax.parser.base import BaseLife
9
- from datamax.parser.base import MarkdownOutputVo
7
+
8
+ import chardet
9
+
10
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
10
11
  from datamax.utils.ppt_extract import PPtExtractor
11
12
 
13
+ # 尝试导入UNO处理器
14
+ try:
15
+ from datamax.utils.uno_handler import HAS_UNO, convert_with_uno
16
+ except ImportError:
17
+ HAS_UNO = False
18
+
12
19
 
13
20
  class PPtParser(BaseLife):
14
- def __init__(self, file_path: Union[str, list]):
21
+ def __init__(self, file_path: Union[str, list], use_uno: bool = None):
15
22
  super().__init__()
16
23
  self.file_path = file_path
17
24
 
25
+ # 自动检测是否使用UNO(如果未指定)
26
+ if use_uno is None:
27
+ self.use_uno = HAS_UNO
28
+ else:
29
+ self.use_uno = use_uno and HAS_UNO
30
+
18
31
  def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str:
32
+ if self.use_uno:
33
+ # 使用UNO API进行转换
34
+ try:
35
+ pptx_path = convert_with_uno(ppt_path, "pptx", dir_path)
36
+
37
+ if not os.path.exists(pptx_path):
38
+ raise Exception(
39
+ f"> !!! File conversion failed {ppt_path} ==> {pptx_path}"
40
+ )
41
+ else:
42
+ return pptx_path
43
+
44
+ except Exception as e:
45
+ if (
46
+ hasattr(self, "_fallback_to_subprocess")
47
+ and self._fallback_to_subprocess
48
+ ):
49
+ return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
50
+ raise
51
+ else:
52
+ # 使用传统的subprocess方式
53
+ return self._ppt_to_pptx_subprocess(ppt_path, dir_path)
54
+
55
+ def _ppt_to_pptx_subprocess(self, ppt_path: str, dir_path: str) -> str:
56
+ """使用subprocess将.ppt文件转换为.pptx文件(传统方式)"""
19
57
  cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"'
20
- process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
58
+ process = subprocess.Popen(
59
+ cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
60
+ )
21
61
  stdout, stderr = process.communicate()
22
62
  exit_code = process.returncode
23
63
  if exit_code == 0:
24
64
  pass
25
65
  else:
26
- encoding = chardet.detect(stderr)['encoding']
66
+ encoding = chardet.detect(stderr)["encoding"]
27
67
  if encoding is None:
28
- encoding = 'utf-8'
29
- raise Exception(f"Error Output (detected encoding: {encoding}):", stderr.decode(encoding, errors='replace'))
68
+ encoding = "utf-8"
69
+ raise Exception(
70
+ f"Error Output (detected encoding: {encoding}):",
71
+ stderr.decode(encoding, errors="replace"),
72
+ )
30
73
  fname = str(Path(ppt_path).stem)
31
- pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx')
74
+ pptx_path = os.path.join(os.path.dirname(ppt_path), f"{fname}.pptx")
32
75
  if not os.path.exists(pptx_path):
33
76
  raise Exception(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}")
34
77
  else:
35
78
  return pptx_path
36
79
 
37
80
  def read_ppt_file(self, file_path: str):
38
-
39
81
  try:
40
82
  with tempfile.TemporaryDirectory() as temp_path:
41
83
  temp_dir = Path(temp_path).resolve()
@@ -43,17 +85,21 @@ class PPtParser(BaseLife):
43
85
  media_dir.mkdir()
44
86
  tmp_file_path = temp_dir / "tmp.ppt"
45
87
  shutil.copy(file_path, tmp_file_path)
46
- pptx_file_path = self.ppt_to_pptx(ppt_path=str(tmp_file_path), dir_path=temp_path)
88
+ pptx_file_path = self.ppt_to_pptx(
89
+ ppt_path=str(tmp_file_path), dir_path=temp_path
90
+ )
47
91
  pptx_extractor = PPtExtractor()
48
- pages_list = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True)
49
- contents = ''
92
+ pages_list = pptx_extractor.extract(
93
+ Path(pptx_file_path), "tmp", temp_dir, media_dir, True
94
+ )
95
+ contents = ""
50
96
  for index, page in enumerate(pages_list):
51
- page_content_list = page['content_list']
97
+ page_content_list = page["content_list"]
52
98
  for content in page_content_list:
53
- if content['type'] == 'image':
99
+ if content["type"] == "image":
54
100
  pass
55
- elif content['type'] == "text":
56
- data = content['data']
101
+ elif content["type"] == "text":
102
+ data = content["data"]
57
103
  contents += data
58
104
  return contents
59
105
  except Exception:
@@ -61,12 +107,16 @@ class PPtParser(BaseLife):
61
107
 
62
108
  def parse(self, file_path: str) -> MarkdownOutputVo:
63
109
  try:
64
- title = self.get_file_extension(file_path)
110
+ title = os.path.splitext(os.path.basename(file_path))[0]
65
111
  content = self.read_ppt_file(file_path=file_path)
66
112
  # clean_text = clean_original_text(content)
67
113
  mk_content = content
68
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
69
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
114
+ lifecycle = self.generate_lifecycle(
115
+ source_file=file_path,
116
+ domain="Technology",
117
+ usage_purpose="Documentation",
118
+ life_type="LLM_ORIGIN",
119
+ )
70
120
  output_vo = MarkdownOutputVo(title, mk_content)
71
121
  output_vo.add_lifecycle(lifecycle)
72
122
  return output_vo.to_dict()