pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
  4. datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +525 -61
  10. datamax/parser/docx_parser.py +512 -62
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -208
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. pydatamax-0.1.15.dist-info/METADATA +340 -0
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.13.dist-info/METADATA +0 -280
  38. pydatamax-0.1.13.dist-info/RECORD +0 -39
  39. tests/__init__.py +0 -0
  40. tests/test_basic.py +0 -20
  41. {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,41 +1,41 @@
1
- import ebooklib
2
- from typing import Union
3
- from bs4 import BeautifulSoup
4
- from ebooklib import epub
5
- from datamax.parser.base import BaseLife
6
- from datamax.parser.base import MarkdownOutputVo
7
-
8
-
9
- class EpubParser(BaseLife):
10
- def __init__(self, file_path: Union[str, list]):
11
- super().__init__()
12
- self.file_path = file_path
13
-
14
- @staticmethod
15
- def read_epub_file(file_path: str) -> str:
16
- try:
17
- book = epub.read_epub(file_path)
18
- content = ""
19
- for item in book.get_items():
20
- if item.get_type() == ebooklib.ITEM_DOCUMENT:
21
- chapter_content = item.get_content().decode('utf-8')
22
- soup = BeautifulSoup(chapter_content, 'html.parser')
23
- text = soup.get_text()
24
- text = text.replace('\u3000', ' ')
25
- content += text
26
- return content
27
- except Exception as e:
28
- raise e
29
-
30
- def parse(self, file_path: str) -> MarkdownOutputVo:
31
- try:
32
- title = self.get_file_extension(file_path)
33
- content = self.read_epub_file(file_path=file_path)
34
- mk_content = content
35
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
36
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
37
- output_vo = MarkdownOutputVo(title, mk_content)
38
- output_vo.add_lifecycle(lifecycle)
39
- return output_vo.to_dict()
40
- except Exception as e:
41
- raise e
1
+ import ebooklib
2
+ from typing import Union
3
+ from bs4 import BeautifulSoup
4
+ from ebooklib import epub
5
+ from datamax.parser.base import BaseLife
6
+ from datamax.parser.base import MarkdownOutputVo
7
+ import os
8
+
9
+ class EpubParser(BaseLife):
10
+ def __init__(self, file_path: Union[str, list]):
11
+ super().__init__()
12
+ self.file_path = file_path
13
+
14
+ @staticmethod
15
+ def read_epub_file(file_path: str) -> str:
16
+ try:
17
+ book = epub.read_epub(file_path)
18
+ content = ""
19
+ for item in book.get_items():
20
+ if item.get_type() == ebooklib.ITEM_DOCUMENT:
21
+ chapter_content = item.get_content().decode('utf-8')
22
+ soup = BeautifulSoup(chapter_content, 'html.parser')
23
+ text = soup.get_text()
24
+ text = text.replace('\u3000', ' ')
25
+ content += text
26
+ return content
27
+ except Exception as e:
28
+ raise e
29
+
30
+ def parse(self, file_path: str) -> MarkdownOutputVo:
31
+ try:
32
+ title = os.path.splitext(os.path.basename(file_path))[0]
33
+ content = self.read_epub_file(file_path=file_path)
34
+ mk_content = content
35
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
36
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
37
+ output_vo = MarkdownOutputVo(title, mk_content)
38
+ output_vo.add_lifecycle(lifecycle)
39
+ return output_vo.to_dict()
40
+ except Exception as e:
41
+ raise e
@@ -1,38 +1,38 @@
1
- from typing import Union
2
- import pathlib
3
- import sys
4
-
5
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
- sys.path.insert(0, str(ROOT_DIR))
7
- from datamax.parser.base import BaseLife
8
- from datamax.parser.base import MarkdownOutputVo
9
- from bs4 import BeautifulSoup
10
-
11
-
12
- class HtmlParser(BaseLife):
13
- def __init__(self, file_path: Union[str, list]):
14
- super().__init__()
15
- self.file_path = file_path
16
-
17
- @staticmethod
18
- def read_html_file(file_path: str) -> str:
19
- try:
20
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
21
- data = f.read()
22
- soup = BeautifulSoup(data, 'html.parser')
23
- return soup.get_text(separator='\n', strip=True)
24
- except Exception:
25
- raise
26
-
27
- def parse(self, file_path: str) -> MarkdownOutputVo:
28
- try:
29
- title = self.get_file_extension(file_path)
30
- content = self.read_html_file(file_path=file_path)
31
- mk_content = content
32
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
33
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
34
- output_vo = MarkdownOutputVo(title, mk_content)
35
- output_vo.add_lifecycle(lifecycle)
36
- return output_vo.to_dict()
37
- except Exception:
1
+ from typing import Union
2
+ import pathlib
3
+ import sys
4
+
5
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
+ sys.path.insert(0, str(ROOT_DIR))
7
+ from datamax.parser.base import BaseLife
8
+ from datamax.parser.base import MarkdownOutputVo
9
+ from bs4 import BeautifulSoup
10
+ import os
11
+
12
+ class HtmlParser(BaseLife):
13
+ def __init__(self, file_path: Union[str, list]):
14
+ super().__init__()
15
+ self.file_path = file_path
16
+
17
+ @staticmethod
18
+ def read_html_file(file_path: str) -> str:
19
+ try:
20
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
21
+ data = f.read()
22
+ soup = BeautifulSoup(data, 'html.parser')
23
+ return soup.get_text(separator='\n', strip=True)
24
+ except Exception:
25
+ raise
26
+
27
+ def parse(self, file_path: str) -> MarkdownOutputVo:
28
+ try:
29
+ title = os.path.splitext(os.path.basename(file_path))[0]
30
+ content = self.read_html_file(file_path=file_path)
31
+ mk_content = content
32
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
33
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
34
+ output_vo = MarkdownOutputVo(title, mk_content)
35
+ output_vo.add_lifecycle(lifecycle)
36
+ return output_vo.to_dict()
37
+ except Exception:
38
38
  raise
@@ -1,34 +1,34 @@
1
- import os
2
- import pathlib
3
- import sys
4
- from datamax.utils import setup_environment
5
-
6
- setup_environment(use_gpu=True)
7
- os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
8
- from datamax.parser.base import MarkdownOutputVo
9
-
10
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
11
- sys.path.insert(0, str(ROOT_DIR))
12
- from datamax.parser.base import BaseLife
13
- from datamax.parser.pdf_parser import PdfParser
14
- from PIL import Image
15
-
16
- class ImageParser(BaseLife):
17
- def __init__(self,file_path: str):
18
- super().__init__()
19
- self.file_path = file_path
20
-
21
- def parse(self, file_path: str) -> MarkdownOutputVo:
22
- try:
23
- title = self.get_file_extension(file_path)
24
- output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
25
- image = Image.open(file_path)
26
- image.save(output_pdf_path, 'PDF', resolution=100.0)
27
- pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
28
- output_vo = pdf_parser.parse(output_pdf_path)
29
- if os.path.exists(output_pdf_path):
30
- # shutil.rmtree(f'uploaded_files/markdown')
31
- os.remove(output_pdf_path)
32
- return output_vo
33
- except Exception as e:
34
- raise e
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ from datamax.utils import setup_environment
5
+
6
+ setup_environment(use_gpu=True)
7
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
8
+ from datamax.parser.base import MarkdownOutputVo
9
+
10
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
11
+ sys.path.insert(0, str(ROOT_DIR))
12
+ from datamax.parser.base import BaseLife
13
+ from datamax.parser.pdf_parser import PdfParser
14
+ from PIL import Image
15
+
16
+ class ImageParser(BaseLife):
17
+ def __init__(self,file_path: str):
18
+ super().__init__()
19
+ self.file_path = file_path
20
+
21
+ def parse(self, file_path: str) -> MarkdownOutputVo:
22
+ try:
23
+ title = os.path.splitext(os.path.basename(file_path))[0]
24
+ output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
25
+ image = Image.open(file_path)
26
+ image.save(output_pdf_path, 'PDF', resolution=100.0)
27
+ pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
28
+ output_vo = pdf_parser.parse(output_pdf_path)
29
+ if os.path.exists(output_pdf_path):
30
+ # shutil.rmtree(f'uploaded_files/markdown')
31
+ os.remove(output_pdf_path)
32
+ return output_vo
33
+ except Exception as e:
34
+ raise e
@@ -1,10 +1,32 @@
1
- from datamax.parser.base import MarkdownOutputVo
2
-
3
-
4
- class Parser:
5
-
6
- def __init__(self, file_path):
7
- self.file_path = file_path
8
-
9
- def parse(self) -> MarkdownOutputVo:
10
- pass
1
+ import json
2
+
3
+ from datamax.parser.base import BaseLife, MarkdownOutputVo
4
+
5
+
6
+ class JsonParser(BaseLife):
7
+
8
+ def __init__(self, file_path):
9
+ super().__init__()
10
+ self.file_path = file_path
11
+
12
+ @staticmethod
13
+ def read_json_file(file_path: str) -> str:
14
+ """Read and pretty print a JSON file."""
15
+ with open(file_path, "r", encoding="utf-8") as f:
16
+ data = json.load(f)
17
+ return json.dumps(data, indent=2, ensure_ascii=False)
18
+
19
+ def parse(self, file_path: str) -> MarkdownOutputVo:
20
+ try:
21
+ content = self.read_json_file(file_path)
22
+ lifecycle = self.generate_lifecycle(
23
+ source_file=file_path,
24
+ domain="Technology",
25
+ usage_purpose="Documentation",
26
+ life_type="LLM_ORIGIN",
27
+ )
28
+ output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
29
+ output_vo.add_lifecycle(lifecycle)
30
+ return output_vo.to_dict()
31
+ except Exception as e:
32
+ raise e
@@ -1,73 +1,73 @@
1
- import pathlib
2
- import sys
3
- from typing import Union
4
-
5
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
- sys.path.insert(0, str(ROOT_DIR))
7
- from datamax.parser.base import BaseLife
8
- from datamax.parser.base import MarkdownOutputVo
9
- from loguru import logger
10
-
11
-
12
- class MarkdownParser(BaseLife):
13
- """
14
- Parser for Markdown files that follows the same pattern as PdfParser.
15
- Handles .md and .markdown file extensions.
16
- """
17
-
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- ):
21
- super().__init__()
22
- self.file_path = file_path
23
-
24
- @staticmethod
25
- def read_markdown_file(file_path: str) -> str:
26
- """
27
- Reads the content of a markdown file.
28
-
29
- Args:
30
- file_path: Path to the markdown file
31
-
32
- Returns:
33
- str: Content of the markdown file
34
- """
35
- try:
36
- with open(file_path, 'r', encoding='utf-8') as f:
37
- return f.read()
38
- except Exception as e:
39
- logger.error(f"Error reading markdown file {file_path}: {e}")
40
- raise
41
-
42
- def parse(self, file_path: str) -> MarkdownOutputVo:
43
- """
44
- Parses a markdown file and returns a MarkdownOutputVo.
45
-
46
- Args:
47
- file_path: Path to the markdown file
48
-
49
- Returns:
50
- MarkdownOutputVo: Structured output containing the markdown content
51
- """
52
- try:
53
- title = self.get_file_extension(file_path)
54
-
55
- # Read markdown content
56
- md_content = self.read_markdown_file(file_path)
57
-
58
- # Generate lifecycle metadata
59
- lifecycle = self.generate_lifecycle(
60
- source_file=file_path,
61
- domain="Technology",
62
- usage_purpose="Documentation",
63
- life_type="LLM_ORIGIN"
64
- )
65
-
66
- # Create and return output VO
67
- output_vo = MarkdownOutputVo(title, md_content)
68
- output_vo.add_lifecycle(lifecycle)
69
- return output_vo.to_dict()
70
-
71
- except Exception as e:
72
- logger.error(f"Failed to parse markdown file {file_path}: {e}")
1
+ import pathlib
2
+ import sys
3
+ from typing import Union
4
+
5
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
6
+ sys.path.insert(0, str(ROOT_DIR))
7
+ from datamax.parser.base import BaseLife
8
+ from datamax.parser.base import MarkdownOutputVo
9
+ from loguru import logger
10
+ import os
11
+
12
+ class MarkdownParser(BaseLife):
13
+ """
14
+ Parser for Markdown files that follows the same pattern as PdfParser.
15
+ Handles .md and .markdown file extensions.
16
+ """
17
+
18
+ def __init__(self,
19
+ file_path: Union[str, list],
20
+ ):
21
+ super().__init__()
22
+ self.file_path = file_path
23
+
24
+ @staticmethod
25
+ def read_markdown_file(file_path: str) -> str:
26
+ """
27
+ Reads the content of a markdown file.
28
+
29
+ Args:
30
+ file_path: Path to the markdown file
31
+
32
+ Returns:
33
+ str: Content of the markdown file
34
+ """
35
+ try:
36
+ with open(file_path, 'r', encoding='utf-8') as f:
37
+ return f.read()
38
+ except Exception as e:
39
+ logger.error(f"Error reading markdown file {file_path}: {e}")
40
+ raise
41
+
42
+ def parse(self, file_path: str) -> MarkdownOutputVo:
43
+ """
44
+ Parses a markdown file and returns a MarkdownOutputVo.
45
+
46
+ Args:
47
+ file_path: Path to the markdown file
48
+
49
+ Returns:
50
+ MarkdownOutputVo: Structured output containing the markdown content
51
+ """
52
+ try:
53
+ title = os.path.splitext(os.path.basename(file_path))[0]
54
+
55
+ # Read markdown content
56
+ md_content = self.read_markdown_file(file_path)
57
+
58
+ # Generate lifecycle metadata
59
+ lifecycle = self.generate_lifecycle(
60
+ source_file=file_path,
61
+ domain="Technology",
62
+ usage_purpose="Documentation",
63
+ life_type="LLM_ORIGIN"
64
+ )
65
+
66
+ # Create and return output VO
67
+ output_vo = MarkdownOutputVo(title, md_content)
68
+ output_vo.add_lifecycle(lifecycle)
69
+ return output_vo.to_dict()
70
+
71
+ except Exception as e:
72
+ logger.error(f"Failed to parse markdown file {file_path}: {e}")
73
73
  raise
@@ -1,101 +1,101 @@
1
- import os
2
- import pathlib
3
- import sys
4
- import subprocess
5
- from typing import Union
6
-
7
- ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
8
- sys.path.insert(0, str(ROOT_DIR))
9
- from datamax.parser.base import BaseLife
10
- from datamax.parser.base import MarkdownOutputVo
11
- from langchain_community.document_loaders import PyMuPDFLoader
12
- from loguru import logger
13
- from datamax.utils.mineru_operator import pdf_processor
14
-
15
-
16
- class PdfParser(BaseLife):
17
-
18
- def __init__(self,
19
- file_path: Union[str, list],
20
- use_mineru: bool = False,
21
- ):
22
- super().__init__()
23
-
24
- self.file_path = file_path
25
- self.use_mineru = use_mineru
26
-
27
- def mineru_process(self, input_pdf_filename, output_dir):
28
- proc = None
29
- try:
30
- logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
31
- command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
32
- proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
33
-
34
- # 等待命令执行完成
35
- stdout, stderr = proc.communicate()
36
- # 检查命令是否成功执行
37
- if proc.returncode != 0:
38
- raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
39
-
40
- logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
41
-
42
- except Exception as e:
43
- logger.error(f"Error: {e}")
44
- if proc is not None:
45
- proc.kill()
46
- proc.wait()
47
- logger.info("The process was terminated due to an error.")
48
- raise # Re-raise the exception to let the caller handle it
49
-
50
- finally:
51
- # 确保子进程已经结束
52
- if proc is not None:
53
- if proc.poll() is None:
54
- proc.kill()
55
- proc.wait()
56
- logger.info("The process was terminated due to timeout or completion.")
57
-
58
- @staticmethod
59
- def read_pdf_file(file_path) -> str:
60
- try:
61
- pdf_loader = PyMuPDFLoader(file_path)
62
- pdf_documents = pdf_loader.load()
63
- result_text = ''
64
- for page in pdf_documents:
65
- result_text += page.page_content
66
- return result_text
67
- except Exception as e:
68
- raise e
69
-
70
- def parse(self, file_path: str) -> MarkdownOutputVo:
71
- try:
72
- title = self.get_file_extension(file_path)
73
-
74
- if self.use_mineru:
75
- output_dir = 'uploaded_files'
76
- output_folder_name = os.path.basename(file_path).replace(".pdf", "")
77
- # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
78
- # if os.path.exists(output_mineru):
79
- # pass
80
- # else:
81
- # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
82
- # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
83
-
84
- # todo: 是否有必要跟api的默认保存路径保持一致
85
- output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
86
-
87
- if os.path.exists(output_mineru):
88
- mk_content = open(output_mineru, 'r', encoding='utf-8').read()
89
- else:
90
- mk_content = pdf_processor.process_pdf(file_path)
91
- else:
92
- content = self.read_pdf_file(file_path=file_path)
93
- mk_content = content
94
-
95
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
96
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
97
- output_vo = MarkdownOutputVo(title, mk_content)
98
- output_vo.add_lifecycle(lifecycle)
99
- return output_vo.to_dict()
100
- except Exception:
101
- raise
1
+ import os
2
+ import pathlib
3
+ import sys
4
+ import subprocess
5
+ from typing import Union
6
+
7
+ ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
8
+ sys.path.insert(0, str(ROOT_DIR))
9
+ from datamax.parser.base import BaseLife
10
+ from datamax.parser.base import MarkdownOutputVo
11
+ from langchain_community.document_loaders import PyMuPDFLoader
12
+ from loguru import logger
13
+ from datamax.utils.mineru_operator import pdf_processor
14
+ import os
15
+
16
+ class PdfParser(BaseLife):
17
+
18
+ def __init__(self,
19
+ file_path: Union[str, list],
20
+ use_mineru: bool = False,
21
+ ):
22
+ super().__init__()
23
+
24
+ self.file_path = file_path
25
+ self.use_mineru = use_mineru
26
+
27
+ def mineru_process(self, input_pdf_filename, output_dir):
28
+ proc = None
29
+ try:
30
+ logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
31
+ command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
32
+ proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
33
+
34
+ # 等待命令执行完成
35
+ stdout, stderr = proc.communicate()
36
+ # 检查命令是否成功执行
37
+ if proc.returncode != 0:
38
+ raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
39
+
40
+ logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
41
+
42
+ except Exception as e:
43
+ logger.error(f"Error: {e}")
44
+ if proc is not None:
45
+ proc.kill()
46
+ proc.wait()
47
+ logger.info("The process was terminated due to an error.")
48
+ raise # Re-raise the exception to let the caller handle it
49
+
50
+ finally:
51
+ # 确保子进程已经结束
52
+ if proc is not None:
53
+ if proc.poll() is None:
54
+ proc.kill()
55
+ proc.wait()
56
+ logger.info("The process was terminated due to timeout or completion.")
57
+
58
+ @staticmethod
59
+ def read_pdf_file(file_path) -> str:
60
+ try:
61
+ pdf_loader = PyMuPDFLoader(file_path)
62
+ pdf_documents = pdf_loader.load()
63
+ result_text = ''
64
+ for page in pdf_documents:
65
+ result_text += page.page_content
66
+ return result_text
67
+ except Exception as e:
68
+ raise e
69
+
70
+ def parse(self, file_path: str) -> MarkdownOutputVo:
71
+ try:
72
+ title = os.path.splitext(os.path.basename(file_path))[0]
73
+
74
+ if self.use_mineru:
75
+ output_dir = 'uploaded_files'
76
+ output_folder_name = os.path.basename(file_path).replace(".pdf", "")
77
+ # output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
78
+ # if os.path.exists(output_mineru):
79
+ # pass
80
+ # else:
81
+ # self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
82
+ # mk_content = open(output_mineru, 'r', encoding='utf-8').read()
83
+
84
+ # todo: 是否有必要跟api的默认保存路径保持一致
85
+ output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
86
+
87
+ if os.path.exists(output_mineru):
88
+ mk_content = open(output_mineru, 'r', encoding='utf-8').read()
89
+ else:
90
+ mk_content = pdf_processor.process_pdf(file_path)
91
+ else:
92
+ content = self.read_pdf_file(file_path=file_path)
93
+ mk_content = content
94
+
95
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
96
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
97
+ output_vo = MarkdownOutputVo(title, mk_content)
98
+ output_vo.add_lifecycle(lifecycle)
99
+ return output_vo.to_dict()
100
+ except Exception:
101
+ raise