pydatamax 0.1.13__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/{MinioHandler.py → minio_handler.py} +171 -171
- datamax/loader/{OssHandler.py → oss_handler.py} +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +525 -61
- datamax/parser/docx_parser.py +512 -62
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -208
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- pydatamax-0.1.15.dist-info/METADATA +340 -0
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.13.dist-info/METADATA +0 -280
- pydatamax-0.1.13.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.13.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/parser/epub_parser.py
CHANGED
@@ -1,41 +1,41 @@
|
|
1
|
-
import ebooklib
|
2
|
-
from typing import Union
|
3
|
-
from bs4 import BeautifulSoup
|
4
|
-
from ebooklib import epub
|
5
|
-
from datamax.parser.base import BaseLife
|
6
|
-
from datamax.parser.base import MarkdownOutputVo
|
7
|
-
|
8
|
-
|
9
|
-
class EpubParser(BaseLife):
|
10
|
-
def __init__(self, file_path: Union[str, list]):
|
11
|
-
super().__init__()
|
12
|
-
self.file_path = file_path
|
13
|
-
|
14
|
-
@staticmethod
|
15
|
-
def read_epub_file(file_path: str) -> str:
|
16
|
-
try:
|
17
|
-
book = epub.read_epub(file_path)
|
18
|
-
content = ""
|
19
|
-
for item in book.get_items():
|
20
|
-
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
21
|
-
chapter_content = item.get_content().decode('utf-8')
|
22
|
-
soup = BeautifulSoup(chapter_content, 'html.parser')
|
23
|
-
text = soup.get_text()
|
24
|
-
text = text.replace('\u3000', ' ')
|
25
|
-
content += text
|
26
|
-
return content
|
27
|
-
except Exception as e:
|
28
|
-
raise e
|
29
|
-
|
30
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
31
|
-
try:
|
32
|
-
title =
|
33
|
-
content = self.read_epub_file(file_path=file_path)
|
34
|
-
mk_content = content
|
35
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
36
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
37
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
38
|
-
output_vo.add_lifecycle(lifecycle)
|
39
|
-
return output_vo.to_dict()
|
40
|
-
except Exception as e:
|
41
|
-
raise e
|
1
|
+
import ebooklib
|
2
|
+
from typing import Union
|
3
|
+
from bs4 import BeautifulSoup
|
4
|
+
from ebooklib import epub
|
5
|
+
from datamax.parser.base import BaseLife
|
6
|
+
from datamax.parser.base import MarkdownOutputVo
|
7
|
+
import os
|
8
|
+
|
9
|
+
class EpubParser(BaseLife):
|
10
|
+
def __init__(self, file_path: Union[str, list]):
|
11
|
+
super().__init__()
|
12
|
+
self.file_path = file_path
|
13
|
+
|
14
|
+
@staticmethod
|
15
|
+
def read_epub_file(file_path: str) -> str:
|
16
|
+
try:
|
17
|
+
book = epub.read_epub(file_path)
|
18
|
+
content = ""
|
19
|
+
for item in book.get_items():
|
20
|
+
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
21
|
+
chapter_content = item.get_content().decode('utf-8')
|
22
|
+
soup = BeautifulSoup(chapter_content, 'html.parser')
|
23
|
+
text = soup.get_text()
|
24
|
+
text = text.replace('\u3000', ' ')
|
25
|
+
content += text
|
26
|
+
return content
|
27
|
+
except Exception as e:
|
28
|
+
raise e
|
29
|
+
|
30
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
31
|
+
try:
|
32
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
33
|
+
content = self.read_epub_file(file_path=file_path)
|
34
|
+
mk_content = content
|
35
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
36
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
37
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
38
|
+
output_vo.add_lifecycle(lifecycle)
|
39
|
+
return output_vo.to_dict()
|
40
|
+
except Exception as e:
|
41
|
+
raise e
|
datamax/parser/html_parser.py
CHANGED
@@ -1,38 +1,38 @@
|
|
1
|
-
from typing import Union
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
|
-
|
5
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
-
sys.path.insert(0, str(ROOT_DIR))
|
7
|
-
from datamax.parser.base import BaseLife
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
|
-
from bs4 import BeautifulSoup
|
10
|
-
|
11
|
-
|
12
|
-
class HtmlParser(BaseLife):
|
13
|
-
def __init__(self, file_path: Union[str, list]):
|
14
|
-
super().__init__()
|
15
|
-
self.file_path = file_path
|
16
|
-
|
17
|
-
@staticmethod
|
18
|
-
def read_html_file(file_path: str) -> str:
|
19
|
-
try:
|
20
|
-
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
21
|
-
data = f.read()
|
22
|
-
soup = BeautifulSoup(data, 'html.parser')
|
23
|
-
return soup.get_text(separator='\n', strip=True)
|
24
|
-
except Exception:
|
25
|
-
raise
|
26
|
-
|
27
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
28
|
-
try:
|
29
|
-
title =
|
30
|
-
content = self.read_html_file(file_path=file_path)
|
31
|
-
mk_content = content
|
32
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
33
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
34
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
35
|
-
output_vo.add_lifecycle(lifecycle)
|
36
|
-
return output_vo.to_dict()
|
37
|
-
except Exception:
|
1
|
+
from typing import Union
|
2
|
+
import pathlib
|
3
|
+
import sys
|
4
|
+
|
5
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
+
sys.path.insert(0, str(ROOT_DIR))
|
7
|
+
from datamax.parser.base import BaseLife
|
8
|
+
from datamax.parser.base import MarkdownOutputVo
|
9
|
+
from bs4 import BeautifulSoup
|
10
|
+
import os
|
11
|
+
|
12
|
+
class HtmlParser(BaseLife):
|
13
|
+
def __init__(self, file_path: Union[str, list]):
|
14
|
+
super().__init__()
|
15
|
+
self.file_path = file_path
|
16
|
+
|
17
|
+
@staticmethod
|
18
|
+
def read_html_file(file_path: str) -> str:
|
19
|
+
try:
|
20
|
+
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
21
|
+
data = f.read()
|
22
|
+
soup = BeautifulSoup(data, 'html.parser')
|
23
|
+
return soup.get_text(separator='\n', strip=True)
|
24
|
+
except Exception:
|
25
|
+
raise
|
26
|
+
|
27
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
28
|
+
try:
|
29
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
30
|
+
content = self.read_html_file(file_path=file_path)
|
31
|
+
mk_content = content
|
32
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
33
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
34
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
35
|
+
output_vo.add_lifecycle(lifecycle)
|
36
|
+
return output_vo.to_dict()
|
37
|
+
except Exception:
|
38
38
|
raise
|
datamax/parser/image_parser.py
CHANGED
@@ -1,34 +1,34 @@
|
|
1
|
-
import os
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
|
-
from datamax.utils import setup_environment
|
5
|
-
|
6
|
-
setup_environment(use_gpu=True)
|
7
|
-
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
|
-
|
10
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
11
|
-
sys.path.insert(0, str(ROOT_DIR))
|
12
|
-
from datamax.parser.base import BaseLife
|
13
|
-
from datamax.parser.pdf_parser import PdfParser
|
14
|
-
from PIL import Image
|
15
|
-
|
16
|
-
class ImageParser(BaseLife):
|
17
|
-
def __init__(self,file_path: str):
|
18
|
-
super().__init__()
|
19
|
-
self.file_path = file_path
|
20
|
-
|
21
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
22
|
-
try:
|
23
|
-
title =
|
24
|
-
output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
|
25
|
-
image = Image.open(file_path)
|
26
|
-
image.save(output_pdf_path, 'PDF', resolution=100.0)
|
27
|
-
pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
|
28
|
-
output_vo = pdf_parser.parse(output_pdf_path)
|
29
|
-
if os.path.exists(output_pdf_path):
|
30
|
-
# shutil.rmtree(f'uploaded_files/markdown')
|
31
|
-
os.remove(output_pdf_path)
|
32
|
-
return output_vo
|
33
|
-
except Exception as e:
|
34
|
-
raise e
|
1
|
+
import os
|
2
|
+
import pathlib
|
3
|
+
import sys
|
4
|
+
from datamax.utils import setup_environment
|
5
|
+
|
6
|
+
setup_environment(use_gpu=True)
|
7
|
+
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
|
8
|
+
from datamax.parser.base import MarkdownOutputVo
|
9
|
+
|
10
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
11
|
+
sys.path.insert(0, str(ROOT_DIR))
|
12
|
+
from datamax.parser.base import BaseLife
|
13
|
+
from datamax.parser.pdf_parser import PdfParser
|
14
|
+
from PIL import Image
|
15
|
+
|
16
|
+
class ImageParser(BaseLife):
|
17
|
+
def __init__(self,file_path: str):
|
18
|
+
super().__init__()
|
19
|
+
self.file_path = file_path
|
20
|
+
|
21
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
22
|
+
try:
|
23
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
24
|
+
output_pdf_path = f'{os.path.basename(file_path).strip(title)}.pdf'
|
25
|
+
image = Image.open(file_path)
|
26
|
+
image.save(output_pdf_path, 'PDF', resolution=100.0)
|
27
|
+
pdf_parser = PdfParser(output_pdf_path, use_mineru=True)
|
28
|
+
output_vo = pdf_parser.parse(output_pdf_path)
|
29
|
+
if os.path.exists(output_pdf_path):
|
30
|
+
# shutil.rmtree(f'uploaded_files/markdown')
|
31
|
+
os.remove(output_pdf_path)
|
32
|
+
return output_vo
|
33
|
+
except Exception as e:
|
34
|
+
raise e
|
datamax/parser/json_parser.py
CHANGED
@@ -1,10 +1,32 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
1
|
+
import json
|
2
|
+
|
3
|
+
from datamax.parser.base import BaseLife, MarkdownOutputVo
|
4
|
+
|
5
|
+
|
6
|
+
class JsonParser(BaseLife):
|
7
|
+
|
8
|
+
def __init__(self, file_path):
|
9
|
+
super().__init__()
|
10
|
+
self.file_path = file_path
|
11
|
+
|
12
|
+
@staticmethod
|
13
|
+
def read_json_file(file_path: str) -> str:
|
14
|
+
"""Read and pretty print a JSON file."""
|
15
|
+
with open(file_path, "r", encoding="utf-8") as f:
|
16
|
+
data = json.load(f)
|
17
|
+
return json.dumps(data, indent=2, ensure_ascii=False)
|
18
|
+
|
19
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
20
|
+
try:
|
21
|
+
content = self.read_json_file(file_path)
|
22
|
+
lifecycle = self.generate_lifecycle(
|
23
|
+
source_file=file_path,
|
24
|
+
domain="Technology",
|
25
|
+
usage_purpose="Documentation",
|
26
|
+
life_type="LLM_ORIGIN",
|
27
|
+
)
|
28
|
+
output_vo = MarkdownOutputVo(self.get_file_extension(file_path), content)
|
29
|
+
output_vo.add_lifecycle(lifecycle)
|
30
|
+
return output_vo.to_dict()
|
31
|
+
except Exception as e:
|
32
|
+
raise e
|
datamax/parser/md_parser.py
CHANGED
@@ -1,73 +1,73 @@
|
|
1
|
-
import pathlib
|
2
|
-
import sys
|
3
|
-
from typing import Union
|
4
|
-
|
5
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
-
sys.path.insert(0, str(ROOT_DIR))
|
7
|
-
from datamax.parser.base import BaseLife
|
8
|
-
from datamax.parser.base import MarkdownOutputVo
|
9
|
-
from loguru import logger
|
10
|
-
|
11
|
-
|
12
|
-
class MarkdownParser(BaseLife):
|
13
|
-
"""
|
14
|
-
Parser for Markdown files that follows the same pattern as PdfParser.
|
15
|
-
Handles .md and .markdown file extensions.
|
16
|
-
"""
|
17
|
-
|
18
|
-
def __init__(self,
|
19
|
-
file_path: Union[str, list],
|
20
|
-
):
|
21
|
-
super().__init__()
|
22
|
-
self.file_path = file_path
|
23
|
-
|
24
|
-
@staticmethod
|
25
|
-
def read_markdown_file(file_path: str) -> str:
|
26
|
-
"""
|
27
|
-
Reads the content of a markdown file.
|
28
|
-
|
29
|
-
Args:
|
30
|
-
file_path: Path to the markdown file
|
31
|
-
|
32
|
-
Returns:
|
33
|
-
str: Content of the markdown file
|
34
|
-
"""
|
35
|
-
try:
|
36
|
-
with open(file_path, 'r', encoding='utf-8') as f:
|
37
|
-
return f.read()
|
38
|
-
except Exception as e:
|
39
|
-
logger.error(f"Error reading markdown file {file_path}: {e}")
|
40
|
-
raise
|
41
|
-
|
42
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
43
|
-
"""
|
44
|
-
Parses a markdown file and returns a MarkdownOutputVo.
|
45
|
-
|
46
|
-
Args:
|
47
|
-
file_path: Path to the markdown file
|
48
|
-
|
49
|
-
Returns:
|
50
|
-
MarkdownOutputVo: Structured output containing the markdown content
|
51
|
-
"""
|
52
|
-
try:
|
53
|
-
title =
|
54
|
-
|
55
|
-
# Read markdown content
|
56
|
-
md_content = self.read_markdown_file(file_path)
|
57
|
-
|
58
|
-
# Generate lifecycle metadata
|
59
|
-
lifecycle = self.generate_lifecycle(
|
60
|
-
source_file=file_path,
|
61
|
-
domain="Technology",
|
62
|
-
usage_purpose="Documentation",
|
63
|
-
life_type="LLM_ORIGIN"
|
64
|
-
)
|
65
|
-
|
66
|
-
# Create and return output VO
|
67
|
-
output_vo = MarkdownOutputVo(title, md_content)
|
68
|
-
output_vo.add_lifecycle(lifecycle)
|
69
|
-
return output_vo.to_dict()
|
70
|
-
|
71
|
-
except Exception as e:
|
72
|
-
logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
1
|
+
import pathlib
|
2
|
+
import sys
|
3
|
+
from typing import Union
|
4
|
+
|
5
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
6
|
+
sys.path.insert(0, str(ROOT_DIR))
|
7
|
+
from datamax.parser.base import BaseLife
|
8
|
+
from datamax.parser.base import MarkdownOutputVo
|
9
|
+
from loguru import logger
|
10
|
+
import os
|
11
|
+
|
12
|
+
class MarkdownParser(BaseLife):
|
13
|
+
"""
|
14
|
+
Parser for Markdown files that follows the same pattern as PdfParser.
|
15
|
+
Handles .md and .markdown file extensions.
|
16
|
+
"""
|
17
|
+
|
18
|
+
def __init__(self,
|
19
|
+
file_path: Union[str, list],
|
20
|
+
):
|
21
|
+
super().__init__()
|
22
|
+
self.file_path = file_path
|
23
|
+
|
24
|
+
@staticmethod
|
25
|
+
def read_markdown_file(file_path: str) -> str:
|
26
|
+
"""
|
27
|
+
Reads the content of a markdown file.
|
28
|
+
|
29
|
+
Args:
|
30
|
+
file_path: Path to the markdown file
|
31
|
+
|
32
|
+
Returns:
|
33
|
+
str: Content of the markdown file
|
34
|
+
"""
|
35
|
+
try:
|
36
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
37
|
+
return f.read()
|
38
|
+
except Exception as e:
|
39
|
+
logger.error(f"Error reading markdown file {file_path}: {e}")
|
40
|
+
raise
|
41
|
+
|
42
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
43
|
+
"""
|
44
|
+
Parses a markdown file and returns a MarkdownOutputVo.
|
45
|
+
|
46
|
+
Args:
|
47
|
+
file_path: Path to the markdown file
|
48
|
+
|
49
|
+
Returns:
|
50
|
+
MarkdownOutputVo: Structured output containing the markdown content
|
51
|
+
"""
|
52
|
+
try:
|
53
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
54
|
+
|
55
|
+
# Read markdown content
|
56
|
+
md_content = self.read_markdown_file(file_path)
|
57
|
+
|
58
|
+
# Generate lifecycle metadata
|
59
|
+
lifecycle = self.generate_lifecycle(
|
60
|
+
source_file=file_path,
|
61
|
+
domain="Technology",
|
62
|
+
usage_purpose="Documentation",
|
63
|
+
life_type="LLM_ORIGIN"
|
64
|
+
)
|
65
|
+
|
66
|
+
# Create and return output VO
|
67
|
+
output_vo = MarkdownOutputVo(title, md_content)
|
68
|
+
output_vo.add_lifecycle(lifecycle)
|
69
|
+
return output_vo.to_dict()
|
70
|
+
|
71
|
+
except Exception as e:
|
72
|
+
logger.error(f"Failed to parse markdown file {file_path}: {e}")
|
73
73
|
raise
|
datamax/parser/pdf_parser.py
CHANGED
@@ -1,101 +1,101 @@
|
|
1
|
-
import os
|
2
|
-
import pathlib
|
3
|
-
import sys
|
4
|
-
import subprocess
|
5
|
-
from typing import Union
|
6
|
-
|
7
|
-
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
8
|
-
sys.path.insert(0, str(ROOT_DIR))
|
9
|
-
from datamax.parser.base import BaseLife
|
10
|
-
from datamax.parser.base import MarkdownOutputVo
|
11
|
-
from langchain_community.document_loaders import PyMuPDFLoader
|
12
|
-
from loguru import logger
|
13
|
-
from datamax.utils.mineru_operator import pdf_processor
|
14
|
-
|
15
|
-
|
16
|
-
class PdfParser(BaseLife):
|
17
|
-
|
18
|
-
def __init__(self,
|
19
|
-
file_path: Union[str, list],
|
20
|
-
use_mineru: bool = False,
|
21
|
-
):
|
22
|
-
super().__init__()
|
23
|
-
|
24
|
-
self.file_path = file_path
|
25
|
-
self.use_mineru = use_mineru
|
26
|
-
|
27
|
-
def mineru_process(self, input_pdf_filename, output_dir):
|
28
|
-
proc = None
|
29
|
-
try:
|
30
|
-
logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
|
31
|
-
command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
|
32
|
-
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
33
|
-
|
34
|
-
# 等待命令执行完成
|
35
|
-
stdout, stderr = proc.communicate()
|
36
|
-
# 检查命令是否成功执行
|
37
|
-
if proc.returncode != 0:
|
38
|
-
raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
|
39
|
-
|
40
|
-
logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
|
41
|
-
|
42
|
-
except Exception as e:
|
43
|
-
logger.error(f"Error: {e}")
|
44
|
-
if proc is not None:
|
45
|
-
proc.kill()
|
46
|
-
proc.wait()
|
47
|
-
logger.info("The process was terminated due to an error.")
|
48
|
-
raise # Re-raise the exception to let the caller handle it
|
49
|
-
|
50
|
-
finally:
|
51
|
-
# 确保子进程已经结束
|
52
|
-
if proc is not None:
|
53
|
-
if proc.poll() is None:
|
54
|
-
proc.kill()
|
55
|
-
proc.wait()
|
56
|
-
logger.info("The process was terminated due to timeout or completion.")
|
57
|
-
|
58
|
-
@staticmethod
|
59
|
-
def read_pdf_file(file_path) -> str:
|
60
|
-
try:
|
61
|
-
pdf_loader = PyMuPDFLoader(file_path)
|
62
|
-
pdf_documents = pdf_loader.load()
|
63
|
-
result_text = ''
|
64
|
-
for page in pdf_documents:
|
65
|
-
result_text += page.page_content
|
66
|
-
return result_text
|
67
|
-
except Exception as e:
|
68
|
-
raise e
|
69
|
-
|
70
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
71
|
-
try:
|
72
|
-
title =
|
73
|
-
|
74
|
-
if self.use_mineru:
|
75
|
-
output_dir = 'uploaded_files'
|
76
|
-
output_folder_name = os.path.basename(file_path).replace(".pdf", "")
|
77
|
-
# output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
|
78
|
-
# if os.path.exists(output_mineru):
|
79
|
-
# pass
|
80
|
-
# else:
|
81
|
-
# self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
|
82
|
-
# mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
83
|
-
|
84
|
-
# todo: 是否有必要跟api的默认保存路径保持一致
|
85
|
-
output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
|
86
|
-
|
87
|
-
if os.path.exists(output_mineru):
|
88
|
-
mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
89
|
-
else:
|
90
|
-
mk_content = pdf_processor.process_pdf(file_path)
|
91
|
-
else:
|
92
|
-
content = self.read_pdf_file(file_path=file_path)
|
93
|
-
mk_content = content
|
94
|
-
|
95
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
96
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
97
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
98
|
-
output_vo.add_lifecycle(lifecycle)
|
99
|
-
return output_vo.to_dict()
|
100
|
-
except Exception:
|
101
|
-
raise
|
1
|
+
import os
|
2
|
+
import pathlib
|
3
|
+
import sys
|
4
|
+
import subprocess
|
5
|
+
from typing import Union
|
6
|
+
|
7
|
+
ROOT_DIR: pathlib.Path = pathlib.Path(__file__).parent.parent.parent.resolve()
|
8
|
+
sys.path.insert(0, str(ROOT_DIR))
|
9
|
+
from datamax.parser.base import BaseLife
|
10
|
+
from datamax.parser.base import MarkdownOutputVo
|
11
|
+
from langchain_community.document_loaders import PyMuPDFLoader
|
12
|
+
from loguru import logger
|
13
|
+
from datamax.utils.mineru_operator import pdf_processor
|
14
|
+
import os
|
15
|
+
|
16
|
+
class PdfParser(BaseLife):
|
17
|
+
|
18
|
+
def __init__(self,
|
19
|
+
file_path: Union[str, list],
|
20
|
+
use_mineru: bool = False,
|
21
|
+
):
|
22
|
+
super().__init__()
|
23
|
+
|
24
|
+
self.file_path = file_path
|
25
|
+
self.use_mineru = use_mineru
|
26
|
+
|
27
|
+
def mineru_process(self, input_pdf_filename, output_dir):
|
28
|
+
proc = None
|
29
|
+
try:
|
30
|
+
logger.info(f"mineru is working...\n input_pdf_filename: {input_pdf_filename} | output_dir: ./{output_dir}. plz waiting!")
|
31
|
+
command = ['magic-pdf', '-p', input_pdf_filename, '-o', output_dir]
|
32
|
+
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
33
|
+
|
34
|
+
# 等待命令执行完成
|
35
|
+
stdout, stderr = proc.communicate()
|
36
|
+
# 检查命令是否成功执行
|
37
|
+
if proc.returncode != 0:
|
38
|
+
raise Exception(f"mineru failed with return code {proc.returncode}: {stderr.decode()}")
|
39
|
+
|
40
|
+
logger.info(f"Markdown saved in {output_dir}, input file is {input_pdf_filename}")
|
41
|
+
|
42
|
+
except Exception as e:
|
43
|
+
logger.error(f"Error: {e}")
|
44
|
+
if proc is not None:
|
45
|
+
proc.kill()
|
46
|
+
proc.wait()
|
47
|
+
logger.info("The process was terminated due to an error.")
|
48
|
+
raise # Re-raise the exception to let the caller handle it
|
49
|
+
|
50
|
+
finally:
|
51
|
+
# 确保子进程已经结束
|
52
|
+
if proc is not None:
|
53
|
+
if proc.poll() is None:
|
54
|
+
proc.kill()
|
55
|
+
proc.wait()
|
56
|
+
logger.info("The process was terminated due to timeout or completion.")
|
57
|
+
|
58
|
+
@staticmethod
|
59
|
+
def read_pdf_file(file_path) -> str:
|
60
|
+
try:
|
61
|
+
pdf_loader = PyMuPDFLoader(file_path)
|
62
|
+
pdf_documents = pdf_loader.load()
|
63
|
+
result_text = ''
|
64
|
+
for page in pdf_documents:
|
65
|
+
result_text += page.page_content
|
66
|
+
return result_text
|
67
|
+
except Exception as e:
|
68
|
+
raise e
|
69
|
+
|
70
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
71
|
+
try:
|
72
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
73
|
+
|
74
|
+
if self.use_mineru:
|
75
|
+
output_dir = 'uploaded_files'
|
76
|
+
output_folder_name = os.path.basename(file_path).replace(".pdf", "")
|
77
|
+
# output_mineru = f'{output_dir}/{output_folder_name}/auto/{output_folder_name}.md'
|
78
|
+
# if os.path.exists(output_mineru):
|
79
|
+
# pass
|
80
|
+
# else:
|
81
|
+
# self.mineru_process(input_pdf_filename=file_path, output_dir=output_dir)
|
82
|
+
# mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
83
|
+
|
84
|
+
# todo: 是否有必要跟api的默认保存路径保持一致
|
85
|
+
output_mineru = f'{output_dir}/markdown/{output_folder_name}.md'
|
86
|
+
|
87
|
+
if os.path.exists(output_mineru):
|
88
|
+
mk_content = open(output_mineru, 'r', encoding='utf-8').read()
|
89
|
+
else:
|
90
|
+
mk_content = pdf_processor.process_pdf(file_path)
|
91
|
+
else:
|
92
|
+
content = self.read_pdf_file(file_path=file_path)
|
93
|
+
mk_content = content
|
94
|
+
|
95
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
96
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
97
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
98
|
+
output_vo.add_lifecycle(lifecycle)
|
99
|
+
return output_vo.to_dict()
|
100
|
+
except Exception:
|
101
|
+
raise
|