pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamax/__init__.py +1 -1
- datamax/loader/core.py +118 -118
- datamax/loader/minio_handler.py +171 -171
- datamax/loader/oss_handler.py +191 -191
- datamax/parser/__init__.py +2 -4
- datamax/parser/base.py +76 -76
- datamax/parser/core.py +406 -288
- datamax/parser/csv_parser.py +31 -10
- datamax/parser/doc_parser.py +466 -10
- datamax/parser/docx_parser.py +449 -11
- datamax/parser/epub_parser.py +41 -41
- datamax/parser/html_parser.py +37 -37
- datamax/parser/image_parser.py +34 -34
- datamax/parser/json_parser.py +32 -10
- datamax/parser/md_parser.py +72 -72
- datamax/parser/pdf_parser.py +101 -101
- datamax/parser/ppt_parser.py +70 -20
- datamax/parser/pptx_parser.py +45 -45
- datamax/parser/txt_parser.py +45 -45
- datamax/parser/xls_parser.py +26 -26
- datamax/parser/xlsx_parser.py +212 -215
- datamax/utils/__init__.py +23 -2
- datamax/utils/constants.py +58 -58
- datamax/utils/data_cleaner.py +275 -237
- datamax/utils/env_setup.py +79 -79
- datamax/utils/gotocr_pdf.py +265 -265
- datamax/utils/mineru_operator.py +62 -62
- datamax/utils/paddleocr_pdf_operator.py +90 -90
- datamax/utils/ppt_extract.py +140 -140
- datamax/utils/qa_generator.py +369 -376
- datamax/utils/tokenizer.py +21 -21
- datamax/utils/uno_handler.py +426 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
- pydatamax-0.1.15.dist-info/RECORD +38 -0
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
- pydatamax-0.1.14.dist-info/RECORD +0 -39
- tests/__init__.py +0 -0
- tests/test_basic.py +0 -20
- {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
datamax/parser/pptx_parser.py
CHANGED
@@ -1,45 +1,45 @@
|
|
1
|
-
import os
|
2
|
-
from typing import Union
|
3
|
-
from pptx import Presentation
|
4
|
-
from datamax.parser.base import BaseLife
|
5
|
-
from datamax.parser.base import MarkdownOutputVo
|
6
|
-
|
7
|
-
|
8
|
-
class PPtxParser(BaseLife):
|
9
|
-
def __init__(self, file_path: Union[str, list]):
|
10
|
-
super().__init__()
|
11
|
-
self.file_path = file_path
|
12
|
-
|
13
|
-
@staticmethod
|
14
|
-
def read_ppt_file(file_path: str):
|
15
|
-
try:
|
16
|
-
content = ''
|
17
|
-
prs = Presentation(file_path)
|
18
|
-
for slide in prs.slides:
|
19
|
-
for shape in slide.shapes:
|
20
|
-
if shape.has_text_frame:
|
21
|
-
content += shape.text + '\n'
|
22
|
-
# if shape.shape_type == 13:
|
23
|
-
# if not os.path.exists("extracted_images"):
|
24
|
-
# os.makedirs("extracted_images")
|
25
|
-
# image = shape.image
|
26
|
-
# image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
|
27
|
-
# with open(image_filename, 'wb') as img_file:
|
28
|
-
# img_file.write(image.blob)
|
29
|
-
# content += ('[' + image_filename + ']')
|
30
|
-
return content
|
31
|
-
except Exception:
|
32
|
-
raise
|
33
|
-
|
34
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
35
|
-
try:
|
36
|
-
title =
|
37
|
-
content = self.read_ppt_file(file_path=file_path)
|
38
|
-
mk_content = content
|
39
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
40
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
41
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
42
|
-
output_vo.add_lifecycle(lifecycle)
|
43
|
-
return output_vo.to_dict()
|
44
|
-
except Exception:
|
45
|
-
raise
|
1
|
+
import os
|
2
|
+
from typing import Union
|
3
|
+
from pptx import Presentation
|
4
|
+
from datamax.parser.base import BaseLife
|
5
|
+
from datamax.parser.base import MarkdownOutputVo
|
6
|
+
|
7
|
+
|
8
|
+
class PPtxParser(BaseLife):
|
9
|
+
def __init__(self, file_path: Union[str, list]):
|
10
|
+
super().__init__()
|
11
|
+
self.file_path = file_path
|
12
|
+
|
13
|
+
@staticmethod
|
14
|
+
def read_ppt_file(file_path: str):
|
15
|
+
try:
|
16
|
+
content = ''
|
17
|
+
prs = Presentation(file_path)
|
18
|
+
for slide in prs.slides:
|
19
|
+
for shape in slide.shapes:
|
20
|
+
if shape.has_text_frame:
|
21
|
+
content += shape.text + '\n'
|
22
|
+
# if shape.shape_type == 13:
|
23
|
+
# if not os.path.exists("extracted_images"):
|
24
|
+
# os.makedirs("extracted_images")
|
25
|
+
# image = shape.image
|
26
|
+
# image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
|
27
|
+
# with open(image_filename, 'wb') as img_file:
|
28
|
+
# img_file.write(image.blob)
|
29
|
+
# content += ('[' + image_filename + ']')
|
30
|
+
return content
|
31
|
+
except Exception:
|
32
|
+
raise
|
33
|
+
|
34
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
35
|
+
try:
|
36
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
37
|
+
content = self.read_ppt_file(file_path=file_path)
|
38
|
+
mk_content = content
|
39
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
40
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
41
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
42
|
+
output_vo.add_lifecycle(lifecycle)
|
43
|
+
return output_vo.to_dict()
|
44
|
+
except Exception:
|
45
|
+
raise
|
datamax/parser/txt_parser.py
CHANGED
@@ -1,46 +1,46 @@
|
|
1
|
-
import chardet
|
2
|
-
from typing import Union
|
3
|
-
from datamax.parser.base import BaseLife
|
4
|
-
from datamax.parser.base import MarkdownOutputVo
|
5
|
-
|
6
|
-
|
7
|
-
class TxtParser(BaseLife):
|
8
|
-
def __init__(self, file_path: Union[str, list]):
|
9
|
-
super().__init__()
|
10
|
-
self.file_path = file_path
|
11
|
-
|
12
|
-
@staticmethod
|
13
|
-
def detect_encoding(file_path: str):
|
14
|
-
try:
|
15
|
-
with open(file_path, 'rb') as f:
|
16
|
-
result = chardet.detect(f.read())
|
17
|
-
return result['encoding']
|
18
|
-
except Exception as e:
|
19
|
-
raise e
|
20
|
-
|
21
|
-
@staticmethod
|
22
|
-
def read_txt_file(file_path: str) -> str:
|
23
|
-
"""
|
24
|
-
Reads the Txt file in the specified path and returns its contents.
|
25
|
-
:param file_path: indicates the path of the Txt file to be read.
|
26
|
-
:return: str: Txt file contents.
|
27
|
-
"""
|
28
|
-
try:
|
29
|
-
encoding = TxtParser.detect_encoding(file_path)
|
30
|
-
with open(file_path, 'r', encoding=encoding) as file:
|
31
|
-
return file.read()
|
32
|
-
except Exception as e:
|
33
|
-
raise e
|
34
|
-
|
35
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
36
|
-
try:
|
37
|
-
title =
|
38
|
-
content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
|
39
|
-
mk_content = content
|
40
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
41
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
42
|
-
output_vo = MarkdownOutputVo(title, mk_content)
|
43
|
-
output_vo.add_lifecycle(lifecycle)
|
44
|
-
return output_vo.to_dict()
|
45
|
-
except Exception as e:
|
1
|
+
import chardet
|
2
|
+
from typing import Union
|
3
|
+
from datamax.parser.base import BaseLife
|
4
|
+
from datamax.parser.base import MarkdownOutputVo
|
5
|
+
import os
|
6
|
+
|
7
|
+
class TxtParser(BaseLife):
|
8
|
+
def __init__(self, file_path: Union[str, list]):
|
9
|
+
super().__init__()
|
10
|
+
self.file_path = file_path
|
11
|
+
|
12
|
+
@staticmethod
|
13
|
+
def detect_encoding(file_path: str):
|
14
|
+
try:
|
15
|
+
with open(file_path, 'rb') as f:
|
16
|
+
result = chardet.detect(f.read())
|
17
|
+
return result['encoding']
|
18
|
+
except Exception as e:
|
19
|
+
raise e
|
20
|
+
|
21
|
+
@staticmethod
|
22
|
+
def read_txt_file(file_path: str) -> str:
|
23
|
+
"""
|
24
|
+
Reads the Txt file in the specified path and returns its contents.
|
25
|
+
:param file_path: indicates the path of the Txt file to be read.
|
26
|
+
:return: str: Txt file contents.
|
27
|
+
"""
|
28
|
+
try:
|
29
|
+
encoding = TxtParser.detect_encoding(file_path)
|
30
|
+
with open(file_path, 'r', encoding=encoding) as file:
|
31
|
+
return file.read()
|
32
|
+
except Exception as e:
|
33
|
+
raise e
|
34
|
+
|
35
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
36
|
+
try:
|
37
|
+
title = os.path.splitext(os.path.basename(file_path))[0]
|
38
|
+
content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
|
39
|
+
mk_content = content
|
40
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
41
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
42
|
+
output_vo = MarkdownOutputVo(title, mk_content)
|
43
|
+
output_vo.add_lifecycle(lifecycle)
|
44
|
+
return output_vo.to_dict()
|
45
|
+
except Exception as e:
|
46
46
|
raise e
|
datamax/parser/xls_parser.py
CHANGED
@@ -1,26 +1,26 @@
|
|
1
|
-
from datamax.parser.base import MarkdownOutputVo
|
2
|
-
from datamax.parser.base import BaseLife
|
3
|
-
import pandas as pd
|
4
|
-
import warnings
|
5
|
-
|
6
|
-
warnings.filterwarnings("ignore")
|
7
|
-
|
8
|
-
|
9
|
-
class XlsParser(BaseLife):
|
10
|
-
"""xlsx or xls table use markitdown from Microsoft so magic for table!"""
|
11
|
-
|
12
|
-
def __init__(self, file_path):
|
13
|
-
super().__init__()
|
14
|
-
self.file_path = file_path
|
15
|
-
|
16
|
-
def parse(self, file_path: str) -> MarkdownOutputVo:
|
17
|
-
try:
|
18
|
-
df = pd.read_excel(file_path)
|
19
|
-
mk_content = df.to_markdown(index=False)
|
20
|
-
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
21
|
-
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
22
|
-
output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
|
23
|
-
output_vo.add_lifecycle(lifecycle)
|
24
|
-
return output_vo.to_dict()
|
25
|
-
except Exception as e:
|
26
|
-
raise e
|
1
|
+
from datamax.parser.base import MarkdownOutputVo
|
2
|
+
from datamax.parser.base import BaseLife
|
3
|
+
import pandas as pd
|
4
|
+
import warnings
|
5
|
+
|
6
|
+
warnings.filterwarnings("ignore")
|
7
|
+
|
8
|
+
|
9
|
+
class XlsParser(BaseLife):
|
10
|
+
"""xlsx or xls table use markitdown from Microsoft so magic for table!"""
|
11
|
+
|
12
|
+
def __init__(self, file_path):
|
13
|
+
super().__init__()
|
14
|
+
self.file_path = file_path
|
15
|
+
|
16
|
+
def parse(self, file_path: str) -> MarkdownOutputVo:
|
17
|
+
try:
|
18
|
+
df = pd.read_excel(file_path)
|
19
|
+
mk_content = df.to_markdown(index=False)
|
20
|
+
lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
|
21
|
+
usage_purpose="Documentation", life_type="LLM_ORIGIN")
|
22
|
+
output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
|
23
|
+
output_vo.add_lifecycle(lifecycle)
|
24
|
+
return output_vo.to_dict()
|
25
|
+
except Exception as e:
|
26
|
+
raise e
|