pydatamax 0.1.14__py3-none-any.whl → 0.1.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. datamax/__init__.py +1 -1
  2. datamax/loader/core.py +118 -118
  3. datamax/loader/minio_handler.py +171 -171
  4. datamax/loader/oss_handler.py +191 -191
  5. datamax/parser/__init__.py +2 -4
  6. datamax/parser/base.py +76 -76
  7. datamax/parser/core.py +406 -288
  8. datamax/parser/csv_parser.py +31 -10
  9. datamax/parser/doc_parser.py +466 -10
  10. datamax/parser/docx_parser.py +449 -11
  11. datamax/parser/epub_parser.py +41 -41
  12. datamax/parser/html_parser.py +37 -37
  13. datamax/parser/image_parser.py +34 -34
  14. datamax/parser/json_parser.py +32 -10
  15. datamax/parser/md_parser.py +72 -72
  16. datamax/parser/pdf_parser.py +101 -101
  17. datamax/parser/ppt_parser.py +70 -20
  18. datamax/parser/pptx_parser.py +45 -45
  19. datamax/parser/txt_parser.py +45 -45
  20. datamax/parser/xls_parser.py +26 -26
  21. datamax/parser/xlsx_parser.py +212 -215
  22. datamax/utils/__init__.py +23 -2
  23. datamax/utils/constants.py +58 -58
  24. datamax/utils/data_cleaner.py +275 -237
  25. datamax/utils/env_setup.py +79 -79
  26. datamax/utils/gotocr_pdf.py +265 -265
  27. datamax/utils/mineru_operator.py +62 -62
  28. datamax/utils/paddleocr_pdf_operator.py +90 -90
  29. datamax/utils/ppt_extract.py +140 -140
  30. datamax/utils/qa_generator.py +369 -376
  31. datamax/utils/tokenizer.py +21 -21
  32. datamax/utils/uno_handler.py +426 -0
  33. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/METADATA +117 -5
  34. pydatamax-0.1.15.dist-info/RECORD +38 -0
  35. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/licenses/LICENSE +21 -21
  36. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/top_level.txt +0 -1
  37. pydatamax-0.1.14.dist-info/RECORD +0 -39
  38. tests/__init__.py +0 -0
  39. tests/test_basic.py +0 -20
  40. {pydatamax-0.1.14.dist-info → pydatamax-0.1.15.dist-info}/WHEEL +0 -0
@@ -1,45 +1,45 @@
1
- import os
2
- from typing import Union
3
- from pptx import Presentation
4
- from datamax.parser.base import BaseLife
5
- from datamax.parser.base import MarkdownOutputVo
6
-
7
-
8
- class PPtxParser(BaseLife):
9
- def __init__(self, file_path: Union[str, list]):
10
- super().__init__()
11
- self.file_path = file_path
12
-
13
- @staticmethod
14
- def read_ppt_file(file_path: str):
15
- try:
16
- content = ''
17
- prs = Presentation(file_path)
18
- for slide in prs.slides:
19
- for shape in slide.shapes:
20
- if shape.has_text_frame:
21
- content += shape.text + '\n'
22
- # if shape.shape_type == 13:
23
- # if not os.path.exists("extracted_images"):
24
- # os.makedirs("extracted_images")
25
- # image = shape.image
26
- # image_filename = f'extracted_images/image_{shape.shape_id}.{image.ext}'
27
- # with open(image_filename, 'wb') as img_file:
28
- # img_file.write(image.blob)
29
- # content += ('[' + image_filename + ']')
30
- return content
31
- except Exception:
32
- raise
33
-
34
- def parse(self, file_path: str) -> MarkdownOutputVo:
35
- try:
36
- title = self.get_file_extension(file_path)
37
- content = self.read_ppt_file(file_path=file_path)
38
- mk_content = content
39
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
40
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
41
- output_vo = MarkdownOutputVo(title, mk_content)
42
- output_vo.add_lifecycle(lifecycle)
43
- return output_vo.to_dict()
44
- except Exception:
45
- raise
1
+ import os
2
+ from typing import Union
3
+ from pptx import Presentation
4
+ from datamax.parser.base import BaseLife
5
+ from datamax.parser.base import MarkdownOutputVo
6
+
7
+
8
class PPtxParser(BaseLife):
    """Extract the text of a .pptx presentation and package it as Markdown output."""

    def __init__(self, file_path: Union[str, list]):
        """Remember the path (or list of paths) this parser was created for.

        :param file_path: a single path or a list of paths; it is only stored
            here, `parse` receives the file to process explicitly.
        """
        super().__init__()
        self.file_path = file_path

    @staticmethod
    def read_ppt_file(file_path: str):
        """Return the text of every text-bearing shape in the presentation.

        Slides and their shapes are visited in the order python-pptx yields
        them. Each shape that has a text frame contributes its text followed
        by a newline; shapes without a text frame are skipped.

        :param file_path: path of the .pptx file to open.
        :return: str: the concatenated shape texts ('' when nothing matched).
        :raises Exception: whatever `Presentation` or shape access raises is
            propagated unchanged.
        """
        prs = Presentation(file_path)
        # NOTE: only text frames are read; pictures are not extracted.
        # Joining once avoids rebuilding the string for every shape.
        return ''.join(
            shape.text + '\n'
            for slide in prs.slides
            for shape in slide.shapes
            if shape.has_text_frame
        )

    def parse(self, file_path: str) -> MarkdownOutputVo:
        """Build the Markdown output record for one .pptx file.

        The title is the file's base name without its extension and the body
        is the text returned by `read_ppt_file`. A lifecycle entry generated
        for the source file is attached before the result is converted.

        :param file_path: path of the .pptx file to parse.
        :return: the value of `output_vo.to_dict()`.
            NOTE(review): the annotation says MarkdownOutputVo, but the method
            returns the `to_dict()` result — confirm the intended type.
        :raises Exception: any error from reading the file or building the
            output is propagated unchanged.
        """
        title = os.path.splitext(os.path.basename(file_path))[0]
        mk_content = self.read_ppt_file(file_path=file_path)
        lifecycle = self.generate_lifecycle(
            source_file=file_path,
            domain="Technology",
            usage_purpose="Documentation",
            life_type="LLM_ORIGIN",
        )
        output_vo = MarkdownOutputVo(title, mk_content)
        output_vo.add_lifecycle(lifecycle)
        return output_vo.to_dict()
@@ -1,46 +1,46 @@
1
- import chardet
2
- from typing import Union
3
- from datamax.parser.base import BaseLife
4
- from datamax.parser.base import MarkdownOutputVo
5
-
6
-
7
- class TxtParser(BaseLife):
8
- def __init__(self, file_path: Union[str, list]):
9
- super().__init__()
10
- self.file_path = file_path
11
-
12
- @staticmethod
13
- def detect_encoding(file_path: str):
14
- try:
15
- with open(file_path, 'rb') as f:
16
- result = chardet.detect(f.read())
17
- return result['encoding']
18
- except Exception as e:
19
- raise e
20
-
21
- @staticmethod
22
- def read_txt_file(file_path: str) -> str:
23
- """
24
- Reads the Txt file in the specified path and returns its contents.
25
- :param file_path: indicates the path of the Txt file to be read.
26
- :return: str: Txt file contents.
27
- """
28
- try:
29
- encoding = TxtParser.detect_encoding(file_path)
30
- with open(file_path, 'r', encoding=encoding) as file:
31
- return file.read()
32
- except Exception as e:
33
- raise e
34
-
35
- def parse(self, file_path: str) -> MarkdownOutputVo:
36
- try:
37
- title = self.get_file_extension(file_path)
38
- content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
39
- mk_content = content
40
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
41
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
42
- output_vo = MarkdownOutputVo(title, mk_content)
43
- output_vo.add_lifecycle(lifecycle)
44
- return output_vo.to_dict()
45
- except Exception as e:
1
+ import chardet
2
+ from typing import Union
3
+ from datamax.parser.base import BaseLife
4
+ from datamax.parser.base import MarkdownOutputVo
5
+ import os
6
+
7
+ class TxtParser(BaseLife):
8
+ def __init__(self, file_path: Union[str, list]):
9
+ super().__init__()
10
+ self.file_path = file_path
11
+
12
+ @staticmethod
13
+ def detect_encoding(file_path: str):
14
+ try:
15
+ with open(file_path, 'rb') as f:
16
+ result = chardet.detect(f.read())
17
+ return result['encoding']
18
+ except Exception as e:
19
+ raise e
20
+
21
+ @staticmethod
22
+ def read_txt_file(file_path: str) -> str:
23
+ """
24
+ Reads the Txt file in the specified path and returns its contents.
25
+ :param file_path: indicates the path of the Txt file to be read.
26
+ :return: str: Txt file contents.
27
+ """
28
+ try:
29
+ encoding = TxtParser.detect_encoding(file_path)
30
+ with open(file_path, 'r', encoding=encoding) as file:
31
+ return file.read()
32
+ except Exception as e:
33
+ raise e
34
+
35
+ def parse(self, file_path: str) -> MarkdownOutputVo:
36
+ try:
37
+ title = os.path.splitext(os.path.basename(file_path))[0]
38
+ content = self.read_txt_file(file_path=file_path) # 真实数据是从load加载
39
+ mk_content = content
40
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
41
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
42
+ output_vo = MarkdownOutputVo(title, mk_content)
43
+ output_vo.add_lifecycle(lifecycle)
44
+ return output_vo.to_dict()
45
+ except Exception as e:
46
46
  raise e
@@ -1,26 +1,26 @@
1
- from datamax.parser.base import MarkdownOutputVo
2
- from datamax.parser.base import BaseLife
3
- import pandas as pd
4
- import warnings
5
-
6
- warnings.filterwarnings("ignore")
7
-
8
-
9
- class XlsParser(BaseLife):
10
- """xlsx or xls table use markitdown from Microsoft so magic for table!"""
11
-
12
- def __init__(self, file_path):
13
- super().__init__()
14
- self.file_path = file_path
15
-
16
- def parse(self, file_path: str) -> MarkdownOutputVo:
17
- try:
18
- df = pd.read_excel(file_path)
19
- mk_content = df.to_markdown(index=False)
20
- lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
21
- usage_purpose="Documentation", life_type="LLM_ORIGIN")
22
- output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
23
- output_vo.add_lifecycle(lifecycle)
24
- return output_vo.to_dict()
25
- except Exception as e:
26
- raise e
1
+ from datamax.parser.base import MarkdownOutputVo
2
+ from datamax.parser.base import BaseLife
3
+ import pandas as pd
4
+ import warnings
5
+
6
+ warnings.filterwarnings("ignore")
7
+
8
+
9
+ class XlsParser(BaseLife):
10
+ """xlsx or xls table use markitdown from Microsoft so magic for table!"""
11
+
12
+ def __init__(self, file_path):
13
+ super().__init__()
14
+ self.file_path = file_path
15
+
16
+ def parse(self, file_path: str) -> MarkdownOutputVo:
17
+ try:
18
+ df = pd.read_excel(file_path)
19
+ mk_content = df.to_markdown(index=False)
20
+ lifecycle = self.generate_lifecycle(source_file=file_path, domain="Technology",
21
+ usage_purpose="Documentation", life_type="LLM_ORIGIN")
22
+ output_vo = MarkdownOutputVo(self.get_file_extension(file_path), mk_content)
23
+ output_vo.add_lifecycle(lifecycle)
24
+ return output_vo.to_dict()
25
+ except Exception as e:
26
+ raise e